# 使用re库做正则匹配

In [1]:
import re

In [2]:
# re.match only tries to match at the very beginning of the string.
# It returns a Match object on success and None otherwise.
# The r prefix makes the pattern a raw string, so Python does not
# interpret backslashes as escape sequences.
pattern = r'hello'
text = 'hellopy'
matched = re.match(pattern, text)
print(matched)

<re.Match object; span=(0, 5), match='hello'>


In [3]:
# 'hello' occurs in the text, but not at position 0, so match() yields None.
subject = 'hi hello'
matched = re.match(r'hello', subject)
print(matched)

None


In [4]:
# [^0-9]+ matches one or more non-digit characters, so only the leading
# letters are consumed before the first digit stops the match.
non_digits = r'[^0-9]+'
matched = re.match(non_digits, 'abc012')
print(matched)

<re.Match object; span=(0, 3), match='abc'>


In [5]:
# The string starts with a digit, so the non-digit class cannot match
# at position 0 and match() returns None.
non_digits = r'[^0-9]+'
matched = re.match(non_digits, '012')
print(matched)

None


In [6]:
# Match.group(n) returns the text captured by the n-th parenthesised subgroup.
# Subgroups are numbered starting at 1, counted left to right.
matched = re.match(r'\$([0-9]+)', "$900")
amount = matched.group(1)
print(amount)

900


In [7]:
# Two capture groups: group 1 is the lowercase name, group 2 is the numeric age.
record = "name: john, age: 25"
matched = re.match(r'name: ([a-z]+), age: ([0-9]+)', record)
# group(1, 2) returns both captures as a tuple, which print() receives unpacked.
print(*matched.group(1, 2))

john 25


In [8]:
# re.search scans the string and matches at any position, not just the start.
# It returns a Match object on success and None otherwise.
date_pattern = r"[0-9]+\-[0-9]+\-[0-9]+"
matched = re.search(date_pattern, "2020-01-01")
print(matched)

<re.Match object; span=(0, 10), match='2020-01-01'>


In [9]:
# One or more lowercase letters followed by any number of lowercase
# letters or digits.
candidate = "a123"
matched = re.search(r"[a-z]+[a-z0-9]*", candidate)
print(matched)

<re.Match object; span=(0, 4), match='a123'>


In [10]:
# Unlike match(), search() finds the pattern even when it is not at the start.
sentence = "hello python"
matched = re.search(r"python", sentence)
print(matched)

<re.Match object; span=(6, 12), match='python'>


In [11]:
# Greedy matching: .* consumes as many characters as possible, so the
# capture extends up to the LAST closing </div> in the string.
html = "<div>a</div><div>b</div>"
greedy_pattern = r"<div>(.*)</div>"
matched = re.search(greedy_pattern, html)
matched.group(1)

'a</div><div>b'

In [12]:
# Non-greedy matching: .*? consumes as few characters as possible, so the
# capture stops at the FIRST closing </div>.
html = "<div>a</div><div>b</div>"
lazy_pattern = r"<div>(.*?)</div>"
matched = re.search(lazy_pattern, html)
matched.group(1)

'a'

In [13]:
# Difference between match and search:
# - match only tests the beginning of the string; if the pattern does not
#   match there, it fails and returns None.
# - search scans through the string until it finds a match.
line = "Python is the best programming language"

# Run the same pattern through both functions, match first and search second,
# printing each result.
for finder in (re.match, re.search):
    matched = finder(r'best', line)
    print(matched)

None
<re.Match object; span=(14, 18), match='best'>


# 实战项目 - 见PPT

In [14]:
import requests
from lxml import etree

# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
}

url = "https://sh.lianjia.com/ershoufang/"

# Fetch the HTML of the listing page.
# Without a timeout requests.get can block indefinitely, and without
# raise_for_status an HTTP error page would be parsed as if it were data.
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()
content = r.content.decode("utf-8")

# Parse the HTML and select one <li> per listing.
root = etree.HTML(content)
li_nodes = root.xpath('//ul[@class="sellListContent"]/li[@data-lj_view_evtid]')
for li_node in li_nodes:
    title_nodes = li_node.xpath('.//div[@class="title"]/a')
    xiaoqu_nodes = li_node.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/a')
    price_nodes = li_node.xpath('.//div[@class="priceInfo"]/div[@class="totalPrice"]/span')
    up_nodes = li_node.xpath('.//div[@class="priceInfo"]/div[@class="unitPrice"]/span')

    # Skip entries that lack any of the expected nodes instead of letting an
    # IndexError on [0] abort the whole loop.
    if not (title_nodes and xiaoqu_nodes and price_nodes and up_nodes):
        continue

    # Listing description.
    title = title_nodes[0].text
    # The community name comes with surrounding whitespace; .text may be None.
    xiaoqu = (xiaoqu_nodes[0].text or "").strip()
    price = float(price_nodes[0].text)
    # Fall back to "" so re.search never receives None.
    up_text = up_nodes[0].text or ""

    # Use a regex to pull the numeric part out of the unit-price text and
    # convert it to float; keep the fallback a float too for consistent output.
    matched = re.search(r'单价(.*)元/平米', up_text)
    up_price = float(matched.group(1)) if matched else 0.0
    print("name: {}, xiaoqu: {}, total: {}, unit price: {}".format(title, xiaoqu, price, up_price))

name: 新出房源+不靠马路+满五唯一+近地铁+诚意出售, xiaoqu: 佳龙花园 , total: 970.0, unit price: 80239.0
name: 嘉富小区，经典一室户，一楼带天井，满五年税少, xiaoqu: 嘉富小区 , total: 146.0, unit price: 28132.0
name: 公捷苑小区正气稀que三房，中间楼层，品质高，总价低, xiaoqu: 公捷苑 , total: 316.0, unit price: 31842.0
name: 精装修  业主诚意出售  楼层好  配套齐全 采光好, xiaoqu: 金都花好悦园 , total: 349.0, unit price: 45532.0
name: 精装修+拎包入住+生活配套完善, xiaoqu: 益丰新村 , total: 470.0, unit price: 46397.0
name: 朝南房间采光好，交通便利，业主诚心, xiaoqu: 密山二村 , total: 129.0, unit price: 36287.0
name: 五角场精装带地暖+满五唯一+中楼层+配套齐全交通方便, xiaoqu: 政民路150弄 , total: 273.0, unit price: 61445.0
name: 满五年+小区央座楼层好+简单装修+业主诚售+拎包入住, xiaoqu: 城西小区 , total: 146.0, unit price: 37465.0
name: 新出地铁口双南两房+精装修+第二梯队+位置优越, xiaoqu: 曲阳路491号 , total: 315.0, unit price: 60870.0
name: 经典小户型，开发商统一装修，看房随时, xiaoqu: 维罗纳贵都 , total: 120.0, unit price: 31958.0
name: 新静安一室户，临汾街道，视野宽阔，采光充足, xiaoqu: 闻喜路251弄 , total: 156.0, unit price: 40988.0
name: 店长推荐 高品质小区 开元物业，近体育公园, xiaoqu: 东方花园二期 , total: 719.0, unit price: 71104.0
name: 独栋厨卫全明，四楼，双南户型，面朝花园无遮挡, xiaoqu: 威