# 使用re库做正则匹配

In [1]:
import re

In [20]:
# re.match only tries the pattern at the very beginning of the string.
# A successful attempt yields a Match object; a failed one yields None.
# The r prefix makes the pattern a raw string, so backslashes are not escapes.
for pattern, text in [
    (r'hello', 'hellopy'),    # pattern is a prefix of the text
    (r'hello', 'hi hello'),   # pattern occurs, but not at position 0
    (r'[^0-9]+', 'abc012'),   # leading run of non-digit characters
    (r'[^0-9]+', '012'),      # text starts with a digit
]:
    matched = re.match(pattern, text)
    print(matched)

<re.Match object; span=(0, 5), match='hello'>
None
<re.Match object; span=(0, 3), match='abc'>
None


In [27]:
# Match.group(n) returns the text captured by the n-th parenthesised sub-group.
# Sub-groups are numbered from 1, counting opening parentheses left to right.
matched = re.match(r'\$([0-9]+)', "$900")
print(matched[1])  # indexing a Match is shorthand for .group(1)

matched = re.match(r'name: ([a-z]+), age: ([0-9]+)', "name: john, age: 25")
# The pattern has exactly two sub-groups, so unpacking groups() prints both.
print(*matched.groups())


900
john 25


In [34]:
# re.search looks for the pattern at any position in the string.
# A successful search yields a Match object; otherwise the result is None.
for pattern, text in (
    (r"[0-9]+\-[0-9]+\-[0-9]+", "2020-01-01"),  # date-like digits-digits-digits
    (r"[a-z]+[a-z0-9]*", "a123"),               # letters followed by letters/digits
    (r"python", "hello python"),                # literal word away from position 0
):
    matched = re.search(pattern, text)
    print(matched)

<re.Match object; span=(0, 10), match='2020-01-01'>
<re.Match object; span=(0, 4), match='a123'>
<re.Match object; span=(6, 12), match='python'>


In [38]:
# Greedy matching: `.*` grabs as much text as it can, so the captured group
# extends up to the last closing </div> in the string.
html = "<div>a</div><div>b</div>"
greedy_div = re.compile(r"<div>(.*)</div>")
matched = greedy_div.search(html)
matched[1]

'a</div><div>b'

In [39]:
# Non-greedy matching: `.*?` grabs as little text as it can, so the captured
# group stops at the first closing </div>.
html = "<div>a</div><div>b</div>"
lazy_div = re.compile(r"<div>(.*?)</div>")
matched = lazy_div.search(html)
matched[1]

'a'

In [28]:
# match vs. search:
# - match only tests the start of the string; if the pattern does not fit
#   there, it fails and returns None.
# - search scans through the string until it finds a match.
line = "Python is the best programming language"

for finder in (re.match, re.search):
    matched = finder(r'best', line)
    print(matched)

None
<re.Match object; span=(14, 18), match='best'>


# 实战项目 - 见PPT

In [45]:
import requests
from lxml import etree

# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
}

url = "https://sh.lianjia.com/ershoufang/"


def _first_text(nodes):
    """Return the .text of the first node in `nodes`, or None if there is none."""
    return nodes[0].text if nodes else None


def _parse_unit_price(text):
    """Extract the number between '单价' and '元/平米' as a float.

    Thousands separators are tolerated. Returns 0 when `text` is missing or
    does not contain a recognisable unit price.
    """
    if not text:
        return 0
    found = re.search(r'单价\s*([0-9][0-9,]*(?:\.[0-9]+)?)\s*元/平米', text)
    if not found:
        return 0
    # Convert the captured string form of the unit price to a float.
    return float(found.group(1).replace(",", ""))


# Request the listing page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status stops us parsing an error page.
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()
content = r.content.decode("utf-8")

# Parse the HTML and select the listing <li> elements.
root = etree.HTML(content)
li_nodes = root.xpath('//ul[@class="sellListContent"]/li[@data-lj_view_evtid]')
for li_node in li_nodes:
    # Listing description, community name, total price and unit-price text.
    title = _first_text(li_node.xpath('.//div[@class="title"]/a'))
    xiaoqu = _first_text(li_node.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/a'))
    price_text = _first_text(li_node.xpath('.//div[@class="priceInfo"]/div[@class="totalPrice"]/span'))
    up_text = _first_text(li_node.xpath('.//div[@class="priceInfo"]/div[@class="unitPrice"]/span'))

    # Skip entries that lack the expected structure instead of letting one
    # malformed <li> abort the whole loop.
    if title is None or xiaoqu is None or price_text is None:
        continue
    try:
        price = float(price_text)
    except ValueError:
        continue

    up_price = _parse_unit_price(up_text)
    print("name: {}, xiaoqu: {}, total: {}, unit price: {}".format(title, xiaoqu, price, up_price))

name: 佳宝品质三房，中间楼层3楼，无增值税，近9号地铁, xiaoqu: 佳宝新村 , total: 399.0, unit price: 49260.0
name: （7号线）（中区）（满五年）（南北通）（采光佳）, xiaoqu: 经纬观澜弘郡家园(三期) , total: 453.0, unit price: 53699.0
name: 近地铁2016年次新房，开发商精装小两房，中楼采光好, xiaoqu: 远东君悦庭 , total: 403.8, unit price: 57059.0
name: 绿梅一村 2室1厅 225万, xiaoqu: 绿梅一村 , total: 225.0, unit price: 55832.0
name: 四期景观大一房，业主置换跟我买好房子，有钥匙随时看, xiaoqu: 中远两湾城 , total: 535.0, unit price: 76913.0
name: 正气三房 业主置换 诚意出售  满五唯一 产权清晰, xiaoqu: 东方知音苑 , total: 992.0, unit price: 81984.0
name: 财富海景前排中区大宅，景色无遮挡，境外业主诚意出售, xiaoqu: 财富海景花园 , total: 4500.0, unit price: 132361.0
name: 新出高区，全南三房，阳光好，产权清晰，可随时签约, xiaoqu: 菊园 , total: 1150.0, unit price: 84777.0
name: 美丽景观大三房 三房朝南带阳台 佰平阔气客厅！, xiaoqu: 翠湖天地隽荟 , total: 4858.0, unit price: 163950.0
name: 自住装修，一线江景。满五唯一，看房方便, xiaoqu: 家化滨江苑 , total: 1540.0, unit price: 70906.0
name: 两房满五唯一，业主诚意出售，拎包入住，, xiaoqu: 绿地玲珑寓 , total: 120.0, unit price: 16566.0
name: 人车分流，园林小区 满五唯一 税费低 高区采光好, xiaoqu: 新家坡园景苑 , total: 765.0, unit price: 80612.0
name: 南北通透 板式房型 闹中取静 