# 使用re库做正则匹配

In [None]:
import re

In [None]:
# re.match - tries to match the pattern at the very beginning of the string.
# It returns a match object on success and None otherwise.
# The r prefix makes the pattern a raw string (backslashes are not escapes).
for pattern, text in (
    (r'hello', 'hellopy'),    # pattern found at position 0 -> match object
    (r'hello', 'hi hello'),   # pattern is not at the start -> None
    (r'[^0-9]+', 'abc012'),   # leading non-digits 'abc' -> match object
    (r'[^0-9]+', '012'),      # string starts with a digit -> None
):
    matched = re.match(pattern, text)
    print(matched)

In [None]:
# Match.group - returns the text captured by the requested subgroup(s).
# Subgroups are numbered starting at 1, left to right, increasing by 1.
price_text = "$900"
matched = re.match(r'\$([0-9]+)', price_text)
amount = matched.group(1)
print(amount)

person_text = "name: john, age: 25"
matched = re.match(r'name: ([a-z]+), age: ([0-9]+)', person_text)
# group() with several indices returns a tuple of the captured strings.
print(*matched.group(1, 2))



In [None]:
# re.search - scans through the string and matches at the first position
# where the pattern fits, not only at the start.
# It returns a match object on success and None otherwise.
search_cases = [
    (r"[0-9]+\-[0-9]+\-[0-9]+", "2020-01-01"),
    (r"[a-z]+[a-z0-9]*", "a123"),
    (r"python", "hello python"),
]
for regex, subject in search_cases:
    matched = re.search(regex, subject)
    print(matched)

In [None]:
# Greedy matching: .* consumes as much as possible, so the captured group
# extends up to the LAST </div> in the string.
greedy_div = re.compile(r"<div>(.*)</div>")
html = "<div>a</div><div>b</div>"
matched = greedy_div.search(html)
# Bare expression so the notebook displays the captured text.
matched.group(1)

In [None]:
# Non-greedy matching: .*? consumes as little as possible, so the captured
# group stops at the FIRST </div>.
lazy_div = re.compile(r"<div>(.*?)</div>")
html = "<div>a</div><div>b</div>"
matched = lazy_div.search(html)
# Bare expression so the notebook displays the captured text.
matched.group(1)

In [None]:
# Difference between match and search:
# match only tests the beginning of the string; if the string does not start
# with the pattern, the match fails and None is returned.
# search looks through the whole string until it finds a match.
line = "Python is the best programming language"

# First re.match (prints None here), then re.search (prints a match object).
for finder in (re.match, re.search):
    matched = finder(r'best', line)
    print(matched)

# 实战项目 - 见PPT

In [None]:
import requests
from lxml import etree
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
}

url = "https://sh.lianjia.com/ershoufang/"
# Request the HTML of the listing page. The timeout keeps the cell from
# blocking forever on a stalled connection, and raise_for_status() turns an
# HTTP error response into an exception instead of parsing an error page.
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()
content = r.content.decode("utf-8")

# Parse the HTML and select the listing <li> entries.
root = etree.HTML(content)
li_nodes = root.xpath('//ul[@class="sellListContent"]/li[@data-lj_view_evtid]')

# Compiled once here instead of being looked up on every loop iteration.
unit_price_pattern = re.compile(r'单价(.*)元/平米')

for li_node in li_nodes:
    # Collect the nodes holding the description, community, total price and
    # unit price of this listing.
    title_nodes = li_node.xpath('.//div[@class="title"]/a')
    xiaoqu_nodes = li_node.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/a')
    price_nodes = li_node.xpath('.//div[@class="priceInfo"]/div[@class="totalPrice"]/span')
    up_nodes = li_node.xpath('.//div[@class="priceInfo"]/div[@class="unitPrice"]/span')

    # xpath() returns an empty list when nothing matches; skip such an entry
    # rather than crash the whole loop with IndexError.
    if not (title_nodes and xiaoqu_nodes and price_nodes and up_nodes):
        continue

    title = title_nodes[0].text
    xiaoqu = xiaoqu_nodes[0].text
    price = float(price_nodes[0].text)
    # .text is None for an element without text, and search() requires a str.
    up_text = up_nodes[0].text or ""

    # Use the regex to pull the numeric part out of the unit-price text.
    matched = unit_price_pattern.search(up_text)
    up_price = 0
    if matched:
        # Convert the captured string to a float. float() rejects thousands
        # separators, so drop any commas first (harmless if there are none).
        up_price = float(matched.group(1).replace(",", ""))
    print("name: {}, xiaoqu: {}, total: {}, unit price: {}".format(title, xiaoqu, price, up_price))