# 使用re库做正则匹配

In [1]:
import re

In [2]:
# re.match only tries to match at the very beginning of the string.
# It returns a match object on success and None otherwise.
# The r prefix makes the pattern a raw string (backslashes are not escapes).
matched = re.compile(r'hello').match('hellopy')
print(matched)

<_sre.SRE_Match object; span=(0, 5), match='hello'>


In [3]:
# 'hello' is present but not at position 0, so match() yields None.
matched = re.compile(r'hello').match('hi hello')
print(matched)

None


In [4]:
# [^0-9]+ consumes the leading run of non-digit characters.
matched = re.compile(r'[^0-9]+').match('abc012')
print(matched)

<_sre.SRE_Match object; span=(0, 3), match='abc'>


In [5]:
# The string starts with a digit, so the non-digit class cannot match.
matched = re.compile(r'[^0-9]+').match('012')
print(matched)

None


In [6]:
# group(n) returns the text captured by the n-th parenthesised subgroup.
# Subgroups are numbered starting at 1, counting left to right.
matched = re.compile(r'\$([0-9]+)').match("$900")
print(matched.group(1))

900


In [7]:
# Two capture groups: the name (group 1) and the age (group 2).
matched = re.compile(r'name: ([a-z]+), age: ([0-9]+)').match("name: john, age: 25")
# group() with several indices returns a tuple, unpacked here into print().
print(*matched.group(1, 2))

john 25


In [8]:
# re.search scans the string and matches at any position.
# It returns a match object on success and None otherwise.
matched = re.compile(r"[0-9]+\-[0-9]+\-[0-9]+").search("2020-01-01")
print(matched)

<_sre.SRE_Match object; span=(0, 10), match='2020-01-01'>


In [9]:
# One or more lowercase letters followed by any mix of letters and digits.
matched = re.compile(r"[a-z]+[a-z0-9]*").search("a123")
print(matched)

<_sre.SRE_Match object; span=(0, 4), match='a123'>


In [10]:
# search() finds the word even though it is not at the start of the string.
matched = re.compile(r"python").search("hello python")
print(matched)

<_sre.SRE_Match object; span=(6, 12), match='python'>


In [11]:
# Greedy matching: .* grabs as much as possible, so the capture runs
# up to the LAST </div> in the string.
html = "<div>a</div><div>b</div>"
matched = re.compile(r"<div>(.*)</div>").search(html)
matched.group(1)

'a</div><div>b'

In [12]:
# Non-greedy matching: .*? grabs as little as possible, so the capture
# stops at the FIRST </div>.
html = "<div>a</div><div>b</div>"
matched = re.compile(r"<div>(.*?)</div>").search(html)
matched.group(1)

'a'

In [13]:
# Difference between match and search:
# match only looks at the start of the string; if the start does not fit
# the pattern the match fails and None is returned.
# search walks through the whole string until it finds a match.
line = "Python is the best programming language"

# 'best' is not at the start, so match() gives None ...
matched = re.compile(r'best').match(line)
print(matched)
# ... while search() locates it inside the sentence.
matched = re.compile(r'best').search(line)
print(matched)

None
<_sre.SRE_Match object; span=(14, 18), match='best'>


# 实战项目 - 见PPT

In [14]:
import requests
from lxml import etree
# Scrape one page of second-hand housing listings and print, for each
# listing, its title, residential community, total price and unit price.

# User-Agent header sent along with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
}

url = "https://sh.lianjia.com/ershoufang/"
# Send the request and fetch the HTML of the page.
# The timeout prevents the script from hanging forever on a stalled
# connection; raise_for_status() aborts on an HTTP error status instead of
# going on to parse an error page.
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()
content = r.content.decode("utf-8")

# Parse the HTML.
root = etree.HTML(content)
li_nodes = root.xpath('//ul[@class="sellListContent"]/li[@data-lj_view_evtid]')
for li_node in li_nodes:
    # Locate the nodes holding the listing description, the community
    # name, the total price and the unit price.
    title_nodes = li_node.xpath('.//div[@class="title"]/a')
    xiaoqu_nodes = li_node.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/a')
    price_nodes = li_node.xpath('.//div[@class="priceInfo"]/div[@class="totalPrice"]/span')
    up_nodes = li_node.xpath('.//div[@class="priceInfo"]/div[@class="unitPrice"]/span')

    # A listing that lacks any of the expected nodes is skipped rather than
    # letting a [0] lookup raise IndexError and abort the whole loop.
    if not (title_nodes and xiaoqu_nodes and price_nodes and up_nodes):
        continue

    title = title_nodes[0].text
    xiaoqu = xiaoqu_nodes[0].text
    price = float(price_nodes[0].text)
    up_text = up_nodes[0].text

    # Use a regular expression to pull out the numeric part of the unit price.
    matched = re.search(r'单价(.*)元/平米', up_text)
    # Falls back to 0 when the text does not have the expected shape.
    up_price = 0
    if matched:
        # Convert the unit price from its string form to a float.
        up_price = float(matched.group(1))
    print("name: {}, xiaoqu: {}, total: {}, unit price: {}".format(title, xiaoqu, price, up_price))

name: 业主诚心出售，有钥匙，看房方便。, xiaoqu: 飞虹北村 , total: 105.0, unit price: 11291.0
name: 国家会展 徐泾地铁站诸光路地铁站小区环境好有钥匙, xiaoqu: 玉兰清苑 , total: 300.0, unit price: 43479.0
name: 花园位置，1梯2户，不靠高架，看房方便，93年样板小区, xiaoqu: 康强坊 , total: 368.0, unit price: 61181.0
name: 沪亭北路精装大两房 满五年 前后无遮挡 随签随看, xiaoqu: 知雅汇(公寓) , total: 340.0, unit price: 34854.0
name: 动迁房 精装修  中间位置 采光无遮挡 税费少, xiaoqu: 绿地新江桥城三期 , total: 318.0, unit price: 41466.0
name: 沿地铁南北通房源，小区环境好房龄新，业主诚意出售, xiaoqu: 金硕河畔景园西区 , total: 228.0, unit price: 31672.0
name: 一梯两户，双南两房，楼层好，位置好。, xiaoqu: 龙南六村 , total: 403.0, unit price: 57953.0
name: 房东底价.五一前有效。采光好.满五唯一.没有个税, xiaoqu: 东方花园(一期) , total: 558.0, unit price: 51325.0
name: 动迁满五税费少  中高楼层，全天采光 视野开阔, xiaoqu: 金沙鼎苑 , total: 295.0, unit price: 38862.0
name: 小区极少房源+户型方正+厅南北通全明+附带80平双露台, xiaoqu: 上海豪园 , total: 738.0, unit price: 47854.0
name: 店长推荐 高品质小区 开元物业，近体育公园, xiaoqu: 东方花园二期 , total: 719.0, unit price: 71104.0
name: 格局好，南北通户型，装修保养得好, xiaoqu: 中虹华苑 , total: 307.0, unit price: 42021.0
name: 底楼复式带花园  大天井采光好  满五唯一税费少装修好