# 正则表达式

### 常规匹配  

In [2]:
import re

# Exact matching: spell out every part of the target string in the pattern.
content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))

# Raw string so that \s, \d and \w reach the regex engine verbatim instead of
# being parsed as (invalid) Python string escapes.
result = re.match(r'^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$', content)
print(result)
print(result.group())  # the full matched text
print(result.span())   # (start, end) offsets of the match inside `content`

41
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)


### 泛匹配

In [4]:
import re

# Wildcard matching: pin down only the prefix and suffix and let `.*`
# stand in for everything between them.
content = 'Hello 123 4567 World_This is a Regex Demo'

result = re.match(pattern='^Hello.*Demo$', string=content)
# One value per line: the match object, the matched text, the (start, end) span.
print(result, result.group(), result.span(), sep='\n')

<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)


### 匹配目标

In [6]:
import re

# Capturing a target: wrap the part of interest in parentheses.
content = 'Hello 1234567 World_This is a Regex Demo'

# Raw string keeps \s and \d intact for the regex engine (no invalid
# string-escape warnings).
result = re.match(r'^Hello\s(\d+)\sWorld.*Demo$', content)
print(result)
# When the pattern contains (), group(n) selects the n-th captured group.
print(result.group(1))
print(result.span())

<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(0, 40)


### 贪婪匹配

In [9]:
import re

# Greedy matching demo.
content = 'Hello 1234567 World_This is a Regex Demo'

# The first `.*` is greedy: it swallows as many characters as it can while
# still letting the rest of the pattern match, so (\d+) is left with only the
# last digit. Raw string avoids invalid string-escape warnings for \d.
result = re.match(r'^He.*(\d+).*Demo$', content)
print(result)
print(result.group(1))
print(result.span())

<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
7
(0, 40)


### 非贪婪匹配

In [10]:
import re

# Non-greedy (lazy) matching demo.
content = 'Hello 1234567 World_This is a Regex Demo'

# `.*?` is lazy: it consumes as few characters as possible, so (\d+) captures
# the entire run of digits. Raw string avoids invalid string-escape warnings.
result = re.match(r'^He.*?(\d+).*Demo$', content)
print(result)
print(result.group(1))
print(result.span())

<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(0, 40)


### 匹配模式

默认情况下,`.` 无法匹配换行符。

In [13]:
import re

# Demo: without re.S, `.` does not match a newline.
content = '''Hello 1234567 World_This
is a Regex Demo'''

# The text spans two lines, so `.*?` cannot cross the line break and the match
# fails (result is None). Raw string avoids invalid string-escape warnings.
result = re.match(r'^He.*?(\d+).*?Demo$', content)
print(result)


None


传入 `re.S`(即 `re.DOTALL`)后,`.` 也可以匹配换行符。

In [14]:
import re

# Demo: with re.S, `.` also matches newlines.
content = '''Hello 1234567 World_This
is a Regex Demo'''

# re.S lets `.*?` run across the line break, so the two-line text now matches.
# Raw string avoids invalid string-escape warnings for \d.
result = re.match(r'^He.*?(\d+).*?Demo$', content, re.S)
print(result.group(1))


1234567


### 转义

In [15]:
import re

# Escaping demo, part 1: the pattern is deliberately left unescaped.
content = 'price is $5.00'
# `$` and `.` are regex metacharacters here, not literal characters, so the
# pattern does not match the literal price text and the result is None.
result = re.compile('price is $5.00').match(content)
print(result)

None


In [16]:
import re

# Escaping demo, part 2: backslash-escape the metacharacters.
content = 'price is $5.00'
# \$ and \. match a literal dollar sign and dot. The raw string is required so
# Python does not treat them as (invalid) string escape sequences.
result = re.match(r'price is \$5\.00', content)
print(result)

<_sre.SRE_Match object; span=(0, 14), match='price is $5.00'>


总结:
尽量使用**泛匹配**,使用括号得到匹配目标,尽量使用**非贪婪模式**,字符串中有**换行符时使用 re.S**。

### re.search

`re.search` 会扫描整个字符串,并返回第一个成功的匹配。

`re.match` 只从字符串开头开始匹配,如果开头匹配不上则匹配失败。

In [17]:
import re

# re.match only tries to match at the very start of the string.
content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'
# `content` begins with 'Extra stings', not 'Hello', so this returns None.
# Raw string avoids invalid string-escape warnings for \d.
result = re.match(r'Hello.*?(\d+).*?Demo', content)
print(result)

None


In [18]:
import re

# re.search scans the whole string and returns the first match it finds.
content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'
# The same pattern that fails with re.match succeeds here because the match
# may start anywhere. Raw string avoids invalid string-escape warnings for \d.
result = re.search(r'Hello.*?(\d+).*?Demo', content)
print(result)

<_sre.SRE_Match object; span=(13, 53), match='Hello 1234567 World_This is a Regex Demo'>


总结:为匹配方便,能用search就不用match

### re.findall

`re.findall` 找到字符串中所有的匹配,并以列表形式返回。

### re.sub

替换字符串中每个匹配的子串,并返回替换后的字符串。

In [19]:
import re

# re.sub with an empty replacement deletes every match.
content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'
# Strip every run of digits. Note the two spaces that surrounded the number
# both remain. Raw string avoids invalid string-escape warnings for \d.
content = re.sub(r'\d+', '', content)
print(content)

Extra stings Hello  World_This is a Regex Demo Extra stings


In [20]:
import re

# re.sub replaces every match with the given replacement text.
content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'
# Swap every run of digits for the word 'Replacement'. Raw string avoids
# invalid string-escape warnings for \d.
content = re.sub(r'\d+', 'Replacement', content)
print(content)

Extra stings Hello Replacement World_This is a Regex Demo Extra stings


### re.compile

将正则表达式编译成正则对象,以便复用

In [21]:
import re

# Compile the regex once so the resulting pattern object can be reused.
content = '''Hello 1234567 World_This
is a Regex Demo'''
pattern = re.compile('Hello.*Demo', flags=re.S)
# Calling match() on the compiled object is equivalent to re.match(pattern, content).
result = pattern.match(content)

print(result)

<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This\nis a Regex Demo'>


# 实战练习:爬取豆瓣读书页面的图书信息

In [None]:
import requests
import re

# Fetch the page source of the Douban Books home page.
# - timeout: keeps the cell from hanging indefinitely on a stalled connection.
# - raise_for_status(): fail loudly on an HTTP error instead of silently
#   running the regex over an error page and printing nothing.
# - NOTE(review): Douban reportedly rejects the default python-requests
#   User-Agent, hence the browser-like header — confirm against the live site.
response = requests.get(
    'https://book.douban.com/',
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=10,
)
response.raise_for_status()
content = response.text

# One tuple per <li> entry: (link URL, title, author, year).
# re.S lets `.*?` cross line breaks, since each entry spans several lines of HTML.
pattern = re.compile(r'<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?more-meta.*?author">(.*?)</span>.*?year">(.*?)</span>.*?</li>', re.S)

# Use the compiled object's own findall() so the compiled pattern is reused directly.
results = pattern.findall(content)

for result in results:
    url, title, author, date = result
    # The captured author/year text is surrounded by whitespace and newlines
    # in the HTML; strip all of it. Raw strings keep \s valid.
    author = re.sub(r'\s', '', author)
    date = re.sub(r'\s', '', date)
    print(url, title, author, date)