Regular expressions allow for pattern searching in text documents.

Every character type has a corresponding pattern code.

In [1]:
text = 'The phone number of the agent is 206-559-2212.'

In [2]:
'phone' in text #in function checks to see if somethin is in a string, list, etc. 

True

In [4]:
import re #regular expressions are built into Python

In [5]:
pattern = 'phone'

In [6]:
re.search(pattern, text) #what you want to search and what to search in

<re.Match object; span=(4, 9), match='phone'>

In [7]:
my_match = re.search(pattern, text)

In [9]:
my_match.span() #location of characters that match in the text

(4, 9)

In [10]:
my_match.start() # start of the match

4

In [11]:
my_match.end() # end of the match

9

In [13]:
new_text = 'My phone is a new phone' # pattern repeated in the text

In [14]:
re.search(pattern, new_text) #only finds first instance of the pattern

<re.Match object; span=(3, 8), match='phone'>

In [16]:
all_matches = re.findall(pattern, new_text) #not very helpful, but will find the pattern. pattern can also be 'phone'
all_matches

['phone', 'phone']

In [17]:
len(all_matches)

2

In [19]:
for match in re.finditer(pattern, new_text): #to find the span for each instance of the pattern
    print(match.span())

(3, 8)
(18, 23)


Practice using regular expressions

In [20]:
text

'The phone number of the agent is 206-559-2212.'

In [24]:
pattern2 = r'\d\d\d-\d\d\d-\d\d\d' #3 groups of 3 digits

In [23]:
phone_number = re.search(pattern2,text)
phone_number

<re.Match object; span=(33, 44), match='206-559-221'>

In [25]:
phone_number.group() # just what you are searching on

'206-559-221'

In [27]:
text2 = 'My phone numbers are 206-779-2122 and 270-444-9934.'

In [29]:
phone_number2 = re.search(pattern2,text2)
phone_number2 #only finds the first number

<re.Match object; span=(21, 32), match='206-779-212'>

In [32]:
phone_number2 = re.findall(pattern2, text2)
phone_number2 #returns both numbers

['206-779-212', '270-444-993']

Quantifiers

In [34]:
pattern3 = r'\d{3}-\d{3}-\d{4}'

In [43]:
phone_number2 = re.findall(pattern3,text2)
phone_number2

['206-779-2122', '270-444-9934']

Indexing

In [44]:
pattern4 = r'(\d{3})-(\d{3})-(\d{4})' #Use () to group

In [45]:
phone_number2 = re.search(pattern4,text2)
phone_number2.group(1)

'206'

In [51]:
for match in re.finditer(pattern4, text2): #0 or empty gets the whole phone number, 1 gets the area code, etc.
    print(match.group(2))

779
444


Pipe operator (or) and wildcards

In [52]:
re.search(r'man|woman', "There was a man here.") #pipe acts like an or statement

<re.Match object; span=(12, 15), match='man'>

In [55]:
re.findall('.at', 'The cat in the hat ate a rat that went splat.') #wildcard, did not grab e of ate because wildcard didn't specify, also it is only going one character prior

['cat', 'hat', ' at', 'rat', 'hat', 'lat']

In [56]:
re.findall('..at', 'The cat in the hat ate a rat that went splat.') #2 characters before (may include spaces)

[' cat', ' hat', ' rat', 'that', 'plat']

In [58]:
re.findall('..at.', 'The cat in the hat ate a rat that went splat.') #two spaces before and one after

[' cat ', ' hat ', ' rat ', 'that ', 'plat.']

^ starts with, $ ends with

In [63]:
re.findall(r'\d$', 'This phrase has 2 many of the number 3') #ends with a digit, notice $ is at the end

['3']

In [65]:
re.findall(r'^\d', '1 is the loneliest number.') #starts with a digit, notice ^ is at the beginning

['1']

In [None]:
Excluding things

In [68]:
phrase = "The numbers 4 and 8 repeatedly appear, as does the number 66."

In [69]:
re.findall(r'[\d]', phrase) #finds all the digits

['4', '8', '6', '6']

In [70]:
re.findall(r'[^\d]', phrase) # add ^ to exclude the numbers

['T',
 'h',
 'e',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'a',
 'n',
 'd',
 ' ',
 ' ',
 'r',
 'e',
 'p',
 'e',
 'a',
 't',
 'e',
 'd',
 'l',
 'y',
 ' ',
 'a',
 'p',
 'p',
 'e',
 'a',
 'r',
 ',',
 ' ',
 'a',
 's',
 ' ',
 'd',
 'o',
 'e',
 's',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 ' ',
 '.']

In [71]:
re.findall(r'[^\d]+', phrase) # add + to get the words reassembled

['The numbers ', ' and ', ' repeatedly appear, as does the number ', '.']

In [72]:
test_phrase = 'First, this is a sentence. Second, I need a hamburger! Agree?'

In [79]:
re.findall(r'[^!.?,]+', test_phrase) #exclude punctuation, commas separate the strings

['First', ' this is a sentence', ' Second', ' I need a hamburger', ' Agree']

In [77]:
sentence = re.findall(r'[^!.?,]+', test_phrase) #now the , is gone

In [78]:
' '.join(sentence)

'First  this is a sentence  Second  I need a hamburger  Agree'

In [86]:
next_text = 'Where are hyphen-words? His anal-retentive boyfriend. I leave at 2-ish.'

In [88]:
re.findall(r'[\w]+-[\w]+', next_text) # find alphanum before and after hyphens

['hyphen-words', 'anal-retentive', '2-ish']