In [1]:
# Sample sentence containing a phone number; used as the search target in the next cells.
text = "The agent's phone number is 408-555-1234. Call soon!"

In [2]:
# Plain substring test with `in` -- no regex is needed to check for a literal word.
'phone' in text

True

In [3]:
import re

In [4]:
# A pattern with no special characters matches exactly the text 'phone'.
pattern = 'phone'

In [5]:
# re.search scans the string and returns a Match object for the first place the pattern matches.
re.search(pattern,text)

<re.Match object; span=(12, 17), match='phone'>

In [6]:
# A pattern that does not occur anywhere in `text`, to show the no-match case.
pattern = 'not in the text'

In [7]:
re.search(pattern,text) # returns None when there is no match, so the cell displays nothing

In [8]:
# Switch back to a pattern that is present in `text`.
pattern = 'phone'

In [9]:
# Keep the Match object so its position can be inspected in the following cells.
match = re.search(pattern,text)

In [10]:
# Display the Match object: its repr shows the span and the matched text.
match

<re.Match object; span=(12, 17), match='phone'>

In [11]:
# (start, end) indices of the match within `text`; the end index is exclusive.
match.span()

(12, 17)

In [12]:
# Index of the first matched character.
match.start()

12

In [13]:
# Index just past the last matched character.
match.end()

17

In [14]:
# New sample text (replaces the earlier `text`) in which 'phone' appears twice.
text = 'my phone once, my phone twice'

In [15]:
matches = re.findall('phone',text) # every non-overlapping match, returned as a list of strings

In [16]:
# One list entry per occurrence of 'phone'.
matches

['phone', 'phone']

In [17]:
# finditer yields one Match object per occurrence (findall only gives the strings).
# The loop variable is named `m` so it does not overwrite the `match` result
# kept from the earlier re.search cell.
for m in re.finditer('phone',text):
    print(m)          # full Match repr: span and matched text
    print(m.group())  # just the matched string

<re.Match object; span=(3, 8), match='phone'>
phone
<re.Match object; span=(18, 23), match='phone'>
phone


In [18]:
# New sample text (replaces the earlier `text`) ending in a phone number.
text = 'My phone number is 408-555-7777'

In [19]:
# \d matches one digit and {n} repeats it exactly n times:
# three digits, a hyphen, three digits, a hyphen, four digits.
phone = re.search(r'\d{3}-\d{3}-\d{4}',text)

In [20]:
# Display the Match object for the phone number.
phone

<re.Match object; span=(19, 31), match='408-555-7777'>

In [21]:
# re.compile turns the pattern string into a reusable Pattern object.
# Each pair of parentheses is a capturing group, so the three parts of the
# number can be retrieved individually from a match.
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [22]:
# Search with the compiled pattern's own method; the Match object carries the three groups.
results = phone_pattern.search(text)

In [23]:
# group() with no argument returns the entire matched text.
results.group()

'408-555-7777'

In [24]:
# Groups are numbered from 1 (group() / group(0) is the whole match).
# Group 1 is the first three digits -- useful for pulling out the area code.
results.group(1)

'408'

In [25]:
# Second group: the middle three digits.
results.group(2)

'555'

In [26]:
# Third group: the last four digits.
results.group(3)

'7777'

In [27]:
# phone_pattern defines only three groups, so asking for group 4 raises IndexError.
# Catch it and print the message so the error is still demonstrated without
# halting a Restart & Run All.
try:
    results.group(4)
except IndexError as err:
    print(f'{type(err).__name__}: {err}')

IndexError: no such group

In [28]:
# | is alternation ("or"): the pattern matches either 'cat' or 'dog'.
# re.search reports the first match found scanning left to right -- here 'dog'.
re.search(r'cat|dog','The dog is here')

<re.Match object; span=(4, 7), match='dog'>

In [29]:
# . is a wildcard that matches any single character (except a newline by default).
# Three dots require exactly three characters before 'at'; spaces count,
# which is why the results include 'e cat' and 'e hat'.
re.findall(r'...at','The cat in the hat went splat..')

['e cat', 'e hat', 'splat']

In [30]:
# ^ anchors the pattern to the start of the string:
# this matches a digit only if it is the very first character.
re.findall(r'^\d','1 is a number')

['1']

In [31]:
# $ anchors the pattern to the end of the string:
# this matches a digit only if it is the last character.
re.findall(r'\d$','That is 42')

['2']

In [32]:
# Sample sentence with numbers mixed in between the words.
phrase = 'there are 3 numbers 34 inside 5 this sentence'

In [33]:
# [^...] is a negated character class: [^\d] matches any character that is NOT a digit.
# + takes runs of one or more such characters, so findall returns the pieces
# of text that lie between the numbers.
pattern = r'[^\d]+'
re.findall(pattern,phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [34]:
# Sample sentence containing punctuation to strip out.
test_phrase = 'This is a string! But it has punctuation. How to remove?'

In [35]:
# [^!.? ]+ matches runs of characters that are none of '!', '.', '?' or a space,
# so findall returns the individual words with that punctuation removed.
clean = re.findall(r'[^!.? ]+',test_phrase)
clean

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'to',
 'remove']

In [36]:
# Rebuild a single string from the words, separated by single spaces.
' '.join(clean)

'This is a string But it has punctuation How to remove'

In [37]:
# Two sentences, each containing one hyphenated word, glued into a single
# search text (replaces the earlier `text`).
t1 = 'Only find the hypen-words in this sentence. '
t2 = 'But you do not know how long-ish they are.'
text = f'{t1}{t2}'

In [38]:
# \w matches one word character (letter, digit or underscore);
# + means it occurs one or more times.
# Whole pattern: word characters, a hyphen, then more word characters.
# (The square brackets around \w are optional here.)
pattern = r'[\w]+-[\w]+'

In [39]:
# Collect every hyphenated word in the combined text.
re.findall(pattern,text)

['hypen-words', 'long-ish']

In [40]:
# Three sentences, each containing a different word that starts with 'cat'
# (`text` replaces the earlier value).
text, texttwo, textthree = (
    'Hello, would you like some catfish?',
    'Hello, would you like to take a catnap?',
    'Hello, have you seen this caterpillar?',
)

In [41]:
# Parentheses group the alternatives: 'cat' must be followed by
# one of 'fish', 'nap' or 'claw'.
re.search(r'cat(fish|nap|claw)',text)

<re.Match object; span=(27, 34), match='catfish'>

In [42]:
# Same pattern on the second sentence: matches via the 'nap' alternative.
re.search(r'cat(fish|nap|claw)',texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [43]:
# 'caterpillar' starts with 'cat', but none of the listed alternatives follows it,
# so there is no match and re.search returns None.
re.search(r'cat(fish|nap|claw)',textthree)