In [1]:
import re

In [17]:
# Sample text searched by the regex demos in this notebook.
# It is a raw string so the lone backslash on the metacharacter line is kept
# literally instead of being parsed as an (invalid) "\ " escape sequence.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

cat
pat
mat
bat
'''

In [3]:
# Literal search: the pattern 'abc' matches exactly that character sequence.
pattern = re.compile(r'abc')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(1, 4), match='abc'>


In [4]:
# Slice the text with the span (1, 4) reported for the 'abc' match to confirm
# that the span indices map straight onto string slicing.
text_to_search[1:4]

'abc'

In [5]:
# An unescaped '.' is a metacharacter; '\.' matches a literal period only.
pattern = re.compile(r'\.')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(111, 112), match='.'>
<re.Match object; span=(146, 147), match='.'>
<re.Match object; span=(167, 168), match='.'>
<re.Match object; span=(171, 172), match='.'>
<re.Match object; span=(218, 219), match='.'>
<re.Match object; span=(249, 250), match='.'>
<re.Match object; span=(262, 263), match='.'>


## Regular Expression Quick Reference

- .       - Any Character Except New Line
- \d      - Digit (0-9)
- \D      - Not a Digit (0-9)
- \w      - Word Character (a-z, A-Z, 0-9, _)
- \W      - Not a Word Character
- \s      - Whitespace (space, tab, newline)
- \S      - Not Whitespace (space, tab, newline)

<br>

- \b      - Word Boundary
- \B      - Not a Word Boundary
- ^       - Beginning of a String
- $       - End of a String

<br>

- []      - Matches Characters in brackets
- [^ ]    - Matches Characters NOT in brackets
- |       - Either Or
- ( )     - Group
<br>

#### Quantifiers:
- `*`     - 0 or More
- +       - 1 or More
- ?       - 0 or One
- {3}     - Exact Number
- {3,4}   - Range of Numbers (Minimum, Maximum)


#### Sample Regexes ####

`[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+`

In [6]:
# '\b' asserts a word boundary, so only an 'Ha' that begins a word matches;
# the second 'Ha' inside 'HaHa' is not preceded by a boundary and is skipped.
pattern = re.compile(r'\bHa')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(66, 68), match='Ha'>
<re.Match object; span=(69, 71), match='Ha'>


In [7]:
# Single-line sample string that starts with 'Start' and finishes with 'end'.
sentence = 'Start a sentence and then bring it to an end'

In [8]:
# '^' anchors the match to the beginning of the string.
pattern = re.compile(r'^Start')
for match in pattern.finditer(sentence):
    print(match)

<re.Match object; span=(0, 5), match='Start'>


In [9]:
# '$' anchors the match to the end of the string.
pattern = re.compile(r'end$')
for match in pattern.finditer(sentence):
    print(match)

<re.Match object; span=(41, 44), match='end'>


In [10]:
# Three digits, any character, three digits, any character, four digits.
# The unescaped '.' accepts every separator, including '*'.
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [13]:
# '[-.]' is a character set: the separator must be a hyphen or a literal
# period (inside a set '.' is not a wildcard), so '123*555*1234' is excluded.
pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [14]:
# '[89]' matches a single character, either '8' or '9', so only numbers
# beginning with 800 or 900 are found.
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [None]:
# '[^a-zA-Z]' is a negated set: it matches any single character that is not
# in the ranges a-z or A-Z.
pattern = re.compile(r'[^a-zA-Z]')
for match in pattern.finditer(text_to_search):
    print(match)

In [None]:
# '[^b]at' matches any single character other than 'b' followed by 'at',
# e.g. 'cat', 'pat' and 'mat' but not 'bat'.
# (A trailing ']' after 'at' would be a literal bracket and prevent any match.)
pattern = re.compile(r'[^b]at')
for i in pattern.finditer(text_to_search):
    print(i)

In [22]:
# Quantifier form of '\d\d\d.\d\d\d.\d\d\d\d': '{3}' and '{4}' repeat the
# preceding '\d' an exact number of times.
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [None]:
# 'Mr', then an optional literal period ('\.?' means zero or one occurrence),
# then one whitespace character.
pattern = re.compile(r'Mr\.?\s')
for match in pattern.finditer(text_to_search):
    print(match)

In [None]:
# 'Mr' with an optional period and one whitespace character, followed by a
# name: one uppercase letter and then zero or more word characters ('\w*').
pattern = re.compile(r'Mr\.?\s[A-Z]\w*')
for match in pattern.finditer(text_to_search):
    print(match)

In [23]:
# The group '(r|s|rs)' is an alternation: after 'M' the title may continue
# with 'r', 's' or 'rs', covering Mr, Ms and Mrs.
pattern = re.compile(r'M(r|s|rs)\.?\s[A-Z]\w*')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>
<re.Match object; span=(260, 265), match='Mr. T'>


In [24]:
# Same matches as the 'M(r|s|rs)' form, with each title spelled out in full
# inside the alternation group.
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')
for i in pattern.finditer(text_to_search):
    print(i)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>
<re.Match object; span=(260, 265), match='Mr. T'>


In [15]:
# Read the full text of data.txt into `contents`; the context manager closes
# the file as soon as the read completes.
with open('data.txt') as data_file:
    contents = data_file.read()

In [12]:
# Find every phone-number-shaped run (3 digits, any separator, 3 digits,
# any separator, 4 digits) in the file contents.
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
for match in pattern.finditer(contents):
    print(match)

<re.Match object; span=(12, 24), match='615-555-7164'>
<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(191, 203), match='560-555-5153'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(378, 390), match='714-555-7405'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(557, 569), match='783-555-4799'>
<re.Match object; span=(647, 659), match='516-555-4615'>
<re.Match object; span=(740, 752), match='127-555-1867'>
<re.Match object; span=(831, 843), match='608-555-4938'>
<re.Match object; span=(917, 929), match='568-555-6051'>
<re.Match object; span=(1005, 1017), match='292-555-1875'>
<re.Match object; span=(1093, 1105), match='900-555-3205'>
<re.Match object; span=(1182, 1194), match='614-555-1166'>
<re.Match object; span=(1273, 1285), match='530-555-2676'>
<re.Match object; span=(1359, 1371), match='470-555-2750'>
<re.Match object; span=(1443, 1455), match='800-555-6089'>
<re.Match object; spa

In [16]:
# Compile the 800/900 pattern in this cell rather than relying on whatever
# `pattern` was last assigned elsewhere, so the cell gives the same result
# when the notebook is run top to bottom.
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')
for i in pattern.finditer(contents):
    print(i)

<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(1093, 1105), match='900-555-3205'>
<re.Match object; span=(1443, 1455), match='800-555-6089'>
<re.Match object; span=(1794, 1806), match='800-555-7100'>
<re.Match object; span=(2055, 2067), match='900-555-5118'>
<re.Match object; span=(2830, 2842), match='900-555-5428'>
<re.Match object; span=(3290, 3302), match='800-555-8810'>
<re.Match object; span=(3977, 3989), match='900-555-9598'>
<re.Match object; span=(4951, 4963), match='800-555-2420'>
<re.Match object; span=(5572, 5584), match='900-555-3567'>
<re.Match object; span=(6195, 6207), match='800-555-3216'>
<re.Match object; span=(6897, 6909), match='900-555-7755'>
<re.Match object; span=(7872, 7884), match='800-555-1372'>
<re.Match object; span=(8751, 8763), match='900-555-6426'>


In [25]:
# Sample addresses whose local parts differ (letters only, a dot, digits and
# hyphens) and whose domains end in .com, .edu and .net.
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

In [26]:
# Letters-only local part and domain with a fixed '.com' suffix.
pattern = re.compile(r'[a-zA-Z]+@[a-zA-Z]+\.com')
# findall returns the matched strings as a list.
pattern.findall(emails)

['CoreyMSchafer@gmail.com']

In [43]:
# Local part may contain letters, digits and hyphens; the domain may contain
# letters and hyphens; the suffix is any run of letters.
# NOTE: the local-part set has no '.', so 'corey.schafer@university.edu' is
# only matched starting from 'schafer@'.
pattern = re.compile(r'[a-zA-Z0-9-]+@[a-zA-Z-]+\.[a-zA-Z]+')
pattern.findall(emails)

['CoreyMSchafer@gmail.com',
 'schafer@university.edu',
 'corey-321-schafer@my-work.net']

In [52]:
# The suffix is restricted to the alternatives com, edu or net.
# NOTE: the local-part set has no '.', so 'corey.schafer@university.edu' is
# only matched starting from 'schafer@'.
pattern = re.compile(r'[a-zA-Z0-9-]+@[a-zA-Z-]+\.(com|edu|net)')
for match in pattern.finditer(emails):
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(31, 53), match='schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


In [51]:
# The local part may now contain '.', and the suffix must be '.com' or '.edu'.
# The dots in the suffix group are escaped so they match a literal period
# rather than any character.
pattern = re.compile(r'[a-zA-Z0-9.-]+@[a-zA-Z-]+(\.com|\.edu)')
for i in pattern.finditer(emails):
    print(i)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>


## Substitution

In [63]:
# Sample URLs mixing http/https and with or without a 'www.' prefix.
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

In [64]:
# s?        -> the 's' in 'https' is optional
# (www\.)?  -> group 1: an optional 'www.' prefix
# (\w+)     -> group 2: the domain name
# (\.\w+)   -> group 3: the dot plus the top-level domain
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
for match in pattern.finditer(urls):
    print(match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [72]:
# Print group 0 (the whole match) and then capture groups 1-3 for every URL,
# one pass per group, with a blank separator printed between passes.
LAST_GROUP = 3
for group_index in range(LAST_GROUP + 1):
    for match in pattern.finditer(urls):
        print(match.group(group_index))
    if group_index < LAST_GROUP:
        print('\n')

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov


www.
None
None
www.


google
coreyms
youtube
nasa


.com
.com
.com
.gov


In [66]:
# Replace every match with its groups 2 and 3 only ('\2\3' are backreferences),
# dropping the scheme and any 'www.' prefix.
# Assumes `pattern` is still the URL pattern compiled for `urls`.
print(pattern.sub(r'\2\3', urls))


google.com
coreyms.com
youtube.com
nasa.gov

