# REGEX

- Regular expression matching
- Cheat sheet: https://cheatography.com/mutanclan/cheat-sheets/python-regular-expression-regex/

In [1]:
import re
import pandas as pd

### Search and Match
https://docs.python.org/3/library/re.html#search-vs-match

In [2]:
# re.match only tries the pattern at the very start of the string; "abcdef" does
# not start with "c", so this returns None and the cell shows no output.
re.match("c", "abcdef")

In [3]:
# re.search scans through the string and reports the first place the pattern
# matches -- here the "c" at index 2.
re.search("c", "abcdef")

<re.Match object; span=(2, 3), match='c'>

### Matching objects within a string:

In [4]:
# Sample address used by the findall examples that follow.
addr = '7322 Silverhorn Drive, Evergreen, Colorado, 80439'

##### get all the numbers out of the address using regex

In [5]:
# Two spellings of "one or more consecutive digits"; for this address they
# return the same list. A cell only displays its last expression, so just the
# second result is shown.
re.findall('[0-9]+', addr)
# Raw string: in a plain literal '\d' is an invalid string escape that newer
# Python versions warn about; r'\d+' hands the backslash to the regex engine.
re.findall(r'\d+', addr)

['7322', '80439']

##### get all the capital letters out of the address using regex

In [6]:
# [A-Z] matches a single uppercase ASCII letter, so every capital comes back as
# its own list item.
re.findall('[A-Z]', addr)

['S', 'D', 'E', 'C']

##### get all the lower case letters out of the address using regex

In [7]:
# [a-z] matches a single lowercase ASCII letter per hit; digits, capitals,
# spaces and commas are skipped.
re.findall('[a-z]', addr)

['i',
 'l',
 'v',
 'e',
 'r',
 'h',
 'o',
 'r',
 'n',
 'r',
 'i',
 'v',
 'e',
 'v',
 'e',
 'r',
 'g',
 'r',
 'e',
 'e',
 'n',
 'o',
 'l',
 'o',
 'r',
 'a',
 'd',
 'o']

##### find the town using regex capture groups (the town is the third group)

In [3]:
addr = '7322 Silverhorn Drive, Evergreen, Colorado, 80439'

# Raw strings keep \d and \s intact for the regex engine (in a plain literal
# they are invalid string escapes that newer Python versions warn about).
pattern = r'\d+\s.+,\s.+,\s.+,\s\d+'
# Same pattern with one capture group per field:
# street number, street name, city (the town), state, zip code.
capture = r'(\d+)\s(.+),\s(.+),\s(.+),\s(\d+)'

# Without groups, findall returns the whole matched text. A cell displays only
# its last expression, so this is checked instead of computed and thrown away.
assert re.findall(pattern, addr) == [addr]

re.search(capture, addr)

<re.Match object; span=(0, 49), match='7322 Silverhorn Drive, Evergreen, Colorado, 80439>

In [4]:
# group(0) is the entire text matched by the pattern.
re.search(capture, addr).group(0)

'7322 Silverhorn Drive, Evergreen, Colorado, 80439'

In [10]:
# group(1) is the first parenthesised group of `capture`: the street number.
re.search(capture, addr).group(1)

'7322'

In [11]:
# group(2) is the second parenthesised group of `capture`: the street name.
re.search(capture, addr).group(2)

'Silverhorn Drive'

In [12]:
# group(3) is the third parenthesised group of `capture`: the town.
re.search(capture, addr).group(3)

'Evergreen'

In [13]:
# group(4) is the fourth parenthesised group of `capture`: the state.
re.search(capture, addr).group(4)

'Colorado'

In [14]:
# group(5) is the fifth parenthesised group of `capture`: the zip code.
re.search(capture, addr).group(5)

'80439'

##### Turn a list of addresses into a dataframe using regex

In [15]:
# Sample input: four addresses, each written as
# "<number> <street>, <city>, <state>, <zip>".
addresses = [
    '7322 Silverhorn Drive, Evergreen, Colorado, 80439',
    '13400 Over There Street, Fort Collins, Colorado, 80218',
    '1560 Market Street, Denver, Colorado, 80202',
    '7413 South Parfet Court, Boulder, Colorado, 80303',
]

In [16]:
# One capture group per output column: street number, street name, city,
# state, zip code. Raw string so \d and \s reach the regex engine untouched;
# compiled once because the pattern is the same for every address.
capture = r'(\d+)\s(.+),\s(.+),\s(.+),\s(\d+)'
address_re = re.compile(capture)

street_number = []
street_name = []
city = []
state = []
zipcode = []

# The loop variable is deliberately not called `addr`, so the single-address
# example defined earlier in the notebook is not overwritten by this cell.
for address in addresses:
    matches = address_re.search(address)
    if matches is None:
        # Name the offending input instead of failing with an opaque
        # AttributeError on None.
        raise ValueError(f"address does not match the expected format: {address!r}")

    street_number.append(matches.group(1))
    street_name.append(matches.group(2))
    city.append(matches.group(3))
    state.append(matches.group(4))
    zipcode.append(matches.group(5))

# Column name -> list of values, one entry per address, in input order.
d = {
    'street_number': street_number,
    'street_name': street_name,
    'city': city,
    'state': state,
    'zipcode': zipcode,
}

pd.DataFrame(d)

Unnamed: 0,street_number,street_name,city,state,zipcode
0,7322,Silverhorn Drive,Evergreen,Colorado,80439
1,13400,Over There Street,Fort Collins,Colorado,80218
2,1560,Market Street,Denver,Colorado,80202
3,7413,South Parfet Court,Boulder,Colorado,80303


### Making a phonebook and `re.split`

In [17]:
# Raw phonebook: one "<first> <last>: <phone> <street address>" entry per line.
# The newlines are spelled out because they are what the entries are split on.
text = (
    "Ross McFluff: 834.345.1254 155 Elm Street\n"
    "Ronald Heathmore: 892.345.3428 436 Finley Avenue\n"
    "Frank Burger: 925.541.7625 662 South Dogwood Way\n"
    "Heather Albrecht: 548.326.4584 919 Park Place"
)

text

'Ross McFluff: 834.345.1254 155 Elm Street\nRonald Heathmore: 892.345.3428 436 Finley Avenue\nFrank Burger: 925.541.7625 662 South Dogwood Way\nHeather Albrecht: 548.326.4584 919 Park Place'

In [18]:
# One phonebook entry per line: cut the text at every newline.
to_list = re.split(pattern="\n", string=text)
to_list

['Ross McFluff: 834.345.1254 155 Elm Street',
 'Ronald Heathmore: 892.345.3428 436 Finley Avenue',
 'Frank Burger: 925.541.7625 662 South Dogwood Way',
 'Heather Albrecht: 548.326.4584 919 Park Place']

In [19]:
# Take the first phonebook line to try the split on a single entry.
test = to_list[0]
test

'Ross McFluff: 834.345.1254 155 Elm Street'

In [20]:
# Split on ": " or on a single whitespace character, at most three times, so
# the street address (which contains spaces itself) stays whole in the last
# field. Raw string keeps \s intact; maxsplit is passed by keyword because
# passing it positionally is deprecated in recent Python versions.
re.split(r':\s|\s', test, maxsplit=3)

['Ross', 'McFluff', '834.345.1254', '155 Elm Street']

In [21]:
# Apply the same three-way split to every phonebook line, giving
# [first name, last name, phone, street address] per entry.
# Raw string keeps \s intact; maxsplit is passed by keyword because passing it
# positionally is deprecated in recent Python versions.
addrs = [re.split(r":\s|\s", line, maxsplit=3) for line in to_list]
addrs

[['Ross', 'McFluff', '834.345.1254', '155 Elm Street'],
 ['Ronald', 'Heathmore', '892.345.3428', '436 Finley Avenue'],
 ['Frank', 'Burger', '925.541.7625', '662 South Dogwood Way'],
 ['Heather', 'Albrecht', '548.326.4584', '919 Park Place']]

In [22]:
# Each inner list becomes one row; the labels name the four split fields.
pd.DataFrame(
    addrs,
    columns=['fName', 'lName', 'Phone', 'Address'],
)

Unnamed: 0,fName,lName,Phone,Address
0,Ross,McFluff,834.345.1254,155 Elm Street
1,Ronald,Heathmore,892.345.3428,436 Finley Avenue
2,Frank,Burger,925.541.7625,662 South Dogwood Way
3,Heather,Albrecht,548.326.4584,919 Park Place
