## Regular Expressions

https://docs.python.org/3/library/re.html

In [1]:
# Read the whole address file into one string.
# The `with` block closes the file handle as soon as the read is done,
# instead of leaving it open for the lifetime of the kernel.
with open("mcdonalds_addresses.txt") as text_file:
    locs = text_file.read()

In [2]:
# print() renders the newlines in the text, so each address appears on its own line
print(locs)

Urban Eatery 260 Yonge Street Unit J-002, Toronto, ON, M5B 2L9
123 Yonge ST Toronto, ON, M5C 1W4
127 Church Street Toronto, M5C 2G5
5201 Duke Street Halifax, NS, B3J 1N9
5675 Spring Garden Road STE G08
895 Rue De La Gauchetiere Ouest Montreal, QC, H3B 4G1
77 Bank Street Ottawa, ON, K1P 5N2
180 Boul. Maisonneuve Gatineau, QC, J8X 3N5
891 Granville St. Vancouver, BC, V6Z 1K7
UNIT 5 Waterfront Centre 200 Burrard St. Vancouver, BC, V6C 3K1



In [3]:
# get all postal codes ("pincodes")
# this is the traditional, manual way - NOT RECOMMENDED. Instead, use Regular Expressions.
# Slide a 7-character window over the text and print every window shaped like "XXX XXX".
# The range stops at the last position where a full 7-character window still fits,
# so a truncated window at the very end of the text can never be printed.
for a in range(len(locs) - 6):
    chunk = locs[a:a+7]
    if (
        chunk.isupper()                 # no lower-case letters, at least one upper-case letter
        and chunk[3] == ' '             # single space in the middle
        and ',' not in chunk            # no punctuation from the surrounding address
        and ' ' not in chunk[0:3]       # first half has no spaces
        and ' ' not in chunk[4:]        # second half has no spaces
    ):
        print(chunk)


M5B 2L9
M5C 1W4
M5C 2G5
B3J 1N9
STE G08
H3B 4G1
K1P 5N2
J8X 3N5
V6Z 1K7
V6C 3K1


In [4]:
# using regular expressions to solve the problem above
# a regular expression defines a search pattern
import re

# Pass the pattern as a raw string so backslashes reach the regex engine untouched.
# Every postal code in the data alternates letter-digit-letter, a space, digit-letter-digit.
# A looser pattern such as '[A-Z0-9]{3} [A-Z0-9]{3}' also accepts 'STE G08',
# which is part of a street address, not a postal code.
# \b on both sides keeps the match from starting or ending inside a longer token.
regex = re.compile(r'\b[A-Z][0-9][A-Z] [0-9][A-Z][0-9]\b')

# applying the pattern: findall() returns every non-overlapping match as a list
regex.findall(locs)

['M5B 2L9',
 'M5C 1W4',
 'M5C 2G5',
 'B3J 1N9',
 'H3B 4G1',
 'K1P 5N2',
 'J8X 3N5',
 'V6Z 1K7',
 'V6C 3K1']

In [5]:
# Sample paragraph that the search examples in the following cells run against.
some_text = """The estimated population of state of California is 39.25 million,
while that of the state of Texas is 27.86 million.
While SanAntonio is in the State of Texas (US),
SanDiego, SanJose and SanFranscisco is in the state of California (US).
Cities in the US generally have numeric pincodes such as 95002 or 48201.
On the other hand, cities in Canada have alpha-numeric zip codes, such as M5B2L9 or Z6Z1K7.
"""

In [6]:
# a bare expression displays the string's repr, so line breaks show up as \n
some_text

'The estimated population of state of California is 39.25 million,\nwhile that of the state of Texas is 27.86 million.\nWhile SanAntonio is in the State of Texas (US),\nSanDiego, SanJose and SanFranscisco is in the state of California (US).\nCities in the US generally have numeric pincodes such as 95002 or 48201.\nOn the other hand, cities in Canada have alpha-numeric zip codes, such as M5B2L9 or Z6Z1K7.\n'

In [7]:
# search() scans the text and returns a match object for the first occurrence only
reg1 = re.compile(r'million')
reg1.search(some_text)

<_sre.SRE_Match object; span=(57, 64), match='million'>

In [8]:
# search() returns None when nothing matches; None is a singleton, so test identity with `is`
reg1.search(locs) is None

True

In [9]:
reg1.findall(some_text) # returns every non-overlapping match as a list of strings

['million', 'million']

#### re - Regular Expression Operations

In [10]:
reg2 = re.compile(r'\w') # \w matches one "word" character: a letter, a digit or an underscore
reg2.search(some_text)

<_sre.SRE_Match object; span=(0, 1), match='T'>

In [11]:
reg3 = re.compile(r'\w\w\w') # three \w in a row: matches 3 consecutive word characters
reg3.search(some_text)

<_sre.SRE_Match object; span=(0, 3), match='The'>

In [12]:
reg4 = re.compile(r'\w{3}') # same as \w\w\w, written with the {3} repetition quantifier
reg4.search(some_text)

<_sre.SRE_Match object; span=(0, 3), match='The'>

In [13]:
# \W (capital W) is the inverse of \w: it matches any character that is
# NOT a letter, a digit or an underscore.
reg5 = re.compile(r'\W{3}') # 3 consecutive non-word characters
reg5.search(some_text)

<_sre.SRE_Match object; span=(162, 165), match='),\n'>

In [14]:
reg6 = re.compile(r'\d{3}') # matches 3 contiguous digits
reg6.search(some_text)

<_sre.SRE_Match object; span=(294, 297), match='950'>

In [15]:
reg7 = re.compile(r'\D{3}') # \D is the inverse of \d: matches 3 consecutive non-digit characters
reg7.search(some_text)

<_sre.SRE_Match object; span=(0, 3), match='The'>

In [16]:
# | inside a group is an either-or (alternation): this matches SanAntonio or SanDiego
reg8 = re.compile(r'San(Antonio|Diego)')
reg8.search(some_text) # search() reports the first occurrence only

<_sre.SRE_Match object; span=(123, 133), match='SanAntonio'>

In [17]:
# when the pattern contains a capturing group, findall() returns the text of the group,
# not the whole match - hence no 'San' prefix in the result
reg8.findall(some_text)

['Antonio', 'Diego']

#### ? - matches the preceding group 0 or 1 times

In [18]:
# (ab)? also succeeds by matching nothing, so findall() yields '' at every
# position where "ab" does not start
reg9 = re.compile(r'(ab)?')
reg9.findall("abaaabbabaab")

['ab', '', '', 'ab', '', 'ab', '', 'ab', '']

#### * - matches the preceding group 0 or more times

In [19]:
# (ab)* can match zero repetitions (the '' entries) or a whole run such as "abab";
# findall() reports only the last repetition captured by the group, so a run still shows as 'ab'
reg11 = re.compile(r'(ab)*')
reg11.findall("abaaabbabaabab")

['ab', '', '', 'ab', '', 'ab', '', 'ab', '']

In [20]:
reg12 = re.compile(r'(ab){2}')
# matches the group exactly 2 times in a row ("abab");
# findall() shows the group's last capture, which is why the result is 'ab'
reg12.findall("abaaabbabaabab")

['ab']

In [21]:
# + matches the preceding group 1 or more times
reg13 = re.compile(r'(San)+')
reg13.findall(some_text)

['San', 'San', 'San', 'San']

In [22]:
# NOTE: this rebinds reg13, replacing the pattern compiled in the previous cell
reg13 = re.compile(r'(San\w+)') # \w+ extends the match to the rest of the word
reg13.findall(some_text)

['SanAntonio', 'SanDiego', 'SanJose', 'SanFranscisco']

#### Customized Character Classes

In [23]:
# [...] matches any one of the listed characters; {2} requires two of them in a row
reg14 = re.compile(r'[aeiou]{2}')
reg14.findall(some_text)

['io', 'ia', 'io', 'io', 'io', 'ie', 'ia', 'ie', 'ie']

#### Pattern matching at the beginning of the string

In [24]:
reg15 = re.compile(r'^Hello') # ^ anchors the match at the start: the string must begin with 'Hello'
reg15.findall('Hello World!')

['Hello']

In [25]:
# 'Hello' is present but not at the start of the string, so nothing matches
reg15.findall('Say Hello to him')

[]

#### Pattern matching at the end of the string

In [26]:
reg16 = re.compile(r'World$') # $ anchors the match at the end: the string must end with 'World'
reg16.findall('Hello World')

['World']

In [27]:
# no match: the string does not end with 'World' (and 'world' here is lower-case)
reg16.findall('The world is flat')

[]

In [28]:
# ^ and $ together: the entire string must consist of one or more digits and nothing else
reg17 = re.compile(r'^\d+$')
reg17.findall('21')

['21']

In [29]:
# no match: characters other than digits follow '21'
reg17.findall('21 years old')

[]

#### . - Wildcard Character

In [30]:
# . matches any single character (except a newline), so this finds
# exactly one character followed by 'ing' - not the whole word
reg18 = re.compile(r'.ing')
reg18.findall('She was laughing, dancing, playing')

['hing', 'cing', 'ying']

#### String Substitution

In [31]:
reg19 = re.compile(r'Street \d+') # 'Street', a space, then one or more digits
reg19.findall('Street 31 is close to Street 22')

['Street 31', 'Street 22']

In [32]:
# sub() replaces every match with the replacement text;
# '*' has no special meaning in a replacement string, so the asterisks are inserted literally
reg19.sub(r'Street *****', 'Street 31 is close to Street 22')

'Street ***** is close to Street *****'