Regular Expression Patterns

Character Identifiers

\d digit
\w Alphanumeric also includes underscore
\s white space
\D A non digit
\W Non-alphanumeric 
\S Non-whitespace

In [3]:
import re

# Sample sentence used by the phone-number searches in the following cells.
# NOTE: the spelling 'numberic' is left untouched on purpose; the match spans
# displayed by later cells are offsets into this exact string.
text = 'My phone numberic is 123-456-7890'

# An empty pattern matches the empty string at the very start of the text,
# so this search succeeds with a zero-length match at span (0, 0).
phone = re.search('',text)

In [4]:
phone

<re.Match object; span=(0, 0), match=''>

In [5]:
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d',text)

In [6]:
phone

<re.Match object; span=(21, 33), match='123-456-7890'>

In [7]:
phone.group()

'123-456-7890'

# Quantifiers let you avoid writing the same pattern multiple times

+ one or more
{3} occurs exactly 3 times
{2,4} occurs 2 to 4 times
{3,} occurs 3 or more
* occurs zero or more times


In [9]:
phone = re.search(r'\d{3}-\d{3}-\d{4}',text)

In [10]:
phone

<re.Match object; span=(21, 33), match='123-456-7890'>

In [11]:
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [12]:
# Search with the compiled pattern's own method; the match object is the same
# one re.search(phone_pattern, text) would return.
results = phone_pattern.search(text)

In [14]:
results.group()

'123-456-7890'

In [15]:
results.group(1)

'123'

In [16]:
results.group(2)

'456'

In [17]:
# The pattern defines only three capture groups, so asking for a fourth raises
# IndexError. Catch it and show the message so the demonstration stays visible
# without aborting a "Restart & Run All".
try:
    results.group(4)
except IndexError as err:
    print(f'IndexError: {err}')

IndexError: no such group

In [18]:
#Additional Regex Syntax

re.search(r'cat','The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [19]:
re.search(r'cat|dog','The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [20]:
re.findall(r'at','The cat in the hat sat there.')

['at', 'at', 'at']

In [21]:
re.findall(r'.at','The cat in the hat sat there.')

['cat', 'hat', 'sat']

In [22]:
re.findall(r'...at','The cat in the hat went splat')

['e cat', 'e hat', 'splat']

In [23]:
re.findall(r'^\d','1 is a number')

['1']

In [24]:
re.findall(r'^\d','The 1 is a number')

[]

In [25]:
re.findall(r'\d$','2 is a number')

[]

In [26]:
re.findall(r'\d$','number is 2')

['2']

In [27]:
#exclusion

phrase = 'there are 3 numbers 34 inside 5 this sentence'

In [30]:
pattern = r'[^\d]+'

In [31]:
re.findall(pattern,phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [32]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [35]:
clean = re.findall(r'[^!.? ]+',test_phrase)

In [36]:
clean

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [37]:
' '.join(clean)

'This is a string But it has punctuation How can we remove it'

In [38]:
#inclusion

text = 'only find the hypen-words in this sentence. But you do not know how long-ish they are here'

In [41]:
pattern = r'[\w]+-[\w]+'

In [42]:
re.findall(pattern,text)

['hypen-words', 'long-ish']

In [43]:
text = 'Hello, would you like some catfish'
texttwo = 'Hello, would you like to take catnap'
textthree = 'Hello,have you seen this caterpillar?'

In [44]:
re.search(r'cat(fish|nap|erpillar)',textthree)

<re.Match object; span=(25, 36), match='caterpillar'>

Timing Python code

In [47]:
def func_one(n):
    """Return the strings '0', '1', ..., str(n - 1) as a list.

    Built with a list comprehension; this is the variant compared against
    the map()-based func_two in the timing cells.
    """
    return [str(value) for value in range(n)]

In [48]:
func_one(10)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [49]:
def func_two(n):
    """Return the strings '0', '1', ..., str(n - 1) as a list.

    Built by mapping str over range(n); this is the variant compared against
    the list-comprehension func_one in the timing cells.
    """
    as_strings = map(str, range(n))
    return list(as_strings)

In [50]:
func_two(10)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [51]:
import time

In [52]:
# Manual timing of func_one.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted.

# clock reading before the call
start_time = time.perf_counter()
# run code
res = func_one(1000000)
# clock reading after the call
end_time = time.perf_counter()
# elapsed time (in seconds) is the difference
elapsed_time = end_time - start_time

print(elapsed_time)

0.3684520721435547


In [53]:
# Manual timing of func_two.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted.

# clock reading before the call
start_time = time.perf_counter()
# run code
res = func_two(1000000)
# clock reading after the call
end_time = time.perf_counter()
# elapsed time (in seconds) is the difference
elapsed_time = end_time - start_time

print(elapsed_time)

0.2642960548400879


In [54]:
# timeit module: runs a statement many times and returns the total seconds.

import timeit

# Statement under test -- executed on every iteration.
stmt = '\nfunc_one(100) \n'

# Setup code -- executed once, before the timed iterations begin.
setup = (
    '\n'
    'def func_one(n):\n'
    '    return [str(num) for num in range(n)]\n'
)

timeit.timeit(stmt=stmt, setup=setup, number=100000)

2.9783039000003555

In [55]:
# Statement under test -- executed on every iteration.
stmt2 = '\nfunc_two(100) \n'

# Setup code -- executed once, before the timed iterations begin.
setup2 = (
    '\n'
    'def func_two(n):\n'
    '    return list(map(str,range(n)))\n'
)

timeit.timeit(stmt=stmt2, setup=setup2, number=100000)

2.2201063999996222

In [56]:
%%timeit
func_one(100)

27.3 µs ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [57]:
%%timeit
func_two(100)

22.2 µs ± 710 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


Zipping and unzipping files

In [58]:
# Create (or overwrite) the first sample file. The with-block closes the
# handle even if write() raises, which a manual open()/close() pair does not.
with open('fileone.txt','w+') as f:
    f.write('one file')

In [59]:
# Create (or overwrite) the second sample file. The with-block closes the
# handle even if write() raises, which a manual open()/close() pair does not.
with open('filetwo.txt','w+') as f:
    f.write('second file')

In [60]:
import zipfile

In [61]:
comp_file = zipfile.ZipFile('comp_file.zip','w')

In [62]:
comp_file.write('fileone.txt',compress_type = zipfile.ZIP_DEFLATED)

In [63]:
comp_file.write('filetwo.txt',compress_type = zipfile.ZIP_DEFLATED)

In [64]:
comp_file.close()

In [65]:
zip_obj = zipfile.ZipFile('comp_file.zip','r')

In [66]:
# Unpack every member of the archive into the 'extracted_content' directory
# (relative to the current working directory), then release the archive
# handle so the zip file is not left open for the rest of the session.
zip_obj.extractall('extracted_content')
zip_obj.close()

In [67]:
pwd

'C:\\Users\\marth'

In [68]:
import shutil

In [69]:
# Directory to archive. Use the same relative path that extractall() wrote to
# rather than a hard-coded, user-specific absolute path, so the notebook runs
# from any working directory on any machine.
dir_to_zip = 'extracted_content'

In [70]:
output_file = 'example'

In [71]:
shutil.make_archive(output_file,'zip',dir_to_zip)

'C:\\Users\\marth\\example.zip'

In [72]:
shutil.unpack_archive('example.zip','final_unzip','zip')

puzzle:::