In [2]:
import re 

# Lesson from https://regexone.com/

"""
Syntax: 

abc… Letters
123… Digits
\d Any Digit
\D Any Non-digit character
. Any Character
\. Period
[abc] Only a, b, or c
[^abc] Not a, b, nor c
[a-z] Characters a to z
[0-9] Numbers 0 to 9
\w Any Alphanumeric character
\W Any Non-alphanumeric character
{m} m Repetitions
{m,n} m to n Repetitions
* Zero or more repetitions
+ One or more repetitions
? Optional character
\s Any Whitespace
\S Any Non-whitespace character
^…$ Starts and ends
(…) Capture Group
(a(bc)) Capture Sub-group
(.*) Capture all
(abc|def) Matches abc or def
"""


In [31]:
# numbers 0-9

txt = "123abc"
x = re.findall("\d", txt) 
x

['1', '2', '3']

In [32]:
# wild card 

txt = "123abc."
x = re.findall(".", txt)
print(x)
# to escape the dot and look for dots on only use \. 
x = re.findall(".", txt) 

x

['1', '2', '3', 'a', 'b', 'c', '.']


['1', '2', '3', 'a', 'b', 'c', '.']

In [33]:
# specific patterns. put them in brackets 
# pattern [abc] will only match a single a, b, or c letter and nothing else

txt = "can"
x = re.findall("[cmf]", txt)
txt = "pan"
print(x)
x = re.findall("[cmf]", txt)
print(x)


['c']
[]


In [34]:
# exclude specific characters, use ^
# pattern [^abc] will match any single character except for the letters a, b, or c.
txt = "can"
x = re.findall("[^cmf]", txt)
print(x)
txt = "cmf"
x = re.findall("[^cmf]", txt)
print(x)



['a', 'n']
[]


In [35]:
# Character ranges 
#  For example, the pattern [0-6] will only match any single digit character from zero to six, and nothing else. 
#  [^n-p] will only match any single character except for letters n to p.
# using the dash to indicate a character range
#  alphanumeric \w metacharacter which is equivalent to the character range [A-Za-z0-9_]

txt = "Aan"
x = re.findall("[A-Z]", txt)
print(x)
txt = "cmf"
x = re.findall("[A-Z]", txt)
print(x)

txt = "cmf"
x = re.findall("[^A-Z]", txt) # exlucde uppercase 
print(x)

['A']
[]
['c', 'm', 'f']


In [6]:
# repetition 
#  a{3} will match the a character exactly three times
# specify a range for this repetition such that a{1,3} 
#  [wxy]{5} (five characters, each of which can be a w, x, or y) and .{2,6} (between two and six of any character).


txt = "wazzzup"
x = re.findall("z{3}", txt)
print(x)
txt = "wazzzup"
x = re.findall("[zu]{4}", txt)
print(x)
txt = "wazzzup12411334"
x = re.findall("[\d]{4}", txt)
print(x)

['zzz']
['zzzu']
['1241', '1334']


In [52]:
# Kleene Star
# 0 or more or 1 or more of the character that it follows (it always follows a character or group).
# pattern \d* to match any number of digits
# \d+ which ensures that the input string has at least one digit
# example a+ (one or more a's), [abc]+ (one or more of any a, b, or c character) and 
# .* (zero or more of any character).

txt = "aaaabcc"
x = re.findall("a", txt)[0]
print(x)
x = re.findall("a+", txt)[0]
print(x)
x = re.findall("a+.c+", txt)[0]
print(x)


a
aaaa
aaaabcc


In [62]:
# optional Characters  
# the pattern ab?c will match either the strings "abc" or "ac" because the b is considered optional.
# escape it using a slash \? 

txt = "14 file found?"
x = re.findall("[0-9]+\sfiles?", txt)[0]
print(x)

14 file


In [68]:
# whitespace
# space (␣), the tab (\t), the new line (\n) and the carriage return (\r) 
# whitespace special character \s will match any of the specific whitespaces above 

txt = "14 file found?"
x = re.findall("[0-9]+\sfiles?", txt)[0]
print(x)

txt = "14file found?"
x = re.findall("[0-9]+files?", txt)[0]
print(x)

txt = "3.           abc"
x = re.findall("\s+", txt)[0]
len(x)
txt = "3.           abc"
x = re.findall("\s", txt)[0]
x

14 file
14file


' '

In [74]:
# Starting and ending 
# describes both the start and the end of the line using the special ^ (hat) and $ (dollar sign) metacharacter
# ^success to match only a line that begins with the word "success"
# Note that this is different than the hat used inside a set of bracket [^...] for excluding characters, 
# which can be confusing when reading regular expressio
# for start and end of a line 

txt = "Mission"
x = re.findall("^Mission$", txt)[0]
print(x)
txt = "Mission h"
x = re.findall("^Mission$", txt)
print(x)

txt = "Mission h"
x = re.findall("^Mission", txt)[0]
print(x)

Mission
[]
Mission


In [76]:
# Match groups   Lesson 11
# defining groups of characters and capturing them using the special parentheses ( and ) metacharacters
# pattern such as ^(IMG\d+\.png)$ to capture and extract the full filename
# ^(IMG\d+)\.png$ which only captures the part before the period

txt = "file_record_transcript.pdf"
x = re.findall("^(file_.*)\.pdf$", txt)[0]
print(x)
txt = "file_record_transcript.pdf"
x = re.findall("^(file_.*\.pdf)$", txt)[0] # keep extension 
print(x)

file_record_transcript
file_record_transcript.pdf


In [13]:
"""
Nested groups 

 extract filename and the picture number using the same pattern by writing an expression like ^(IMG(\d+))\.png$ 
 (using a nested parenthesis to capture the digits).
"""

txt = "Jan 1987"
x = re.findall("^(\w\w\w\s(\d+))$", txt)
print(x) 

# group 1 = (\w\w\w\s(\d+))
# group 2 = (\d+)

txt = "Jan 1987"
x = re.findall("^(\w\w\w)\s(\d+)$", txt)
print(x) 

txt = "Jan 1987"
x = re.findall("^(\w\w\w\s\d+)$", txt)
print(x) 

txt = "1280x720"
x = re.findall("^(\d+)x(\d+)$", txt)
print(x) 

[('Jan 1987', '1987')]
[('Jan', '1987')]
['Jan 1987']
[('1280', '720')]


In [25]:
"""
Conditionals 
"""

txt = "I love cats"
x = re.findall("^(I\slove\s(cats|dogs))$", txt)
print(x) 

txt = "I love dogs"

x = re.findall("^I\slove\s(cats|dogs)$", txt)
print(x) 

[('I love cats', 'cats')]
['dogs']


In [34]:
"""
Problem 1 matching decimal numbers 

"""

txt = """3.14529\n
-255.34\n
128\n
1.9e10\n
123,340.00\n
720p\n
"""
print(txt.split("\n"))

x = list(map(lambda x : re.findall("^-?\d+,?\.?\d+e?\.?\d+$", x), txt.split("\n")))

print(x)

# their solution 
x = list(map(lambda x : re.findall("^-?\d+(,\d+)*(\.\d+(e\d+)?)?$", x), txt.split("\n")))

print(x)

"""
^-?\d+,?\.?\d+e?\.?\d+$

-?: optional - sign
\d+: 1 or more digits 
,?: optional comma
\.?: optional period
\d+: 1 or more digits 
e?: optional e 
\.?: optional period
\d+: 1 ore more digits


^\d+\d+\d+$

"""

['3.14529', '', '-255.34', '', '128', '', '1.9e10', '', '123,340.00', '', '720p', '', '']
[['3.14529'], [], ['-255.34'], [], ['128'], [], ['1.9e10'], [], ['123,340.00'], [], [], [], []]
[[('', '.14529', '')], [], [('', '.34', '')], [], [('', '', '')], [], [('', '.9e10', 'e10')], [], [(',340', '.00', '')], [], [], [], []]


'\n^-?\\d+,?\\.?\\d+e?\\.?\\d+$\n\n-?: optional - sign\n\\d+: 1 or more digits \n,?: optional comma\n\\.?: optional period\n\\d+: 1 or more digits \ne?: optional e \n\\.?: optional period\n\\d+: 1 ore more digits\n\n\n^\\d+\\d+\\d+$\n\n'

In [38]:
"""
problem 2 matching phone numbers
"""

# ^\(?\d{3}\)?\s?\-?\d{3}\s?-?\d{4}$


txt = "415-555-1234"
x = re.findall("^(\d{1}\s)?\(?(\d{3})\)?", txt)
print(x) 

# solution 1?[\s-]?\(?(\d{3})\)?[\s-]?\d{3}[\s-]?\d{4}


[('', '415')]


In [43]:
"""
matching emails 
((\w+\.?\w+)\+?\.?\w+)@\w+(\.eu)?\.com
"""

txt = "tom.riddle+regexone@hogwarts.com"
x = re.findall("((\w+(\.\w+)?)\+?\.?\w+)@\w+(\.eu)?\.com", txt)
print(x) 
x = re.findall("^(\w+(\.\w+)?)(\+\w+)?@", txt)
print(x)


[('tom.riddle+regexone', 'tom.riddle', '.riddle', '')]
[('tom.riddle', '.riddle', '+regexone')]


In [44]:
"""
html
"""

txt = "<a>This is a link</a>"
x = re.findall("<(a|div).*", txt) # angle braket then a or div then anything after it 
print(x)

['a']


In [46]:
"""
Problem 5: Matching specific filenames 
"""

txt = "updated_img0912.png"
x = re.findall("^(\w+)\.(jpg|png|gif)$", txt) # 1 or more alpha numberic then period then one of the extensions 
print(x)

[('updated_img0912.png', 'updated_img0912', 'png')]


In [48]:
"""
trimming whitespace
"""

txt = "jumps over the lazy dog"
x = re.findall("^\s*(.*)\s*$", txt) # some whitespace, then anything, then some whitespace 
print(x)

['jumps over the lazy dog']


In [49]:
"""
log file searching
"""

txt = "E/( 1553):   at widget.List.fillFrom(ListView.java:709)"
x = re.findall("(\w+)\((\w+.java):([0-9]+)\)$", txt)
print(x)

[('fillFrom', 'ListView.java', '709')]


In [51]:
txt = "https://regexone.com/lesson/introduction#sectio"
x = re.findall("(\w+)://((\w+\.com|\w+-\w+.com|\w+)?):?/?(\d+)?", txt) 
print(x)

# simpler
# match \w+ until :// then match anything not : or / then optionally get port number
x = re.findall("(\w+)://([^:/]+):?/?(\d+)?", txt) 
print(x)

[('https', 'regexone.com', 'regexone.com', '')]
[('https', 'regexone.com', '')]
