In [None]:
"""
Regular expressions (or regexes in shorthand) are essentially a language of their own and are not unique to Python. 
What they do is allow for complicated searches through text according to various criteria. 
If you're looking at a large document of text it's easy enough to search for the word "Shiv Nadar". 
But what if I want to search for a pattern rather than a particular word, for example xxx-xxx-xxxx where 
I want all the x's to be digits? This would be a great way to find a phone number, but I'd have to do a 
lot of Cmd+F (Ctrl+F) searches if I searched for all possibilities of 10-digit phone numbers.

Regexes allow us to construct a generic text pattern that will then be matched through the entire body of text. 
There is a specific language that is used to build a regex and this language is both extremely powerful 
and complicated.

You have the ability to construct very complicated and detailed regular expressions. 
However, as with any extremely powerful tool, regular expressions can be difficult to debug.
It is easy to construct a regular expression that does far more (or less) than you expect and have 
it generate incorrect answers. Remember that the best way to build complex 
regular expressions is to test, test, and test some more.
"""

In [None]:
"""Regular expressions in Python are implemented in the re package. 
There are a few basic functions in the package that we will use:

re.match() : Determine if the RE matches at the beginning of the string.
re.search() : Scan through a string, looking for any location where this RE matches.
re.findall() : Find all substrings where the RE matches, and return them as a list.
re.finditer() : Find all substrings where the RE matches, and return them as an iterator object.
Now, let's go over an example so this is less abstract. 
We'll start with something easy - making a direct match to an explicit string (my name)
using all of the different re methods
"""

In [1]:
import re

In [29]:
# One sample sentence, searched four ways for the literal text 'ABD'.
text_sample = "Hi my ABD  name ABD is ABD!"

# match    -> only tries position 0, so it finds nothing here
# search   -> first hit anywhere in the string
# findall  -> list of every matched substring
# finditer -> iterator of Match objects (its repr shows a memory address)
for scan in (re.match, re.search, re.findall, re.finditer):
    print(scan('ABD', text_sample))

None
<re.Match object; span=(6, 9), match='ABD'>
['ABD', 'ABD', 'ABD']
<callable_iterator object at 0x7f043c25d4b0>


In [31]:
# Walk the iterator: every item is a Match carrying the matched text and its
# (start, end) offsets in text_sample.
for hit in re.finditer('ABD', text_sample):
    print(hit.group(0), hit.span())

ABD (6, 9)
ABD (16, 19)
ABD (23, 26)


In [30]:
# re.search returns a single Match for the first occurrence; show each of its
# accessors on its own line: start offset, end offset, (start, end), text.
search_results = re.search('ABD', text_sample)
print(
    search_results.start(),
    search_results.end(),
    search_results.span(),
    search_results.group(),
    sep='\n',
)

6
9
(6, 9)
ABD


In [None]:
"""re.finditer locates matches the same way re.search does, but instead of stopping at the 
first match it yields a Match object for every match, wrapped in an iterator. 
(Reminder question: why would an iterator be used?)
"""

In [32]:
# Second sample sentence pair; the name appears once in each sentence.
text_sample_3 = "ABD has enrolled in SNU. ABD opted for DOM305."

# findall gathers every occurrence into a list of strings
occurrences = re.findall('ABD', text_sample_3)
print(occurrences)

['ABD', 'ABD']


In [33]:
# Same search as findall, but through finditer so the offsets are available too.
for hit in re.finditer('ABD', text_sample_3):
    print(hit.group(0), hit.span())


ABD (0, 3)
ABD (25, 28)


In [36]:
# Program to extract numbers from a string
string = 'hello 12 hi 89. Howdy 34. This year is 2022'

# \d+ matches one or more consecutive digits.
# The r'' (raw string) prefix keeps the backslash literal; a plain '\d' is an
# invalid string escape that newer Python versions warn about.
pattern = r'\d+'

# findall returns every matched run of digits as a list of strings
result = re.findall(pattern, string)
print(result)

# finditer yields Match objects, which also carry the (start, end) offsets
for text in re.finditer(pattern, string):
    print(text.group(), text.span())

['12', '89', '34', '2022']
12 (6, 8)
89 (12, 14)
34 (22, 24)
2022 (39, 43)


In [37]:
"""
re.split method splits the string where there is a match and returns a 
list of strings where the splits have occurred.
"""
string = 'Twelve:12 Eighty nine:89.'
string2 = 'Twelve is 12 and Eighty nine is 89.'
# Raw string so the backslash in \d reaches the regex engine untouched
# (a plain '\d' is an invalid string escape that newer Python versions warn about).
pattern = r'\d+'

# Each run of digits acts as a separator; the pieces between them are returned.
result = re.split(pattern, string2)
print(result)
# Use case: given a text that contains numbers, you may be asked to plot a
# "wordcloud" without the numbers. Split the whole text on the numbers with
# re.split(), join the resulting list of strings, and visualize the joined
# text as a wordcloud.

result2 = re.split(pattern, string)
print(result2)

['Twelve is ', ' and Eighty nine is ', '.']
['Twelve:', ' Eighty nine:', '.']


In [40]:
# Program to remove all whitespaces

# Sample text containing spaces and one embedded newline (\n).
# Written as a single literal so the exact value is visible at a glance.
string = 'abc 12de 23 \n f45 6'
print(string)

# \s+ matches any run of whitespace characters (the sample has spaces and a newline).
# Raw string so the backslash reaches the regex engine untouched; a plain '\s'
# is an invalid string escape that newer Python versions warn about.
pattern = r'\s+'

# empty string: every whitespace run is replaced by nothing, i.e. deleted
replace = ''

new_string = re.sub(pattern, replace, string)
print(new_string)

# re.sub works just as well with a literal pattern: swap every 'ABD' for 'Shubham'
new_string2 = re.sub("ABD", "Shubham", text_sample_3)
print(new_string2)

abc 12de 23 
 f45 6
abc12de23f456
Shubham has enrolled in SNU. Shubham opted for DOM305.


In [41]:
string = "Python is fun"

# Look for 'Julia' anywhere in the string. re.search scans the whole string and
# returns a Match object for the first hit, or None when there is no hit.
match = re.search('Julia', string)

print(match)
if match:
    print("pattern found inside the string")
else:
    print("pattern not found")

# x is a dummy (indicator) variable: 1 if the search string is found, 0 if not
x = 1 if match else 0
print(x)

None
pattern not found
0


In [57]:
# Match.group() hands back the exact piece of text that the pattern matched.

string = 'that is 1111 number'

# raw string (r prefix): exactly three digits in a row
pattern = r'\d{3}'

# re.search gives a Match for the first hit, or None when nothing matches,
# so the fallback message covers the None case.
match = re.search(pattern, string)
print(match.group() if match else "pattern not found")

111


In [58]:
# Collect every hit of the pattern from the previous cell as a list of strings.
match = re.compile(pattern).findall(string)
print(match)


['111']


In [79]:
string = '39801 356, 2102 1111'

# Four digits, one space, then three digits.
# There are no anchors or word boundaries, so a hit may sit inside a longer
# run of digits (e.g. '9801 356' inside '39801 356').
pattern = r'\d{4} \d{3}'

# search -> Match object for the first hit only
match_1 = re.search(pattern, string)
print(match_1)

# findall -> list with the text of every hit
match_2 = re.findall(pattern, string)
print(match_2)

# finditer -> iterator of Match objects (text plus offsets)
match_3 = re.finditer(pattern, string)
for text in match_3:
    print(text.group(), text.span())

<re.Match object; span=(1, 9), match='9801 356'>
['9801 356', '2102 111']
9801 356 (1, 9)
2102 111 (11, 19)


In [84]:
# Positive Integers
# ^ and $ anchor the pattern to the start and end of the string, so the whole
# string must consist of digits only.
# Raw string: a plain '\d' is an invalid string escape that newer Python
# versions warn about.
pattern = r'^\d+$'
str_true = ('123', '1', )
str_false = ('abc', '1.1', )

# expected matches are printed first, then the expected non-matches (None)
for t in str_true + str_false:
    print(re.match(pattern, t))


<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(0, 1), match='1'>
None
None


In [88]:
# Mixed inputs: digit strings, a decimal, a word, and an actual int.
str_val = ('124','5.5','6','djf',54)
# Raw string: a plain '\d' is an invalid string escape that newer Python
# versions warn about.
pattern = r'^\d+$'
for t in str_val:
    # str(t) because re.match needs a string and the tuple also holds the int 54
    print(re.match(pattern, str(t)))

<re.Match object; span=(0, 3), match='124'>
None
<re.Match object; span=(0, 1), match='6'>
None
<re.Match object; span=(0, 2), match='54'>


In [93]:
# Negative Integers
# A mandatory leading '-' followed by digits only, anchored at both ends.
# Raw string: a plain '\d' is an invalid string escape that newer Python
# versions warn about.
pattern = r'^-\d+$'
str_true = ('-123', '-1', )
str_false = ('123', '-abc', '-1.1', )

for t in str_true:
    # match once and reuse the Match object for both prints
    hit = re.match(pattern, t)
    print(hit)
    print(hit.span())

for t in str_false:
    print(re.match(pattern, t))

<re.Match object; span=(0, 4), match='-123'>
(0, 4)
<re.Match object; span=(0, 2), match='-1'>
(0, 2)
None
None
None


In [97]:
# positive numbers
# \d*      optional digits before the decimal point
# \.{0,1}  the decimal point itself, 0 or 1 times
# \d+      at least one digit at the end
# Raw string: plain '\d' and '\.' are invalid string escapes that newer Python
# versions warn about.
pattern = r'^\d*\.{0,1}\d+$'

str_true = ('1', '123', '1.234', )
str_false = ('-abc', '-123', '-123.0')

# expected matches are printed first, then the expected non-matches (None)
for t in str_true + str_false:
    print(re.match(pattern, t))


<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(0, 5), match='1.234'>
None
None
None


In [100]:
# all integers
# -{0,1} makes the leading minus sign optional; the rest must be digits only.
# Raw string: a plain '\d' is an invalid string escape that newer Python
# versions warn about.
pattern = r'^-{0,1}\d+$'
str_true = ('-123', '-1', '1', '123',)
str_false = ('123.0', '-abc', )

for t in str_true:
    # match once and reuse the Match object
    hit = re.match(pattern, t)
    print(hit.span(), hit.group())

for t in str_false:
    print(re.match(pattern, t))

# '-11' is a well-formed negative integer, so the pattern matches it. It is
# shown on its own rather than listed among the expected non-matches.
print(re.match(pattern, '-11'))

(0, 4) -123
(0, 2) -1
(0, 1) 1
(0, 3) 123
None
None
<re.Match object; span=(0, 3), match='-11'>


In [104]:
# negative numbers
# Mandatory leading '-', optional digits, optional decimal point, then at
# least one digit (so '-.56' is accepted).
# Raw string: plain '\d' and '\.' are invalid string escapes that newer Python
# versions warn about.
pattern = r'^-\d*\.{0,1}\d+$'
str_true = ('-1', '-123', '-123.0', '-.56' )
str_false = ('-abc', '1', '123', '1.234', )

# expected matches are printed first, then the expected non-matches (None)
for t in str_true + str_false:
    print(re.match(pattern, t))

<re.Match object; span=(0, 2), match='-1'>
<re.Match object; span=(0, 4), match='-123'>
<re.Match object; span=(0, 6), match='-123.0'>
<re.Match object; span=(0, 4), match='-.56'>
None
None
None
None


In [107]:
# all numbers
# Raw string: plain '\d' and '\.' are invalid string escapes that newer Python
# versions warn about.
pattern = r'^-{0,1}\d*\.{0,1}\d+$'
# what does the -{0,1} mean?
# the -{0,1} means that the - sign can appear 0 or 1 times.

str_true = ('1', '123', '1.234', '-123', '-123.0')
str_false2 = ('-abc',)

# expected matches are printed first, then the expected non-match (None)
for t in str_true + str_false2:
    print(re.match(pattern, t))




<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(0, 5), match='1.234'>
<re.Match object; span=(0, 4), match='-123'>
<re.Match object; span=(0, 6), match='-123.0'>
None
