In [1]:
# Regex

In [2]:
import re

In [3]:
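# match() checks for a match only at the beginning of the string and
# returns a Match object, or None if there is no match.
# search() checks for a match anywhere in the string and likewise returns a
# Match object or None. Both results can be used directly in an if-test.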

text = "This is a good day"

# Report whether "good" occurs anywhere in the text.
print("Wonderful" if re.search("good", text) else "Alas :(")

Wonderful


In [4]:
# Tokenizing is where a string is separated into substrings based on patterns.
# It is used in natural language processing.

# The findall() and split() functions will parse the string for us and 
# return chunks.
text = "Amy works diligently. Amy gets good grades. Our student Amy is successful."

# Cut the text at every occurrence of the name; the cell displays the chunks.
re.compile("Amy").split(text)

['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is successful.']

In [7]:
# List every occurrence of the name in the text.
re.compile("Amy").findall(text)

['Amy', 'Amy', 'Amy']

In [8]:
# Anchors -> specify the start and/or end of the string that you are trying
# to match. The caret character ^ means start and the dollar character $ means end.
# If you put ^ before a pattern, the text must start with that pattern.
# If you put $ after a pattern, the text must end with that pattern.

text = "Amy works diligently. Amy gets good grades. Our student Amy is successful."

# The ^ anchor only lets the pattern match at the very start of the text.
re.search(pattern="^Amy", string=text)


<re.Match object; span=(0, 3), match='Amy'>

In [9]:
# Patterns and Character classes

In [10]:
grades = "AABDDCAAB"
# Collect every "B" in the grade string.
re.compile("B").findall(grades)

['B', 'B']

In [11]:
# If we want to count the number of A's or B's in the string we can't use
# "AB", since that matches only an A followed immediately by a B.
# Instead we put the chars in square brackets.

re.findall(pattern="[AB]", string=grades)

['A', 'A', 'B', 'A', 'A', 'B']

In [12]:
# Square brackets form a character set, and a set may also hold a range of
# characters (ranges follow alphanumeric order).
# Here: an A followed by any character from B to C.

re.compile("[A][B-C]").findall(grades)

['AB', 'AB']

In [13]:
# The pipe operator (|) gives the same result by OR-ing two alternatives.

re.findall(pattern="AB|AC", string=grades)

['AB', 'AB']

In [14]:
# Inside the set operator ( [] ) a leading caret ^ negates the set:
# match every character that is not an A.

re.compile("[^A]").findall(grades)

['B', 'D', 'D', 'C', 'B']

In [15]:
# Quantifiers -> specify the number of times a pattern must be repeated
# in order to match. The most basic quantifier is expressed as e{m,n}
# where e is the expression or character to be matched, m is the min number
# of occurrences of e and n is the max number of times the item could be matched.

re.findall(pattern="A{2,10}", string=grades)


['AA', 'AA']

In [16]:
# * -> used to match 0 or more times
# ? -> used to match 0 or 1 times
# + -> used to match 1 or more times
# \w -> used to match any word character: a letter, a digit or an underscore
# \s -> used to match any whitespace

In [19]:
# Reading a dataset
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (assumes the file is UTF-8 -- TODO confirm).
with open('dataset/ferpa.txt', encoding='utf-8') as file:
    wiki = file.read()

In [20]:
# Show the text that was just read, i.e. the input the patterns below are run against.
print(wiki)

Overview[edit]
FERPA gives parents access to their child's education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student's consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.

Other regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student's personally identifiable information without the student's consent.[2]

Examples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student's grades or behavior,

In [21]:
# Find runs of 1-100 letters that are immediately followed by the literal text "[edit]".
# The upper-case range must be A-Z: "A-z" would also admit the punctuation
# characters that sit between "Z" and "a" ( [ \ ] ^ _ ` ).
# A raw string keeps \[ and \] as regex escapes without invalid-escape warnings.
re.findall(r"[a-zA-Z]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [22]:
# Same search using \w (word characters) instead of an explicit letter range.
# Raw string: \w and \[ are regex escapes, not Python string escapes.
re.findall(r"[\w]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [24]:
# Replace the explicit {1,100} bound with * (zero or more word characters).
# Raw string: \w and \[ are regex escapes, not Python string escapes.
re.findall(r"[\w]*\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [26]:
# Adding a space to the set lets a match span several words, so multi-word
# titles are captured in full.
# Raw string: \w and \[ are regex escapes, not Python string escapes.
re.findall(r"[\w ]*\[edit\]", wiki)

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [29]:
# Print each title without its trailing "[edit]": split on "[" and keep the
# first piece. Raw strings keep the regex escapes free of invalid-escape warnings.
for title in re.findall(r"[\w ]*\[edit\]", wiki):
    print(re.split(r"[\[]", title)[0])

Overview
Access to public records
Student medical records


In [30]:
# Groups.
# To group patterns we use parentheses ()

In [31]:
# With two groups in the pattern, findall() returns one (title, "[edit]") tuple per match.
# Raw string: \w and \[ are regex escapes, not Python string escapes.
re.findall(r"([\w ]*)(\[edit\])", wiki)

[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [32]:
# finditer() returns an iterator of Match objects instead of the matched strings themselves.

# Print the tuple of captured groups for every match.
# Raw string: \w and \[ are regex escapes, not Python string escapes.
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.groups())

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [33]:
# groups() returns a tuple of all captured groups; group(n) returns only the n-th group.

# Print only the first captured group (the title) of every match.
# Raw string: \w and \[ are regex escapes, not Python string escapes.
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.group(1))

Overview
Access to public records
Student medical records


In [34]:
# Labeling or naming groups.

# (?P<name>...) gives a group a name; groupdict() maps each name to its captured text.
# Raw string: \w and \[ are regex escapes, not Python string escapes.
for item in re.finditer(r"(?P<title>[\w ]*)(?P<edit>\[edit\])", wiki):
    print(item.groupdict()['title'])

Overview
Access to public records
Student medical records


In [35]:
# `item` is still bound to the last match produced by the loop in the previous cell.
print(item.groupdict())

{'title': 'Student medical records', 'edit': '[edit]'}


In [36]:
# \d - used for matching any digit.

In [37]:
# Look-ahead and Look-behind

In [38]:

# (?=...) -> look-ahead: the pattern inside must follow the match, but it is
# not consumed and is not included in the output match object.

# Raw string: \w and \[ are regex escapes, not Python string escapes.
# Note: because of the * quantifier the title group may match zero characters,
# so each real title is followed by a second, empty match located right before
# "[edit]". Using + instead of * would drop those empty matches.
for item in re.finditer(r"(?P<title>[\w ]*)(?=\[edit\])", wiki):
    print(item)

<re.Match object; span=(0, 8), match='Overview'>
<re.Match object; span=(8, 8), match=''>
<re.Match object; span=(2715, 2739), match='Access to public records'>
<re.Match object; span=(2739, 2739), match=''>
<re.Match object; span=(3692, 3715), match='Student medical records'>
<re.Match object; span=(3715, 3715), match=''>
