In [1]:
#Regular expressions (regex) are implemented in Python by the re module

In [2]:
#regex pattern
import re
regex=re.compile(r'\s+')

#'\s' denotes a single whitespace character (space, tab, newline, ...).
#Adding a '+' matches one or more consecutive whitespace characters.
#The r'' prefix makes the pattern a raw string, so Python passes the backslash through to the regex engine instead of treating '\s' as an (invalid) string escape.
#The line above compiles a regular expression pattern that matches a run of one or more whitespace characters.


In [3]:
#split a string separated by a regex

import re
text= """101 COM   Computer Science
205 MAT   Mathematics
189 ENG    English"""

#[course number] [course code] [course name]
#The spacing between the words is not equal

#split the text around 1 or more whitespace characters
#(raw string, so that '\s' reaches the regex engine unchanged)
re.split(r'\s+',text)

#or, equivalently, call split() on the pattern object compiled earlier as `regex`
regex.split(text)


['101',
 'COM',
 'Computer',
 'Science',
 '205',
 'MAT',
 'Mathematics',
 '189',
 'ENG',
 'English']

In [4]:
#Finding pattern matches using findall

#find all numbers within the text
print(text)


101 COM   Computer Science
205 MAT   Mathematics
189 ENG    English


In [5]:
#'\d+' matches a run of one or more digits; the raw string keeps the backslash literal
regex_num=re.compile(r'\d+')
regex_num.findall(text)


['101', '205', '189']

In [6]:
#'\d' is the regular expression for a single digit. Adding a '+' at the end requires one or more digits
#'*', on the other hand, matches zero or more digits, so it also succeeds where there are no digits at all

In [7]:
#re.search() versus re.match()
#regex.search() searches for the pattern anywhere in a given text
#it returns a match object that contains the start and end positions of the first occurrence of the pattern (or None if there is no match)

#regex.match() also returns a match object. The difference is that it requires the pattern to be present at the beginning of the text


In [8]:
text2="""COM  computers
MAT    Mathematics      189"""

#the pattern to search for is simply a number: a run of one or more digits
#(raw string, so that '\d' is not read as a Python string escape)
regex_num=re.compile(r'\d+')
#search() scans the text and returns a match object for the first occurrence
s=regex_num.search(text2)

print('Starting Position: ',s.start())
print('Ending Position ', s.end())
#slicing the text with the reported positions gives back the matched digits
print(text2[s.start() : s.end()])


Starting Position:  39
Ending Position  42
189


In [9]:
print(s.group())


189


In [10]:
#match() only succeeds when the pattern matches at the very beginning of the string;
#text2 starts with 'COM', not with a digit, so the result is None
m=regex_num.match(text2)
print(m)

None


In [11]:
#substitute one text with another: regex.sub()

#'\t' in this ordinary (non-raw) string literal is a tab character,
#so the code and the name on each line are separated by space, tab, space
text="""101 COM \t Computers
205 MAT \t Mathematics
189 ENG \t English"""

print(text)


101 COM 	 Computers
205 MAT 	 Mathematics
189 ENG 	 English


In [12]:
#even out the extra spaces and put all the words in a single line
#(newlines are whitespace too, so they are collapsed into a single space as well)
regex=re.compile(r'\s+')
print(regex.sub(' ', text))


101 COM Computers 205 MAT Mathematics 189 ENG English


In [13]:
#or, without compiling the pattern first (raw string keeps '\s' intact for the regex engine)

print(re.sub(r'\s+', ' ', text))

101 COM Computers 205 MAT Mathematics 189 ENG English


In [14]:
#suppose we want to get rid of all the extra spaces but keep each course entry on its own line
#this can be done with a negative lookahead (?!\n). It checks whether the next character is a newline and, if so, refuses to match there, so newlines are excluded from the pattern

#get rid of all the extra whitespace except newlines
#the lookahead is applied to every character, (?:(?!\n)\s)+, so a run that starts
#with a space or tab cannot go on to swallow a newline that follows it
regex=re.compile(r'((?:(?!\n)\s)+)')
print(regex.sub(' ', text))


101 COM Computers
205 MAT Mathematics
189 ENG English


In [15]:
#regex groups- lets you extract the desired match objects as individual items 

In [16]:
#Goal: extract the course number, code and name as separate items, first without regex groups

text="""101 COM Computers
205 MAT Mathematics
189 ENG English"""

#1.extract all course numbers: '[0-9]+' matches each run of one or more digits
re.findall('[0-9]+', text)


['101', '205', '189']

In [17]:
#2.extract all course codes: exactly three consecutive uppercase letters
re.findall('[A-Z]{3}', text)

['COM', 'MAT', 'ENG']

In [18]:
#3.extract all course names: runs of four or more letters, which skips the three-letter course codes
re.findall('[A-Za-z]{4,}', text)

['Computers', 'Mathematics', 'English']

In [19]:
#extracting with regex groups-

#define the course pattern: one capture group each for the number, the 3-letter code and the name
#(raw string, so that '\s' is passed to the regex engine exactly as written)
course_pattern=r'([0-9]+)\s*([A-Z]{3})\s*([A-Za-z]{4,})'
re.findall(course_pattern, text)


[('101', 'COM', 'Computers'),
 ('205', 'MAT', 'Mathematics'),
 ('189', 'ENG', 'English')]

In [20]:
#greedy matching in regex- the default behaviour of regular expressions is to be greedy. A quantifier tries to match as much text as possible while still satisfying the pattern, even when a shorter match would have been sufficient

text="<body> Regex Greedy Matching Example </body> "
re.findall('<.*>', text)


['<body> Regex Greedy Matching Example </body>']

In [21]:
#it extracted the whole string instead of matching till the first occurrence of '>' (the end of the first body tag). this is the default greedy behaviour of regex
#lazy matching takes as little as possible. This can be done by adding '?' right after the quantifier ('.*' becomes '.*?')

re.findall('<.*?>', text)


['<body>', '</body>']

In [22]:
re.search('<.*?>', text).group()

'<body>'

In [23]:
#regex examples

text="what is your name?"
#'[^\s]' is a negated character class: each match is exactly one character that is not whitespace
print(re.findall(r'[^\s]', text))


['w', 'h', 'a', 't', 'i', 's', 'y', 'o', 'u', 'r', 'n', 'a', 'm', 'e', '?']


In [24]:
#inside [...], only the first '^' negates the class; the additional '^' characters are literals.
#so this still matches ONE character at a time - any character that is neither '^' nor whitespace - not three.
#three consecutive non-whitespace characters would be written r'[^\s]{3}'
print(re.findall(r'[^^^\s]', text))

['w', 'h', 'a', 't', 'i', 's', 'y', 'o', 'u', 'r', 'n', 'a', 'm', 'e', '?']


In [25]:
#match one or more occurrences with '+'. the r prefix makes the pattern a raw string, so backslashes are passed to the regex engine unchanged
print(re.findall(r'Co+l','So Cooool'))

['Cooool']


In [26]:
#match word boundaries- \b is commonly used to detect and match the beginning or end of a word
#for example, the regex \btoy matches 'toy' in 'toy cat' but not in 'tolstoy' (to match the 'toy' that ends 'tolstoy', use toy\b)


In [27]:
re.findall(r'\btoy','play toy broke toys')

['toy', 'toy']

In [28]:
re.findall(r'\Btoy\b', 'tolstoy toys')

['toy']