#  Regular Expression

### Python Built-in Module for Regular Expressions
Python has a built-in module for working with regular expressions called `re`. Some commonly used functions from this module are:

- re.match()
- re.search()
- re.findall()

In [8]:
### re.match(pattern, string)
### re.match only tries the pattern at the very start of the string; it
### returns a match object on success and None on failure.

import re

# Match a word at the beginning of a string.
result = re.match(
    'The',
    r'The greatest glory in living lies not in never falling, but in rising every time we fall',
)
print(result)

<re.Match object; span=(0, 3), match='The'>


In [9]:
print(result.group(0))  # group 0 (the default of group()) is the whole matched text

The


In [10]:
# 'living' occurs in the sentence but not at its start, so re.match finds nothing.
result_2 = re.match(
    'living',
    r'The greatest glory in living lies not in never falling, but in rising every time we fall',
)
print(result_2)

None


In [11]:
### re.search(pattern, string)
### Scans the whole string and returns a match object for the first place
### where the pattern occurs (not only at the beginning).

result = re.search(
    'fall',
    r'The greatest glory in living lies not in never falling, but in rising every time we fall',
)
print(result.group(0))

fall


In [12]:
### re.findall(pattern, string)
### Returns every occurrence of the pattern in the string as a list of
### strings. It gives back plain strings rather than match objects, so use
### re.search()/re.match() when you need the position of a match.

result = re.findall(
    'fall',
    r'The greatest glory in living lies not in never falling, but in rising every time we fall',
)
print(result)

['fall', 'fall']


## Special Sequences in Regular Expressions

In [13]:
## \b
## \b matches at a word boundary, so a pattern written next to it must sit
## at the beginning or at the end of a word.

# Named str_ (not str) so the built-in str type is not overwritten.
str_ = r'The greatest glory in living lies not in never falling, but in rising every time we fall'

# Check if there is any word that ends with "est"

x = re.findall(r"est\b", str_)
print(x)

['est']


In [15]:
### \d
### \d matches a single digit character (0-9).

# NOTE(review): this rebinds the built-in name `str`; before renaming it,
# check whether other cells of the notebook still read this variable.
str = "Our solar system is made up of a star, the Sun 8 planets, 146 moons"

# Check if the string contains any digits (numbers from 0-9).
# The pattern is a raw string: "\d" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.

x = re.findall(r"\d", str)
print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['8', '1', '4', '6']
Yes, there is at least one match!


In [17]:
### \d
### \d matches a single digit character (0-9).

str_ = "Our solar system is made up of a star, the Sun 8 planets, 146 moons"

# Check if the string contains any digits (numbers from 0-9).
# The pattern is a raw string: "\d" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.

x = re.findall(r"\d", str_)
print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['8', '1', '4', '6']
Yes, there is at least one match!


In [18]:
# Check if the string contains any digits (numbers from 0-9):
# adding '+' after '\d' matches a run of one or more consecutive digits, so
# each number is returned whole; the run ends at the first non-digit
# character (a space, a comma, a letter, ...).
# The pattern is a raw string so that the backslash reaches the regex engine
# without an invalid-escape warning.
# Uses str_ as assigned by an earlier cell.

x = re.findall(r"\d+", str_)
print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
 

['8', '146']
Yes, there is at least one match!


In [19]:
### \D
### \D matches a single character that is NOT a digit. It is the opposite of \d.

str_ = "Our solar system is made up of a star, the Sun 8 planets, 146 moons"

# Collect every character of the string that is not a digit (0-9).
# The pattern is a raw string: "\D" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.

x = re.findall(r"\D", str_)
print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['O', 'u', 'r', ' ', 's', 'o', 'l', 'a', 'r', ' ', 's', 'y', 's', 't', 'e', 'm', ' ', 'i', 's', ' ', 'm', 'a', 'd', 'e', ' ', 'u', 'p', ' ', 'o', 'f', ' ', 'a', ' ', 's', 't', 'a', 'r', ',', ' ', 't', 'h', 'e', ' ', 'S', 'u', 'n', ' ', ' ', 'p', 'l', 'a', 'n', 'e', 't', 's', ',', ' ', ' ', 'm', 'o', 'o', 'n', 's']
Yes, there is at least one match!


In [20]:
# Extract the runs of consecutive non-digit characters: '+' after '\D' makes
# each match extend until the next digit (or the end of the string).
# Searches str_ (assigned by an earlier cell) rather than the name `str`,
# which would be the built-in type unless it has been overwritten.
# The pattern is a raw string so that the backslash reaches the regex engine
# without an invalid-escape warning.

x = re.findall(r"\D+", str_)
print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['Our solar system is made up of a star, the Sun ', ' planets, ', ' moons']
Yes, there is at least one match!


In [21]:
### \w
### \w matches a single word character: a letter, a digit (0-9) or the
### underscore _ character.

str_ = "Our solar system is made up of a star, the Sun 8 planets, 146 moons"

# '\w+' returns every run of consecutive word characters, i.e. the words and
# numbers of the sentence without spaces or punctuation.
# The pattern is a raw string: "\w" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.

x = re.findall(r"\w+", str_)
print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
    

['Our', 'solar', 'system', 'is', 'made', 'up', 'of', 'a', 'star', 'the', 'Sun', '8', 'planets', '146', 'moons']
Yes, there is at least one match!


In [22]:
### \W
### \W matches a single NON-word character. It is the opposite of \w.

str_ = "Our solar system is made up of a star, the Sun 8 planets, 146 moons"

# Returns a match at every non-word character (anything that is not a
# letter, digit or underscore, such as "!", "?", "," or white-space).
# Searches the str_ defined just above, not the name `str`.
# The pattern is a raw string: "\W" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.

x = re.findall(r"\W", str_)
print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
    

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ',', ' ', ' ', ' ', ' ', ',', ' ', ' ']
Yes, there is at least one match!


# Metacharacters in Regular Expression

In [23]:
## (.) matches any single character (except the newline character)

str_ = "rohan and rohit recently published a research paper!"

# Search for "ro" followed by a fixed number of arbitrary characters:
# every "." in the pattern stands for exactly one character.

x = re.findall("ro.", str_)     # "ro" plus one more character
x2 = re.findall("ro...", str_)  # "ro" plus three more characters

print(x, x2, sep="\n")

['roh', 'roh']
['rohan', 'rohit']


In [24]:
### (^) starts with
### Anchors the pattern to the start of the string, so it only matches when
### the string begins with the given pattern.

str_ = "Data Science"

# An empty result list means the string does not start with 'Data'.
x = re.findall("^Data", str_)

print("Yes, the string starts with 'Data'" if x else "No match")

Yes, the string starts with 'Data'


In [25]:
# Try the same check with a string where 'Data' is not at the start.

str2 = "Big Data"

# Check if the string starts with 'Data'. The ^ anchor ties the pattern to
# the beginning of the string, so 'Data' at the end does not count.

x2 = re.findall("^Data", str2)

if x2:
    # Message spells 'Data' exactly like the pattern being tested.
    print("Yes, the string starts with 'Data'")
else:
    print("No match")
    

No match


In [26]:
### ($) ends with
### Anchors the pattern to the end of the string, so it only matches when
### the string finishes with the given pattern.

str_ = "Data Science"

# An empty result list means the string does not end with 'Science'.
x = re.findall("Science$", str_)

print("Yes, the string ends with 'Science'" if x else "No match")

Yes, the string ends with 'Science'


In [27]:
### (*) matches zero or more occurrences of the pattern to the left of it
str_ = "easy easssy eay ey"

# Find "ea", then any number of "s" characters (possibly none), then "y".
x = re.findall("eas*y", str_)
print(x)

print("Yes, there is at least one match!" if x else "No match")

['easy', 'easssy', 'eay']
Yes, there is at least one match!


In [29]:
### (+) matches one or more occurrences of the pattern to the left of it
### Find "ea", then at least one "s" character, then "y".
### Uses str_ as assigned by an earlier cell.

x = re.findall("eas+y", str_)
print(x)

print("Yes, there is at least one match!" if x else "No match")

['easy', 'easssy']
Yes, there is at least one match!


In [30]:
### (?) matches zero or one occurrence of the pattern to the left of it.
### Here: "ea", then an optional single "s", then "y".
### Uses str_ as assigned by an earlier cell.
x = re.findall("eas?y", str_)
print(x)

print("Yes, there is at least one match!" if x else "No match")

['easy', 'eay']
Yes, there is at least one match!


In [32]:
### (|) either or
str_ = "Our solar system is made up of a star, the Sun 8 planets, 146 moons"

# Check if the string contains either "star" or "moon"; every occurrence of
# either alternative is returned.
x = re.findall("star|moon", str_)
print(x)

print("Yes, there is at least one match!" if x else "No match")

['star', 'moon']
Yes, there is at least one match!


In [33]:
# Same alternation, tried on a string that contains only one of the two words.

str_ = "Our solar system is made up of a millions of star"

# Check if the string contains either "star" or "moon".
x = re.findall("star|moon", str_)
print(x)

print("Yes, there is at least one match!" if x else "No match")

['star']
Yes, there is at least one match!


> End of Program

____