# Regular Expressions

In [1]:
import re

# `.` matches any character except a newline and `*` allows zero or more
# repetitions, so the pattern below consumes this whole one-line sentence.
text = "The film Titanic was released in 1998"
result = re.compile(r".*").match(text)
type(result)  # re.Match when the pattern matched, NoneType otherwise

re.Match

In [2]:
result[0]  # index 0 is the entire matched text, equivalent to result.group(0)

'The film Titanic was released in 1998'

In [3]:
# To require a string of length at least 1, use `.+` instead of `.*`.
# An empty string has no character to consume, so the match fails and
# re.match returns None.
text = ""
result = re.match(pattern=r".+", string=text)
type(result)

NoneType

In [4]:
# re.match only tries the pattern at the very beginning of the string.
# `[a-zA-Z]+` matches a run of ASCII letters. The upper-case range must end in
# `Z`: writing `A-z` would also accept `[`, `\`, `]`, `^`, `_` and the backtick,
# because those characters sit between `Z` and `a` in ASCII.
text = "The film Titanic was released in 1998"
result = re.match(r"[a-zA-Z]+", text)
# The space after "The" is not a letter, so matching stops there and only
# "The" (the leading run of letters) is returned.
result.group(0)

'The'

In [5]:
# The sentence starts with digits, and re.match only looks at the start of the
# string, so a letters-only pattern finds nothing and None is returned.
# Note the range is `A-Z` (not `A-z`, which would also accept `[ \ ] ^ _` and
# the backtick).
text = "1998 was the year when the film titanic was released"
letters = re.compile(r"[a-zA-Z]+")
result = letters.match(text)
type(result)

NoneType

##### search Function

In [6]:
# re.search is similar to re.match, but instead of trying the pattern only at
# the start of the string it scans the whole string and returns the FIRST
# location where the pattern matches. It therefore succeeds even when the
# string does not start with a letter, as long as a letter appears somewhere.
# The character class is `[a-zA-Z]`; `A-z` would also accept `[ \ ] ^ _` and
# the backtick.
text = "1998 was the year when the film titanic was released"
result = re.search(r"[a-zA-Z]+", text)
print(result.group(0))

was


In [7]:
# To check whether a string starts with a specific word, put a caret (^) in
# front of the word: the caret anchors the pattern to the start of the string,
# so re.search can only succeed there.
text = "1998 was the year when the film titanic was released"
print("Match found" if re.search(r"^1998", text) else "Match not found")

Match found


In [8]:
# To check whether a string ends with a specific word, follow the word with a
# dollar sign ($), which anchors the pattern to the end of the string.
# Here "1998" is at the start, not the end, so the search fails.
text = "1998 was the year when the film titanic was released"
print("Match found" if re.search(r"1998$", text) else "Match not found")

Match not found


##### Substitute Function

In [9]:
# So far the regex was only used to test whether a pattern occurs in a string.
# The sub function goes one step further: it replaces every occurrence of the
# pattern with the given replacement text and returns the new string.
text = "The film Pulp Fiction was released in year 1994"
result = re.compile(r"Pulp Fiction").sub("Forrest Gump", text)

In [10]:
# Show the substituted sentence that the preceding cell stored in `result`.
print(result)

The film Forrest Gump was released in year 1994


In [11]:
# Mask every lowercase ASCII letter with "X"; capitals, digits and spaces are
# left untouched.
text = "The film Pulp Fiction was released in year 1994"
result = re.compile(r"[a-z]").sub("X", text)
print(result)

TXX XXXX PXXX FXXXXXX XXX XXXXXXXX XX XXXX 1994


In [12]:
# Two equivalent ways to mask letters of either case:
#   1. list both ranges explicitly in the character class, `[a-zA-Z]`;
#   2. keep `[a-z]` and pass flags=re.I (case-insensitive matching).
text = "The film Pulp Fiction was released in year 1994"
result1 = re.compile(r"[a-zA-Z]").sub("X", text)
result2 = re.compile(r"[a-z]", flags=re.I).sub("X", text)
print(result1, " ", result2)

XXX XXXX XXXX XXXXXXX XXX XXXXXXXX XX XXXX 1994   XXX XXXX XXXX XXXXXXX XXX XXXXXXXX XX XXXX 1994


In [13]:
# The regex for a single digit is `\d`. Replacing every digit with an empty
# string removes the digits from the text.
# (This explanation is a `#` comment rather than a bare '''string''': a
# backslash-d inside a non-raw string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.)
text = "The film Pulp Fiction was released in year 1994"
result = re.sub(r"\d", "", text)
print(result)

The film Pulp Fiction was released in year 


In [14]:
# Strip every letter (re.I makes `[a-z]` cover upper case too); only the
# spaces and the digits survive.
text = "The film Pulp Fiction was released in year 1994"
result = re.sub(pattern=r"[a-z]", repl="", string=text, flags=re.I)
print(result)

        1994


In [15]:
# `\w` matches a single "word" character: a letter, a digit or an underscore.
# Replacing every word character with an empty string keeps only punctuation,
# symbols and whitespace.
# (Written as `#` comments because a backslash-w inside a non-raw '''string'''
# is an invalid escape sequence. No re.I flag is needed: `\w` already covers
# both upper and lower case.)
text = "The film, '@Pulp Fiction' was ? released in % $ year 1994."
result = re.sub(r"\w", "", text)
print(result)

 , '@ '  ?   % $  .


In [16]:
# Collect every maximal run of word characters (letters, digits, underscore);
# punctuation and spaces act as separators and are dropped.
text = "The film, '@Pulp Fiction' was ? released in % $ year 1994."
result = re.compile(r"[\w]+").findall(text)
print(result)

['The', 'film', 'Pulp', 'Fiction', 'was', 'released', 'in', 'year', '1994']


In [17]:
# `\W` (capital W) is the complement of `\w`: it matches any character that is
# NOT a letter, digit or underscore. Each such character is replaced here by
# a single space, so the words stay separated.
text = "The film, '@Pulp Fiction' was ? released in % $ year 1994."
result = re.compile(r"\W", flags=re.I).sub(" ", text)
print(result)

The film    Pulp Fiction  was   released in     year 1994 


In [18]:
# Grouping several characters in one pattern: a character class `[...]`
# matches any ONE of the characters listed inside it, so a single sub call
# deletes all of the listed punctuation marks and the underscore.
text = "The film, '@Pulp Fiction' was ? released _ in % $ year 1994."
result = re.compile(r"[,@\'?\.$%_]", flags=re.I).sub("", text)
print(result)

The film Pulp Fiction was  released  in   year 1994


## Removing Multiple Spaces

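Sometimes, multiple spaces appear between words as a result of removing words or punctuation. For instance, in the output of the last example, there are multiple spaces between "in" and "year". These can be collapsed using the `\s` pattern, which matches a single whitespace character (space, tab, newline, etc.); `\s+` matches a run of one or more whitespace characters, which is then replaced by a single space.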

In [19]:
# Collapse every run of whitespace into exactly one space.
text = "The film Pulp Fiction was  released  in   year 1994"
result = re.compile(r"\s+", flags=re.I).sub(" ", text)
print(result)

The film Pulp Fiction was released in year 1994


In [20]:
# `^` anchors the whitespace run to the beginning of the string, so only the
# leading spaces are removed.
text = "         The film Pulp Fiction was released in year 1994"
result = re.compile(r"^\s+").sub("", text)
print(result)

The film Pulp Fiction was released in year 1994


In [21]:
# `$` anchors the whitespace run to the end of the string, so only the
# trailing spaces are removed.
text = "The film Pulp Fiction was released in year 1994      "
result = re.sub(pattern=r"\s+$", repl="", string=text)
print(result)

The film Pulp Fiction was released in year 1994


In [22]:
# Remove stray single letters that are surrounded by whitespace.
# The pattern matches one OR MORE "whitespace + single letter" groups followed
# by whitespace, and replaces the whole run with one space. Matching the run
# as a unit matters: with the simpler `\s+[a-zA-Z]\s+`, two adjacent stray
# letters share the whitespace between them, the first match consumes it, and
# every second letter is left behind ("x a b c y" -> "x b y").
text = "The film Pulp Fiction     s was b released in year 1994"
single_letters = re.compile(r"(?:\s+[a-zA-Z])+\s+")
result = single_letters.sub(" ", text)
print(result)

The film Pulp Fiction was released in year 1994


#### Split Function

In [23]:
# Split the sentence on runs of whitespace. Because the text ends with
# whitespace, the final separator leaves an empty string as the last element.
text = "The film      Pulp   Fiction was released in year 1994      "
result = re.compile(r"\s+").split(text)
print(result)

['The', 'film', 'Pulp', 'Fiction', 'was', 'released', 'in', 'year', '1994', '']


In [24]:
# Split on commas; the space that follows each comma stays attached to the
# start of the next piece.
text = "The film, Pulp Fiction, was released in year 1994"
result = re.compile(r"\,").split(text)
print(result)

['The film', ' Pulp Fiction', ' was released in year 1994']


In [25]:
# findall returns a list of all non-overlapping occurrences of the pattern,
# in the order they appear. Here `\d+` picks out each run of digits.
text = "I want to buy a mobile between 200 and 400 euros"
result = re.compile(r"\d+").findall(text)
print(result)

['200', '400']
