# text processing with regular expressions
- information extraction
- information hiding/masking
- text cleaning



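    [0-9] = any single digit from 0 to 9 (shorthand: \d)
    [a-z] = any single lowercase letter from a to z
    [A-Z] = any single uppercase letter from A to Z
    \s = any whitespace character (space, tab, newline)
    {} = curly braces give the repeat count of the preceding element: {n} means exactly n times, {m,n} means between m and n times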

In [1]:
import re

In [2]:
# Sample sentence holding two 10-digit mobile numbers.
data = (
    "my mobile number is 9898785645 "
    "and you mobile number is 9898784545 "
    "thank you."
)

In [3]:
# Exactly ten consecutive ASCII digits, i.e. one mobile number.
pattern = "[0-9]{10}"
# findall returns every non-overlapping match as a list of strings.
re.findall(pattern, string=data)

['9898785645', '9898784545']

In [4]:
# Mask every match of `pattern` with a fixed run of nine asterisks.
# NOTE(review): the mask is 9 characters long while each match is 10 digits — confirm this is intended.
re.sub(pattern, "*" * 9, data)

'my mobile number is ********* and you mobile number is ********* thank you.'

In [5]:
# Delete every match of `pattern` by substituting the empty string.
re.sub(pattern, repl="", string=data)

'my mobile number is  and you mobile number is  thank you.'

In [6]:
# Multi-line sample text: three dash-separated dates (one with a single-digit
# month), one slash-separated date, and three e-mail addresses.
data = """ my birthday is 30-02-2000 and your birthday is 31-04-2002 and his birthday is 12-5-1990 and
his friend's birthday is 15/05/1999 thank you for your email, please resply me back on
the id anshu_pandey@abccompany.com and also keep john.weka@yourcompany.com in cc and you may wanna
include cera@gmail.com as well."""
# Display the raw text, line breaks included.
print(data)

 my birthday is 30-02-2000 and your birthday is 31-04-2002 and his birthday is 12-5-1990 and
his friend's birthday is 15/05/1999 thank you for your email, please resply me back on
the id anshu_pandey@abccompany.com and also keep john.weka@yourcompany.com in cc and you may wanna
include cera@gmail.com as well.


In [7]:
# Two digits, dash, two digits, dash, four digits; single-digit day or
# month values are not matched by this version.
pattern = "-".join(["[0-9]{2}", "[0-9]{2}", "[0-9]{4}"])
re.findall(pattern, data)

['30-02-2000', '31-04-2002']

In [8]:
# Same two-two-four digit date pattern, written with the \d shorthand.
# A raw string is used so the backslashes reach the regex engine untouched;
# "\d" in a normal string literal is an invalid escape sequence and triggers
# a warning on current Python versions.
pattern = r"\d{2}-\d{2}-\d{4}"
re.findall(pattern, data)

['30-02-2000', '31-04-2002']

In [9]:
# {1,2} accepts one or two digits for day and month, so dates without zero
# padding are matched as well.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
pattern = r"\d{1,2}-\d{1,2}-\d{4}"
re.findall(pattern, data)

['30-02-2000', '31-04-2002', '12-5-1990']

In [10]:
# Two alternatives joined with "|": the dash-separated form or the
# slash-separated form. A date mixing both separators is not matched.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
pattern = r"\d{1,2}-\d{1,2}-\d{4}|\d{1,2}/\d{1,2}/\d{4}"
re.findall(pattern, data)

['30-02-2000', '31-04-2002', '12-5-1990', '15/05/1999']

In [11]:
# E-mail-like tokens: one or more letters, digits, "_" or "." on each side
# of a literal "@".
pattern = "@".join(["[0-9a-zA-Z_.]+", "[0-9a-zA-Z._]+"])
re.findall(pattern, data)

['anshu_pandey@abccompany.com', 'john.weka@yourcompany.com', 'cera@gmail.com']

In [23]:
# Date written as "<day> <three-letter month> <year>".
data = "my birthday is 01 JUL 2021 and what is your birthday?"
# One or two digits, whitespace, three letters, whitespace, four digits.
# Raw string: "\d" and "\s" in a normal string literal are invalid escape
# sequences and trigger a warning on current Python versions.
pattern = r"\d{1,2}\s[a-zA-Z]{3}\s\d{4}"
re.findall(pattern, data)

['01 JUL 2021']

# Grouping can be done using (): each parenthesised part of the pattern is captured as a numbered group

In [14]:
# Sample text and the ungrouped date pattern used as the starting point.
data = "my birthday is 01 JUL 2021 and what is your birthday?"
# Raw string: "\d" and "\s" in a normal string literal are invalid escape
# sequences and trigger a warning on current Python versions.
pattern = r"\d{1,2}\s[a-zA-Z]{3}\s\d{4}"

In [15]:
# search returns a Match object for the first occurrence (or None if the
# pattern is not found anywhere in the text).
re.search(pattern, string=data)

<re.Match object; span=(15, 26), match='01 JUL 2021'>

In [16]:
# Same date pattern with day, month and year each wrapped in () so they are
# captured as groups 1, 2 and 3.
# Raw string: "\d" and "\s" in a normal string literal are invalid escape
# sequences and trigger a warning on current Python versions.
pattern = r"(\d{1,2})\s([a-zA-Z]{3})\s(\d{4})"
match = re.search(pattern, data)
# group() with no argument returns the whole matched text.
print(match.group())

01 JUL 2021


In [17]:
# Print captured groups 1, 2 and 3, one per line.
print(*match.group(1, 2, 3), sep="\n")

01
JUL
2021


In [18]:
# Rewrite each matched date as <day>-07-<year>.
# \1 and \3 are backreferences to the day and year groups of the match being
# replaced, so every date keeps its own values instead of reusing the text
# captured by the earlier re.search call.
# NOTE(review): the month is still hard-coded to "07", which is only right
# when the matched month is JUL.
re.sub(pattern, r"\1-07-\3", data)

'my birthday is 01-07-2021 and what is your birthday?'

In [27]:
# Split an e-mail address into the parts before and after "@" with groups.
data = (
    "my email id is anshu.pandey@abccompany.com "
    "what is yours?"
)
pattern = "@".join(["([0-9a-zA-Z_.]+)", "([0-9a-zA-Z._]+)"])
match = re.search(pattern, data)
# Group 0 is the whole match; groups 1 and 2 are the two captured parts.
print(*match.group(0, 1, 2), sep="\n")

anshu.pandey@abccompany.com
anshu.pandey
abccompany.com


In [28]:
# Same e-mail pattern with named groups: (?P<name>...) lets a group be read
# back by name instead of by position.
pattern = "@".join(["(?P<username>[0-9a-zA-Z_.]+)", "(?P<host>[0-9a-zA-Z._]+)"])
match = re.search(pattern, data)
# Whole match first, then the two named groups, one per line.
print(*match.group(0, "username", "host"), sep="\n")

anshu.pandey@abccompany.com
anshu.pandey
abccompany.com
