# Regular Expressions (Regex)

In [1]:
import re

## Regex in Customer Support

### Usage of Regex (Regular Expression)

In [2]:
# Sample support-chat messages; each contains one phone number and one email address.
text1 = "codebasics: you ask lot of questions 😠  1235678912, abc@xyz.com"
text2 = "codebasics: here it is: (123)-567-8912, abX_82@xyz.com"
text3 = "codebasics: yes, phone: 1235678912 email: abc@xyz.io"

In [3]:
# Ten consecutive digits, e.g. 1235678912.
# Raw string: "\d" is a regex escape, not a valid Python string escape, and a
# non-raw literal triggers an invalid-escape warning on recent Python versions.
pattern1 = r'\d{10}'

re.findall(pattern1, text1)

['1235678912']

In [4]:
# Formatted phone number such as (123)-567-8912; the parentheses are escaped
# because they are literal characters here, not a capture group.
# Raw string so the regex escapes reach the re module unchanged.
pattern2 = r'\(\d{3}\)-\d{3}-\d{4}'

re.findall(pattern2, text2)

['(123)-567-8912']

In [5]:
# Either the formatted form "(123)-567-8912" or ten bare digits.
# Raw string so the regex escapes reach the re module unchanged.
pattern3 = r'\(\d{3}\)-\d{3}-\d{4}|\d{10}'

match1 = re.findall(pattern3, text1)
match2 = re.findall(pattern3, text2)
match3 = re.findall(pattern3, text3)

print(match1, match2, match3)

['1235678912'] ['(123)-567-8912'] ['1235678912']


In [6]:
# Email address: a non-empty local part, "@", then at least two dot-separated
# domain labels. Using "+" instead of "*" prevents a bare "@" from matching,
# and the explicit label structure keeps a sentence-ending "." out of the match.
email_pattern = r'[A-Za-z0-9_.+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+'

email1 = re.findall(email_pattern, text1)
email2 = re.findall(email_pattern, text2)
email3 = re.findall(email_pattern, text3)

print(email1, email2, email3)

['abc@xyz.com'] ['abX_82@xyz.com'] ['abc@xyz.io']


### Retrieving order number

In [7]:
chat1 = 'Codebasics: Hello, I am having an issue with my order # 412345982'
chat2 = 'Codebasics: I have a problem with my order number 412889912'
chat3 = 'codebasics: My order 412785624 is having an issue, I was charged 300$ when online it says 280$'

chats = [chat1, chat2, chat3]
# "order", then any non-digit filler (" # ", " number ", spaces), then the
# order number itself. "\d+" (not "\d*") so that the word "order" without a
# number produces no match instead of an empty string.
pattern = r'order\D*(\d+)'

for chat in chats:
    matches = re.findall(pattern, chat)
    print(matches)

['412345982']
['412889912']
['412785624']


In [8]:
def get_pattern_match(pattern, text, default=None):
    """Return the first match of ``pattern`` in ``text``.

    The result has the same shape as an element of ``re.findall``: the whole
    match when the pattern has no capture group, the group's text when it has
    exactly one, and a tuple of group texts when it has several (groups that
    did not participate are empty strings).

    Args:
        pattern: Regular expression (string or compiled pattern).
        text: String to search.
        default: Value returned when nothing matches. Defaults to None.

    Returns:
        The first match, or ``default`` when there is no match.
    """
    matches = re.findall(pattern, text)
    if not matches:
        return default
    return matches[0]

In [9]:
# Raw string for the regex escapes; "\d+" so a missing number yields no match
# rather than an empty string.
get_pattern_match(r'order\D*(\d+)', chat1)

'412345982'

### Retrieve email address and phone number

In [10]:
# Rebind chat1..chat3 to the phone/email sample messages used in this section.
chat1 = "codebasics: you ask lot of questions 😠  1235678912, abc@xyz.com"
chat2 = "codebasics: here it is: (123)-567-8912, abX_82@xyz.com"
chat3 = "codebasics: yes, phone: 1235678912 email: abc@xyz.io"

#### ---Email id---

In [11]:
# Reuse the email regex defined in the first section rather than repeating the literal.
get_pattern_match(email_pattern, chat1)

'abc@xyz.com'

In [12]:
# Reuse the email regex defined in the first section rather than repeating the literal.
get_pattern_match(email_pattern, chat2)

'abX_82@xyz.com'

In [13]:
emails_phone_no = [chat1, chat2, chat3]
for chat in emails_phone_no:
    # Same email regex as in the first section, referenced by name instead of copy-pasted.
    print(get_pattern_match(email_pattern, chat))

abc@xyz.com
abX_82@xyz.com
abc@xyz.io


#### ---Phone number---

In [14]:
# Formatted "(123)-567-8912" or ten bare digits; raw string keeps the regex escapes intact.
get_pattern_match(r'\(\d{3}\)-\d{3}-\d{4}|\d{10}', chat1)

'1235678912'

In [15]:
for chat in emails_phone_no:
    # Each alternative is wrapped in its own capture group, so the result is a
    # 2-tuple: (formatted number, bare number), with '' for the alternative
    # that did not match. Raw string keeps the regex escapes intact.
    matches = get_pattern_match(r'(\(\d{3}\)-\d{3}-\d{4})|(\d{10})', chat)
    print(matches)

('', '1235678912')
('(123)-567-8912', '')
('', '1235678912')


## Regex for Information Extraction

In [20]:
# Biography sample used as input for the extraction cells below.
# Rows are either "Label<TAB>value" on one line, or a label followed by its
# values on the next lines.
text='''
Born	Elon Reeve Musk
June 28, 1971 (age 50)
Pretoria, Transvaal, South Africa
Citizenship	
South Africa (1971–present)
Canada (1971–present)
United States (2002–present)
Education	University of Pennsylvania (BS, BA)
Title	
Founder, CEO and Chief Engineer of SpaceX
CEO and product architect of Tesla, Inc.
Founder of The Boring Company and X.com (now part of PayPal)
Co-founder of Neuralink, OpenAI, and Zip2
Spouse(s)	
Justine Wilson
​
​(m. 2000; div. 2008)​
Talulah Riley
​
​(m. 2010; div. 2012)​
​
​(m. 2013; div. 2016)
'''

In [36]:
# Age: the digits that follow the literal text "age "
age_pattern = r'age (\d+)'
get_pattern_match(age_pattern, text)

'50'

In [37]:
# Person's name: the rest of the "Born" line, with surrounding whitespace removed
name_pattern = r'Born(.*)\n'
get_pattern_match(name_pattern, text).strip()

'Elon Reeve Musk'

In [38]:
# Birth date: the entire line directly below the "Born" line (still includes the age)
birth_line_pattern = r'Born.*\n(.*)'
get_pattern_match(birth_line_pattern, text)

'June 28, 1971 (age 50)'

In [34]:
# Birth date without the age: capture the line below "Born" up to "(age".
# Raw string so "\(" and "\n" are handled by the regex engine, not by Python's
# string-literal escaping.
get_pattern_match(r'Born.*\n(.*)\(age', text)

'June 28, 1971 '

In [39]:
# Birth place: the line right after the one containing "(age ...", trimmed
birth_place_pattern = r'\(age.*\n(.*)'
get_pattern_match(birth_place_pattern, text).strip()

'Pretoria, Transvaal, South Africa'

In [78]:
def extract_personal_info(text):
    """Extract basic biographical fields from infobox-style text.

    Args:
        text: Multi-line string containing "Born", "(age N)" and either
            "Citizenship"/"Nationality" and "Education"/"Alma mater" rows.

    Returns:
        dict with the keys 'Age', 'Name', 'Birth_date', 'Birth_place',
        'Citizenship' and 'Education'. A field whose pattern does not match
        is None (previously a missing field raised AttributeError/TypeError).

        'Citizenship' and 'Education' come from alternations with several
        capture groups, so they are tuples in which the groups belonging to
        the alternative that did not match are empty strings.
    """
    def _stripped(value):
        # get_pattern_match returns None when nothing matched; only real
        # strings can be stripped.
        return value.strip() if value is not None else None

    age = get_pattern_match(r'age (\d+)', text)
    person_name = _stripped(get_pattern_match(r'Born(.*)\n', text))
    birth_date = _stripped(get_pattern_match(r'Born.*\n(.*)\(age', text))
    birth_place = _stripped(get_pattern_match(r'\(age.*\n(.*)', text))
    # Either three lines listed under "Citizenship", or a single "Nationality" value.
    citizenship = get_pattern_match(r'Citizenship.*\n(.*)\n(.*)\n(.*)|Nationality.(.*)', text)
    # Either a one-line "Education<TAB>value", or three lines listed under "Alma mater".
    education = get_pattern_match(r'Education\t(.*)|Alma.*\n(.*)\n(.*)\n(.*)', text)

    return {
        'Age': int(age) if age is not None else None,
        'Name': person_name,
        'Birth_date': birth_date,
        'Birth_place': birth_place,
        'Citizenship': citizenship,
        'Education': education
    }

In [79]:
# Run the extractor on the first biography and display the resulting dict.
personal_info = extract_personal_info(text)
personal_info

{'Age': 50,
 'Name': 'Elon Reeve Musk',
 'Birth_date': 'June 28, 1971',
 'Birth_place': 'Pretoria, Transvaal, South Africa',
 'Citizenship': ('South Africa (1971–present)',
  'Canada (1971–present)',
  'United States (2002–present)',
  ''),
 'Education': ('University of Pennsylvania (BS, BA)', '', '', '')}

In [80]:
# Second biography sample; it uses "Nationality" and "Alma mater" rows where
# the first sample uses "Citizenship" and "Education".
new_text = '''
Born	Mukesh Dhirubhai Ambani
19 April 1957 (age 64)
Aden, Colony of Aden
(present-day Yemen)[1][2]
Nationality	Indian
Alma mater	
St. Xavier's College, Mumbai
Institute of Chemical Technology (B.E.)
Stanford University (drop-out)
Occupation	Chairman and MD, Reliance Industries
Spouse(s)	Nita Ambani ​(m. 1985)​[3]
Children	3
Parent(s)	
Dhirubhai Ambani (father)
Kokilaben Ambani (mother)
Relatives	Anil Ambani (brother)
Tina Ambani (sister-in-law)
'''

In [81]:
# Run the same extractor on the second biography and display the resulting dict.
new_personal_info = extract_personal_info(new_text)
new_personal_info

{'Age': 64,
 'Name': 'Mukesh Dhirubhai Ambani',
 'Birth_date': '19 April 1957',
 'Birth_place': 'Aden, Colony of Aden',
 'Citizenship': ('', '', '', 'Indian'),
 'Education': ('',
  "St. Xavier's College, Mumbai",
  'Institute of Chemical Technology (B.E.)',
  'Stanford University (drop-out)')}