# Regular Expressions (Regex) with Examples in Python and Pandas

In [1]:
import re

In [2]:
text = 'There was further decline of the UHC'

#### re.findall(pattern, text)

In [3]:
re.findall("the", text)

['the', 'the']

In [4]:
re.findall("the", text, 
           flags=re.I)

['The', 'the', 'the']

#### re.search(pattern, text)

In [5]:
re.search("the", text)

<re.Match object; span=(13, 16), match='the'>

In [6]:
# Find the first occurrence of "the" in the text.
match_obj = re.search("the", text)

# Report, one value per line and in this order:
#   span()  - (start, end) index pair of the matched string
#   group() - the matched string itself
#   start() - start position of the match
#   end()   - end position of the match
print(
    match_obj.span(),
    match_obj.group(),
    match_obj.start(),
    match_obj.end(),
    sep="\n",
)

(13, 16)
the
13
16


#### re.match(pattern, text)

In [7]:
re.match('the', text)

In [8]:
# re.match only attempts the pattern at the very beginning of the text;
# the ignore-case flag lets 'the' match the leading 'The'.
text = 'The focus is on 2022'
is_match = re.match('the', text, flags=re.IGNORECASE)

if is_match is None:
    # Nothing matched at the start of the text, so this prints None.
    print(is_match)
else:
    print(f"'{is_match.group()}' appears at {is_match.span()}")

'The' appears at (0, 3)


#### re.finditer(pattern, text)

In [9]:
text = 'There was further decline of the UHC'

match = re.finditer('the', text, 
                    flags=re.I)
list(match)

[<re.Match object; span=(0, 3), match='The'>,
 <re.Match object; span=(13, 16), match='the'>,
 <re.Match object; span=(29, 32), match='the'>]

#### re.sub(pattern, repl, text)

In [10]:
text = 'my pin is 4444'
re.sub('4', '*', text)

'my pin is ****'

#### re.split(pattern, text)

In [11]:
text = "wow! nice! love it! bye! "
re.split("!", text)

['wow', ' nice', ' love it', ' bye', ' ']

## Regex Metacharacters

#### . (dot) aka wildcard - any character except a new line

In [12]:
pattern = r'.'
print(re.findall(pattern,
        "Wow! We're now_25"))

['W', 'o', 'w', '!', ' ', 'W', 'e', "'", 'r', 'e', ' ', 'n', 'o', 'w', '_', '2', '5']


#### \w (lowercase) - Any alphanumeric character (letter, digit, or underscore)

In [13]:
pattern = r'\w'
print(re.findall(pattern,
        "Wow! We're now_25"))

['W', 'o', 'w', 'W', 'e', 'r', 'e', 'n', 'o', 'w', '_', '2', '5']


#### \W (uppercase) - anything that is not \w such as spaces, and special characters.

In [14]:
pattern = r'\W'
print(re.findall(pattern,
        "Wow! We're now_25"))

['!', ' ', "'", ' ']


#### \d - any digit, 0 to 9.

In [15]:
pattern = r'\d'
print(re.findall(pattern,
        "Wow! We're now_25"))

['2', '5']


#### \D - Any non-digit. Negates \d.

In [16]:
pattern = r'\D'
print(re.findall(pattern,
        "Wow! now_25"))

['W', 'o', 'w', '!', ' ', 'n', 'o', 'w', '_']


#### \s (lowercase s) - A white space.

In [17]:
pattern = r'\s'
print(re.findall(pattern,
        "Wow! We're now_25"))

[' ', ' ']


#### \S (uppercase s) - Negates \s. Returns anything that is not a white space.

In [18]:
pattern = r'\S'
print(re.findall(pattern,
        "Wow! Now_25"))

['W', 'o', 'w', '!', 'N', 'o', 'w', '_', '2', '5']


#### Character sets

#### [ ] - matches any of the characters inside the square brackets

In [19]:
pattern = r'[aeiou]'
print(re.findall(pattern,
        "Wow! We're now_25"))

['o', 'e', 'e', 'o']


#### [^ ] - negates the character set: matches any character that is NOT one of the listed characters or in the listed range

In [20]:
#Any char except letters m to z
pattern = r'[^m-zM-Z]'
print(re.findall(pattern,
        "Wow! We're now_25"))

['!', ' ', 'e', "'", 'e', ' ', '_', '2', '5']


#### Repetition regex patterns

#### + (once or more)

In [21]:
#match o in hello once or many times
text = 'hell hello ago helloo hellooo'
pattern = r'hello+'

re.findall(pattern, text)

['hello', 'helloo', 'hellooo']

#### * (zero or more)

In [22]:
#match o in hello zero or many times
text = 'hell hello ago helloo hellooo'
pattern = r'hello*'

re.findall(pattern, text)

['hell', 'hello', 'helloo', 'hellooo']

#### ? (zero or once)

In [23]:
#match o in hello zero times or once
text = 'hell hello ago helloo hellooo'
pattern = r'hello?'

re.findall(pattern, text)

['hell', 'hello', 'hello', 'hello']

#### {n} - exact number of times to match

In [24]:
#Extract years
text = '7.6% in 2020 now 2022/23 budget'
pattern = r'\d{4}'

re.findall(pattern, text)

['2020', '2022']

#### {n,m} - minimum (n) and maximum (m) times to match

In [25]:
#Domain names;dot followed by 2 to 5 word chars
text = 'may@gmail.com cal@web.me ian@me.biz'

pattern = r'\.\w{2,5}'

re.findall(pattern, text)

['.com', '.me', '.biz']

#### {n,} - matches the previous element at least 'n' times.

In [26]:
#Long words
text = 'universal healthcare is low'
pattern = r'\w{5,}'
re.findall(pattern, text)

['universal', 'healthcare']

#### Greedy vs non-greedy quantifiers

In [27]:
# Greedy +: each match takes the longest possible run of digits.
text = '45% 2020'
# Raw string so \d reaches the regex engine instead of being parsed as an
# (invalid) Python string escape.
re.findall(r'\d+', text)

['45', '2020']

In [28]:
# Non-greedy +?: stops at the minimum of one digit, so every digit is its own match.
# Raw string keeps \d from being parsed as an invalid Python string escape.
re.findall(r'\d+?', text)

['4', '5', '2', '0', '2', '0']

In [29]:
# Greedy *: takes the longest run of digits, and also produces an empty
# match wherever zero digits are found.
# Raw string keeps \d from being parsed as an invalid Python string escape.
re.findall(r'\d*', text)

['45', '', '', '2020', '']

In [30]:
# Non-greedy *?: prefers to match zero digits, so the result interleaves
# empty strings with single digits.
# Raw string keeps \d from being parsed as an invalid Python string escape.
re.findall(r'\d*?', text)

['', '4', '', '5', '', '', '', '2', '', '0', '', '2', '', '0', '']

In [31]:
# Greedy ?: matches one digit when available, otherwise an empty string.
# Raw string keeps \d from being parsed as an invalid Python string escape.
re.findall(r'\d?', text)

['4', '5', '', '', '2', '0', '2', '0', '']

In [32]:
# Non-greedy ??: prefers to match zero digits, so the result interleaves
# empty strings with single digits.
# Raw string keeps \d from being parsed as an invalid Python string escape.
re.findall(r'\d??', text)

['', '4', '', '5', '', '', '', '2', '', '0', '', '2', '', '0', '']

In [33]:
# Greedy {n}: exactly three consecutive digits.
# Raw string keeps \d from being parsed as an invalid Python string escape.
re.findall(r'\d{3}', text)

['202']

In [34]:
# Non-greedy {n}?: the count is fixed, so the lazy modifier does not
# change the result compared with the greedy form.
# Raw string keeps \d from being parsed as an invalid Python string escape.
re.findall(r'\d{3}?', text)

['202']

In [35]:
# Greedy {n,m}: takes two digits whenever two are available.
# Raw string keeps \d from being parsed as an invalid Python string escape.
re.findall(r'\d{1,2}', text)

['45', '20', '20']

In [36]:
# Non-greedy {n,m}?: settles for the minimum of one digit per match.
# Raw string keeps \d from being parsed as an invalid Python string escape.
re.findall(r'\d{1,2}?', text)

['4', '5', '2', '0', '2', '0']

#### Boundary/ anchors

#### ^ - matches only the start of a text

In [37]:
#Starts with two digits
text = '500,000 units'
pattern = r'^\d\d'
re.findall(pattern, text)

['50']

#### $ - matches the end of the string

In [38]:
#Ends with two digits
text = '500,000 units'
pattern = r'\d\d$'
re.findall(pattern, text)

[]

#### \b (word boundary)

In [39]:
pattern = r'\b'
re.findall(pattern,
           "Wow! We're now_25")

['', '', '', '', '', '', '', '']

In [40]:
pattern = r'\b'
re.sub(pattern, 
       '~',
       "Wow! We're now_25")

"~Wow~! ~We~'~re~ ~now_25~"

#### Groups

#### () - defines groups in a pattern

In [41]:
text = 'Yvonne worked for von'
pattern = r'(.o.)'
re.findall(pattern, text)

['von', 'wor', 'for', 'von']

#### m.group() to access groups

In [42]:
text = 'this is @sue email sue@gmail.com'
# Three capture groups: the word before '@', the word between '@' and the
# dot, and the word after the dot.
pattern = r'(\w+)@(\w+)\.(\w+)\b'
m = re.search(pattern, text)

# The match object itself.
print(m)

# group(0) is the full match; groups() supplies groups 1, 2 and 3 in order.
# Each value is printed on its own line.
print(m.group(0), *m.groups(), sep='\n')

<re.Match object; span=(19, 32), match='sue@gmail.com'>
sue@gmail.com
sue
gmail
com


#### \number to access groups

In [43]:
text = 'hello, we need 22 books'
pattern = r'(\w)\1'
list(re.finditer(pattern, text))

[<re.Match object; span=(2, 4), match='ll'>,
 <re.Match object; span=(11, 13), match='ee'>,
 <re.Match object; span=(15, 17), match='22'>,
 <re.Match object; span=(19, 21), match='oo'>]

#### Named groups

In [44]:
text = '08 Dec'
# (?P<name>...) labels a group so it can be fetched by name.
# Raw string so \d, \s and \w reach the regex engine instead of being
# parsed as (invalid) Python string escapes.
pattern = r'(?P<day>\d{2})\s(?P<month>\w{3})'
m = re.search(pattern, text)
m.group('day')

'08'

#### ?: - Non-capturing groups

In [45]:
text = 'date 23 total 42% date 17 total 35%'
pattern = r'(\d+)(?:%)'
re.findall(pattern, text)

['42', '35']

#### | (or)

In [46]:
text = 'the sunny sun shines'
re.findall(r'sun|shine', text)

['sun', 'sun', 'shine']

## Pandas and regular expressions

In [47]:
import pandas as pd
df = pd.read_csv('titanic.csv')

### Filtering a dataframe using series.str.contains(pattern)

#### Task 1: Filter the dataframe to return rows where the ticket number contains C followed by A, with or without dots (e.g. 'CA' or 'C.A.').

In [48]:
pattern = r'C\.?A\.?'
mask = df['Ticket'].str.contains(pattern)
df[mask].head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
33,34,0,2,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5,,S
56,57,1,2,"Rugg, Miss. Emily",female,21.0,0,0,C.A. 31026,10.5,,S
58,59,1,2,"West, Miss. Constance Mirium",female,5.0,1,2,C.A. 34651,27.75,,S
59,60,0,3,"Goodwin, Master. William Frederick",male,11.0,5,2,CA 2144,46.9,,S
66,67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29.0,0,0,C.A. 29395,10.5,F33,S
70,71,0,2,"Jenkin, Mr. Stephen Curnow",male,32.0,0,0,C.A. 33111,10.5,,S
71,72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S
93,94,0,3,"Dean, Mr. Bertram Frank",male,26.0,1,2,C.A. 2315,20.575,,S
134,135,0,2,"Sobey, Mr. Samuel James Hayden",male,25.0,0,0,C.A. 29178,13.0,,S
145,146,0,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,C.A. 33112,36.75,,S


In [49]:
mask.sum()

42

### Extracting data - s.str.extract(pattern)

#### Task 2: Extract all unique titles such as Mr, Miss, and Mrs from passenger names.

In [50]:
df['Name'].sample(3)

60        Sirayanian, Mr. Orsen
848           Harper, Rev. John
32     Glynn, Miss. Mary Agatha
Name: Name, dtype: object

In [51]:
# A title is the run of word characters between a whitespace character and
# a literal dot, e.g. " Mr.".
# Raw string so \s and \w reach the regex engine instead of being parsed as
# (invalid) Python string escapes.
pattern = r'\s(\w+)\.'
# With a single capture group, expand=False returns a Series rather than a
# one-column DataFrame.
all_ts = df['Name'].str.extract(pattern, expand=False)
# Count how many names carry each title.
unique_ts = all_ts.value_counts()
unique_ts

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Col           2
Major         2
Countess      1
Lady          1
Sir           1
Mme           1
Ms            1
Jonkheer      1
Capt          1
Don           1
Name: Name, dtype: int64

#### Task 3a: From the 'Name' column, extract the titles, first names, and last names, and return them as columns in a new dataframe.

In [52]:
df['Name'].head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [53]:
pattern = r'(\w+), (\w+\.) (\w+).*'
df_names = df['Name'].str.extract(
                            pattern, 
                            flags=re.I)
df_names

Unnamed: 0,0,1,2
0,Braund,Mr.,Owen
1,Cumings,Mrs.,John
2,Heikkinen,Miss.,Laina
3,Futrelle,Mrs.,Jacques
4,Allen,Mr.,William
...,...,...,...
886,Montvila,Rev.,Juozas
887,Graham,Miss.,Margaret
888,Johnston,Miss.,Catherine
889,Behr,Mr.,Karl


#### Task 3b: Clean up the dataframe above with named and ordered columns.

In [54]:
# Each named group becomes a column label in the extracted dataframe.
pattern = r'(?P<lastname>\w+),\s(?P<title>\w+\.)\s(?P<firstname>\w+).*'
df_named = df['Name'].str.extract(pattern, flags=re.IGNORECASE)

# Reorder the columns: title first, then first name, then last name.
df_clean = df_named.reindex(columns=['title', 'firstname', 'lastname'])
df_clean.head()

Unnamed: 0,title,firstname,lastname
0,Mr.,Owen,Braund
1,Mrs.,John,Cumings
2,Miss.,Laina,Heikkinen
3,Mrs.,Jacques,Futrelle
4,Mr.,William,Allen


### Replacing values in a column - s.str.replace(pattern, repl)

#### Task 4a: Replace all the titles with capital letters.

In [55]:
df['Name'].tail()

886                       Montvila, Rev. Juozas
887                Graham, Miss. Margaret Edith
888    Johnston, Miss. Catherine Helen "Carrie"
889                       Behr, Mr. Karl Howell
890                         Dooley, Mr. Patrick
Name: Name, dtype: object

In [56]:
# Whitespace, a run of word characters, a literal dot and a space, e.g. " Rev. ".
pattern = r'\s(\w+)\. '
# regex=True is stated explicitly: pandas >= 2.0 defaults to literal
# (non-regex) replacement and raises ValueError for a callable replacement
# unless regex matching is requested.
df['Name'].str.replace(
    pattern,
    lambda m: m.group().upper(),  # upper-case the whole matched title
    regex=True,
).tail()

886                       Montvila, REV. Juozas
887                Graham, MISS. Margaret Edith
888    Johnston, MISS. Catherine Helen "Carrie"
889                       Behr, MR. Karl Howell
890                         Dooley, MR. Patrick
Name: Name, dtype: object

#### Task 4b: Convert only the Mr. and Mrs. titles to uppercase. In this case, we use | (or) inside parentheses.

In [57]:
# Only the titles Mr or Mrs, surrounded by whitespace and followed by a dot.
pattern = r'\s(Mr|Mrs)\.\s'
# regex=True is stated explicitly: pandas >= 2.0 defaults to literal
# (non-regex) replacement and raises ValueError for a callable replacement
# unless regex matching is requested.
df['Name'].str.replace(
    pattern,
    lambda m: m.group().upper(),  # upper-case the whole matched title
    flags=re.I,                   # match the titles case-insensitively
    regex=True,
).tail()

886                       Montvila, Rev. Juozas
887                Graham, Miss. Margaret Edith
888    Johnston, Miss. Catherine Helen "Carrie"
889                       Behr, MR. Karl Howell
890                         Dooley, MR. Patrick
Name: Name, dtype: object

#### Task 5: Clean the dates in the column below by inserting dashes to show the day, month, and year.

In [58]:
d = pd.DataFrame({'A': ['date is 12122022', 'date 02102021', 'Its the 05022020']})
d

Unnamed: 0,A
0,date is 12122022
1,date 02102021
2,Its the 05022020


In [59]:
# Eight consecutive digits captured as three groups: 2 + 2 + 4 digits.
pattern = r'(\d{2})(\d{2})(\d{4})'
# regex=True is stated explicitly: pandas >= 2.0 defaults to literal
# (non-regex) replacement and raises ValueError for a callable replacement
# unless regex matching is requested.
d['A'] = d['A'].str.replace(
    pattern,
    lambda m: '-'.join(m.groups()),  # group1-group2-group3
    regex=True,
)

In [60]:
d

Unnamed: 0,A
0,date is 12-12-2022
1,date 02-10-2021
2,Its the 05-02-2020


#### Task 6: Format the prices below to have a dollar sign and a comma for thousands

In [61]:
d = pd.DataFrame({'A': ['amount is 2340.50', 'amount is 10500', 'amount is 136.45']})
d

Unnamed: 0,A
0,amount is 2340.50
1,amount is 10500
2,amount is 136.45


In [62]:
# One or more digits, optionally followed by a dot and more digits.
# The optional part is a non-capturing group rather than a character class:
# '[\.\d+]*' would accept any mix of dots, digits and literal '+' signs
# (e.g. '1.2.3'), which float() cannot parse.
pattern = r'\d+(?:\.\d+)?'
# regex=True is stated explicitly: pandas >= 2.0 defaults to literal
# (non-regex) replacement and raises ValueError for a callable replacement
# unless regex matching is requested.
d['A'].str.replace(
    pattern,
    lambda m: f"${float(m.group()):,}",  # dollar sign + thousands separator
    regex=True,
)

0     amount is $2,340.5
1    amount is $10,500.0
2      amount is $136.45
Name: A, dtype: object