In [1]:
from typing import Dict, List
import logging
import functools
import operator
import numpy as np
import pandas as pd

In [2]:
#pandas lets us apply basic string and regular-expression methods to a Series of data.
#First, the built-in Python string methods, using a small comma-separated sample string
val = 'a,b, guido'
val

'a,b, guido'

In [3]:
#split breaks the string on the delimiter; surrounding whitespace is kept (note ' guido')
val.split(',')

['a', 'b', ' guido']

In [4]:
#split on commas, then trim the whitespace around every piece
pieces : List[str] = list(map(str.strip, val.split(',')))
pieces

['a', 'b', 'guido']

In [5]:
#unpack the three-element list into separate names
first, second, third = pieces
first, second, third

('a', 'b', 'guido')

In [6]:
#concatenate the substrings by hand with a '::' delimiter
first + '::' + second + '::' + third 

'a::b::guido'

In [7]:
#str.join is the more concise way: the delimiter string joins every element of the list
'::'.join(pieces)

'a::b::guido'

In [8]:
#reduce-based alternative: '::' is appended to every piece before the pieces are added together
functools.reduce(operator.add, [i+'::' for i in pieces])
#so the result ends with a trailing '::' that join does not produce

'a::b::guido::'

In [9]:
#in, index and find can all be used to locate a substring
#in only reports whether the substring is present
'guido' in val

True

In [10]:
#index returns the position of the first occurrence of the substring
val.index(',')

1

In [11]:
#find returns -1 when the substring is absent
val.find(':')

-1

In [12]:
#unlike find, index raises ValueError when the substring is missing
try:
    val.index(':')
except ValueError as err:
    #log the exception message instead of letting the cell fail
    logging.critical(err)

CRITICAL:root:substring not found


In [13]:
#count returns the number of occurrences of a substring
val.count(',')

2

In [14]:
#replace: substitute every occurrence of one substring with another
#(passing an empty string as the replacement deletes the substring instead)
val.replace(',', '::')

'a::b:: guido'

In [15]:
#replacing with an empty string deletes the commas
val.replace(',', '')

'ab guido'

In [16]:
#casefold: lowercase the string for caseless comparison
#(val is already all lowercase, so the result is unchanged)
val.casefold()

'a,b, guido'

In [17]:
#index: position of the first comma
val.index(',')

1

In [18]:
#find gives the same position as index when the substring is present
val.find(',')

1

In [19]:
#rfind: position of the last occurrence of the substring
val.rfind(',')

3

In [20]:
#regular expressions
import re

In [21]:
#sample text whose words are separated by a mix of spaces and tabs
text = 'foo   bar\t baz  \tqux'

In [22]:
#\s+ matches one or more consecutive whitespace characters, so the text is split on each whitespace run
#re.split compiles the pattern first and then calls its split method on the text
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [23]:
%time
#manually compile regular expressions
regex = re.compile(r'\s+')
regex.split(text)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.05 µs


['foo', 'bar', 'baz', 'qux']

In [24]:
%time
#same thing
re.split(regex, text)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 4.29 µs


['foo', 'bar', 'baz', 'qux']

In [25]:
#find all substrings matching the regex pattern \s+ (the whitespace runs themselves)
regex.findall(text)

['   ', '\t ', '  \t']

In [26]:
#same thing through the module-level function
re.findall(regex, text)

['   ', '\t ', '  \t']

In [27]:
#! ipython verbatim id=8f2d09408b41491aa269044fc4375d93
#text is rebound here to a new sample: one "name email" pair per line
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""
#pattern for an email address: local part, '@', domain, '.', then a 2-4 letter suffix
#only upper-case letters are listed, so matching lower-case addresses needs re.IGNORECASE
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

In [28]:
#re.IGNORECASE makes the regex case-insensitive
#regex is rebound here to the compiled email pattern
regex=re.compile(pattern, flags=re.IGNORECASE)
regex

re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.IGNORECASE|re.UNICODE)

In [29]:
#findall returns a list of all the email addresses found in the text
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [30]:
#same result through the module-level function
re.findall(regex, text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [31]:
#search scans the whole string but returns only the first match,
#as a match object (here for the first email address)
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [32]:
#start() and end() give the position of the match in the string, so slicing recovers the matched text
text[m.start():m.end()]

'dave@google.com'

In [33]:
#match only succeeds when the pattern occurs at the beginning of the string
print(regex.match(text)) #None: the text starts with a name, not an email address

None


In [34]:
#but slicing off the text before the first match puts the pattern at the start of the string
print(regex.match(text[m.start():]))

<re.Match object; span=(0, 15), match='dave@google.com'>


In [35]:
#sub: returns a new string with every occurrence of the pattern replaced by a new string
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


In [36]:
#same thing through the module-level function
print(re.sub(regex, 'REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


In [37]:
#put parentheses around the parts of the pattern to capture them as groups,
#for example to split email addresses into username, domain name and domain suffix
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [38]:
#recompile with the grouped pattern, again ignoring case
regex = re.compile(pattern, flags=re.IGNORECASE)
regex

re.compile(r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})',
           re.IGNORECASE|re.UNICODE)

In [39]:
#a match object produced by the grouped regex returns a tuple of the captured
#components through its groups method
m = regex.match('wesm@bright.net')
m

<re.Match object; span=(0, 15), match='wesm@bright.net'>

In [40]:
#groups: one string per parenthesised group, in pattern order
m.groups()

('wesm', 'bright', 'net')

In [41]:
#findall returns a list of tuples (one tuple per match) when the pattern has groups
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [42]:
#sub can also refer to the captured groups in the replacement string with special symbols:
#\1 is the first group, \2 the second, and so on
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


In [43]:
#the same substitution written with the \g<n> form of group reference:
#\g<1> is equivalent to \1 and stays unambiguous when a literal digit follows the reference
print(regex.sub(r'Username: \g<1>, Domain: \g<2>, Suffix: \g<3>', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


In [44]:
#finditer: iterate over the match objects for every occurrence of the pattern
for found in regex.finditer(text):
    print(found)

<re.Match object; span=(5, 20), match='dave@google.com'>
<re.Match object; span=(27, 42), match='steve@gmail.com'>
<re.Match object; span=(47, 60), match='rob@gmail.com'>
<re.Match object; span=(66, 80), match='ryan@yahoo.com'>


In [45]:
#subn: like sub, but returns a tuple of (new string, number of substitutions made)
#count=1 limits the replacement to the first occurrence
re.subn('t', '', 'test', count=1)

('est', 1)

In [46]:
#pandas string functions
#cleaning a dataset usually takes a lot of string manipulation,
#and columns holding strings may also contain missing values
data : Dict = {
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
}
data

{'Dave': 'dave@google.com',
 'Steve': 'steve@gmail.com',
 'Rob': 'rob@gmail.com',
 'Wes': nan}

In [47]:
#convert the dict to a Series: the keys become the index, the values the data
#NOTE: this rebinds data from a dict to a Series
data = pd.Series(data)

In [48]:
#the missing email is shown as NaN and the dtype is object
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [49]:
#isna flags the missing values
data.isna()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [50]:
#the top-level function gives the same result
pd.isna(data)

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [51]:
#notna is the boolean inverse of isna
pd.notna(data)

Dave      True
Steve     True
Rob       True
Wes      False
dtype: bool

In [53]:
#data.map will fail on NA values
#Series.str methods skip over and propagate NaN values
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [54]:
#above we have the object dtype. We can also make use of the string extension dtype.
data_as_string_ext = data.astype('string')

In [55]:
#with the string dtype the missing value is displayed as <NA>
data_as_string_ext

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                 <NA>
dtype: string

In [56]:
#see if each entry contains 'gmail'; with the string dtype the result has the boolean dtype and <NA> for the missing entry
data_as_string_ext.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes       <NA>
dtype: boolean

In [57]:
#we can also use regular expressions and their flags (such as re.IGNORECASE) with the str methods
#this is the same grouped email pattern as before
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [59]:
#find the email addresses in each element
data.str.findall(pattern) #without re.IGNORECASE the upper-case-only pattern matches nothing, so we get empty lists

Dave      []
Steve     []
Rob       []
Wes      NaN
dtype: object

In [60]:
#find the email addresses in each element
data.str.findall(pattern, flags=re.IGNORECASE) #with re.IGNORECASE each address is matched and split into its groups

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [63]:
#vectorized element retrieval: .str[0] takes the first item of each list of matches
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
matches

Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [64]:
#1) str.get: retrieve the element at a given position of every entry (index 1 is the domain name)
matches.str.get(1)

Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

In [65]:
#2) index or slice the str attribute directly (here the first five characters of each string)
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [66]:
#str.extract: return the captured groups of a regular expression as a DataFrame, one column per group
data.str.extract(pattern, flags=re.IGNORECASE)

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,
