In [1]:
from typing import List
import logging
import functools
import operator
import numpy as np
import pandas as pd

In [2]:
# pandas lets us apply basic string and regular-expression methods to a whole Series of data.
# Start with Python's built-in string methods on a single comma-separated value
# (note the space before 'guido').
val = 'a,b, guido'
val

'a,b, guido'

In [3]:
# split on commas only; whitespace around each piece is kept
val.split(',')

['a', 'b', ' guido']

In [4]:
# split on commas, then trim the whitespace around every piece
pieces: List[str] = list(map(str.strip, val.split(',')))
pieces

['a', 'b', 'guido']

In [5]:
#unpack
first, second, third = pieces
first, second, third

('a', 'b', 'guido')

In [6]:
first + '::' + second + '::' + third 

'a::b::guido'

In [7]:
#join works better
'::'.join(pieces)

'a::b::guido'

In [8]:
# fold the pieces together with operator.add after appending '::' to each one;
# every piece gets a separator, so the result still ends with two extra colons
functools.reduce(operator.add, (piece + '::' for piece in pieces))

'a::b::guido::'

In [9]:
#`in` tests whether a substring is present; index and find return its position
'guido' in val

True

In [10]:
val.index(',')

1

In [11]:
val.find(':')

-1

In [12]:
# find returns -1 for a missing substring, whereas index raises ValueError;
# catch the error and log its message instead of letting the cell fail
try:
    val.index(':')
except ValueError as err:
    logging.critical(err)

CRITICAL:root:substring not found


In [13]:
#count: number of occurrences of a substring
val.count(',')

2

In [14]:
#replace: substitute one substring for another; replacing with an empty string
#deletes every occurrence of the substring
val.replace(',', '::')

'a::b:: guido'

In [15]:
val.replace(',', '')

'ab guido'

In [16]:
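#casefold: aggressive lowercasing for caseless comparison; val is already lowercase, so it comes back unchanged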
val.casefold()

'a,b, guido'

In [17]:
val.index(',')

1

In [18]:
val.find(',')

1

In [19]:
# rfind searches from the right: position of the last ',' in val
val.rfind(',')

3

In [20]:
#regular expressions
import re

In [21]:
text = 'foo   bar\t baz  \tqux'

In [22]:
#\s+ splits on any consecutive whitespace sequence
#the regular expression is compiled first here
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [23]:
%time
#manually compile regular expressions
regex = re.compile(r'\s+')
regex.split(text)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 2.86 µs


['foo', 'bar', 'baz', 'qux']

In [24]:
%time
#same thing
re.split(regex, text)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.29 µs


['foo', 'bar', 'baz', 'qux']

In [25]:
#find all occurrences of the regex pattern \s+
regex.findall(text)

['   ', '\t ', '  \t']

In [26]:
#same thing
re.findall(regex, text)

['   ', '\t ', '  \t']

In [32]:
#! ipython verbatim id=8f2d09408b41491aa269044fc4375d93
# sample text: one "Name address" pair per line
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""
# email pattern: local part, '@', domain, then a dot and a 2-4 letter suffix
# (a suffix longer than four letters is not matched in full).
# Only uppercase letters are listed, so matching the lowercase addresses above
# requires compiling with re.IGNORECASE.
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

In [35]:
# compile once, case-insensitively, so the A-Z classes also accept lowercase letters
regex = re.compile(pattern, re.IGNORECASE)
regex

re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.IGNORECASE|re.UNICODE)

In [37]:
#findall returns a list of email addresses
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [38]:
re.findall(regex, text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [39]:
#search returns only the first match
#search returns a regex match object for the first email address
m = regex.search(text)
m

In [40]:
# slice the searched text with the match's start/end offsets to recover the
# matched substring (the same string m.group() returns)
text[m.start():m.end()]

'dave@google.com'

In [42]:
#match returns only a match at the beginning of the string
print(regex.match(text)) #the pattern does not occur at the beginning of the string

None


In [43]:
#but if we drop the text before the pattern, match succeeds
print(regex.match(text[m.start():]))

<re.Match object; span=(0, 15), match='dave@google.com'>


In [44]:
#sub: returns a new string with the pattern replaced by a new string
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


In [46]:
#same thing
print(re.sub(regex, 'REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


In [None]:
#put parentheses around the parts of the pattern to segment
#for example splitting email addresses into their component parts
