In [1]:
import pandas as pd
import numpy as np

# Sample data for the examples below: digit-only strings, names, lower- and
# upper-case text, a string with punctuation, and one missing value.
s = pd.Series(
    ["0", "John Wood", "Colin Welsh", "my list", "02456", np.nan, "HELLO WORLD", "water%"]
)

In [2]:
# Element-wise test for the substring "John"; the missing value (row 5)
# propagates as NaN in the result.
s.str.contains(pat="John")

0    False
1     True
2    False
3    False
4    False
5      NaN
6    False
7    False
dtype: object

In [3]:
# Combine two element-wise tests with a logical OR: True where either name occurs.
(
    s.str.contains("John")
    | s.str.contains("Colin")
)

0    False
1     True
2     True
3    False
4    False
5    False
6    False
7    False
dtype: bool

In [5]:
# Shorter version: a single call using regex alternation (|) instead of
# OR-ing the results of two separate calls.
s.str.contains(pat="John|Colin")

0    False
1     True
2     True
3    False
4    False
5      NaN
6    False
7    False
dtype: object

In [9]:
# Short words that all contain "ar" at different positions.
s2 = pd.Series(
    [
        "bar",
        "sugar",
        "cartoon",
        "argon",
    ]
)
s2

0        bar
1      sugar
2    cartoon
3      argon
dtype: object

In [10]:
# "." matches any single character, so ".ar" needs at least one character
# immediately before "ar" ("argon" starts with "ar" and therefore fails).
s2.str.contains(pat=".ar")

0     True
1     True
2     True
3    False
dtype: bool

In [11]:
# Square brackets define a character class: [bc] matches a single "b" or "c",
# so this flags every string that contains "bar" or "car".
s2.str.contains(pat="[bc]ar")

0     True
1    False
2     True
3    False
dtype: bool

In [15]:
# Character classes: [a-z] any lower-case letter, [A-Z] any upper-case letter,
# [0-9] any digit, [a-zA-Z0-9] any letter or digit.
# A leading ^ inside the brackets negates the class: any character NOT listed.

# First the strings holding at least one digit, then the strings holding at
# least one non-digit character. na=False counts the missing value as no match.
for pattern in ("[0-9]", "[^0-9]"):
    print(s[s.str.contains(pattern, na=False)])

0        0
4    02456
dtype: object
1      John Wood
2    Colin Welsh
3        my list
6    HELLO WORLD
7         water%
dtype: object


\d - match any digit
\D - match any non-digit
\w - match any alphanumeric character (letter or digit) or an underscore (_)
\W - match any character that is not alphanumeric or an underscore as described above
\s - match whitespace (spaces, tabs, newlines, etc.)
\S - match non-whitespace

In [18]:
# All strings containing a digit.
# The pattern is a raw string so the backslash reaches the regex engine
# unchanged ("\d" in a normal literal is an invalid escape sequence), and the
# redundant single-member character class [\d] is reduced to plain \d.
# na=False counts the missing value as no match so the mask is purely boolean.
s[s.str.contains(r"\d", na=False)]

0        0
4    02456
dtype: object

In [20]:
# Anchors: ^ matches at the beginning of the string, $ matches at the end.
# Keep only the strings whose first character is "b" or "c".
s2.loc[s2.str.contains("^[bc]", na=False)]

0        bar
2    cartoon
dtype: object

In [21]:
# Keep only the strings that end in "ar" ($ anchors the match at the end).
s2.loc[s2.str.contains("ar$", na=False)]

0      bar
1    sugar
dtype: object

* - match zero or more copies of the preceding element (a character, character class, or group)
? - match zero or one copy of the preceding element
+ - match one or more copies of the preceding element
Or we can use curly braces to specify how many times we want to match the given character. We have the following choices:

{m} - match the preceding element m times
{m,} - match the preceding element m times or more
{m,n} - match the preceding element between m and n times

In [22]:
# Quantifier demo: one or more "f", then an optional "o", then one or more "f".
s3 = pd.Series(
    [
        "forest",
        "o",
        "ff",
        "foo",
        "fof",
    ]
)
s3.str.contains(pat="f+o?f+")

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [23]:
# Each entry is a weekday name immediately followed by a distance in km.
s4 = pd.Series(
    [
        "Monday5km",
        "Wednesday10km",
        "Saturday25km",
    ]
)

In [26]:
# Extract weekday names into a new column.
# The capture group (\w+day) grabs the run of word characters ending in "day".
# Raw string: "\w" in a normal literal is an invalid escape sequence.
s4.str.extract(r"(\w+day)", expand=True)

Unnamed: 0,0
0,Monday
1,Wednesday
2,Saturday


In [27]:
# Extract weekday names and distances in km into separate columns:
# one output column per capture group.
# Raw string: "\w" and "\d" in a normal literal are invalid escape sequences.
s4.str.extract(r"(\w+day)(\d+km)", expand=True)

Unnamed: 0,0,1
0,Monday,5km
1,Wednesday,10km
2,Saturday,25km


In [28]:
# A single string to try the same pattern on with the plain `re` module.
sample = "Monday5km"
sample

'Monday5km'

In [29]:
# Import re library
import re

# Match groups according to the regex pattern.
# The pattern is a raw string so \w and \d reach the regex engine unchanged;
# in a normal literal they are invalid escape sequences.
m = re.match(
    r"(\w+day)(\d+km)",  # regex pattern: weekday name, then distance in km
    sample,              # string sample
)

# Show matched groups
m.groups()

('Monday', '5km')

In [30]:
# First captured group (the weekday name).
m.group(1)

'Monday'

In [31]:
# First three characters of the first captured group.
m.group(1)[:3]

'Mon'

In [32]:
def f(x):
    """Return the first three characters of the first captured group.

    `x` is a regex match object; only its first capture group is used.
    """
    first_group = x.group(1)
    return first_group[:3]

In [33]:
# Replace each weekday name with whatever f returns for that match
# (a callable replacement receives the regex match object).
# Raw string: "\w" in a normal literal is an invalid escape sequence.
s4.str.replace(r"(\w+day)", f, regex=True)

0     Mon5km
1    Wed10km
2    Sat25km
dtype: object