# Basic regex in Pandas

In [4]:
#IMPORT LIBRARIES
import pandas as pd
import re

In [5]:
## Create a series with random numbers
## row 1 and 3 have 2020 with a leading space
years = pd.Series(["2019 was pre-pandemic", " 2020 was a shocker",
                   "the first 234 days of 2019",
                   " 2020, in just 334", "info about 2019-20", 
                   "lost $453 in 1990-2000", "What will Winter 2022 be like?",
                   "Will Summer 2023 be scorcher?", 
                   "Devastating hurricane seasons included 2020-2022",
                  "The single worst season was 2022"])

In [6]:
## convert to dataframe with possible_period as column header
df = years.to_frame()
df.columns = (["possible_period"])
df

Unnamed: 0,possible_period
0,2019 was pre-pandemic
1,2020 was a shocker
2,the first 234 days of 2019
3,"2020, in just 334"
4,info about 2019-20
5,lost $453 in 1990-2000
6,What will Winter 2022 be like?
7,Will Summer 2023 be scorcher?
8,Devastating hurricane seasons included 2020-2022
9,The single worst season was 2022


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   possible_period  10 non-null     object
dtypes: object(1)
memory usage: 208.0+ bytes


## Goal

We want to create a column that holds all years that are in the following format:

- yyyy
- yyyy-yyyy
- Season yyyy
- yyyy-yy

## ```.str.match()```

```series.str.match()``` or ```df["column_header"].str.match()``` match ```regex``` patterns only at the very beginning of the string.

Often the least useful ```.str``` method.

In [8]:
## run str.match() on our series 
matches = years.str.match(r"\d{4}")
matches

0     True
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [9]:
## store match in column called match
## row 1 and 3 have 2020 with a leading space
df["match"] = df["possible_period"].str.match(r"\d{4}")
df

Unnamed: 0,possible_period,match
0,2019 was pre-pandemic,True
1,2020 was a shocker,False
2,the first 234 days of 2019,False
3,"2020, in just 334",False
4,info about 2019-20,False
5,lost $453 in 1990-2000,False
6,What will Winter 2022 be like?,False
7,Will Summer 2023 be scorcher?,False
8,Devastating hurricane seasons included 2020-2022,False
9,The single worst season was 2022,False


## ```str.contains()```

```series.str.contains()``` or ```df["column_header"].str.contains()``` match characters ```regex``` pattern anywhere in the string.

In [10]:
## STRING contains
df["contains"] =  df["possible_period"].str.contains(r"\d{4}")
df

Unnamed: 0,possible_period,match,contains
0,2019 was pre-pandemic,True,True
1,2020 was a shocker,False,True
2,the first 234 days of 2019,False,True
3,"2020, in just 334",False,True
4,info about 2019-20,False,True
5,lost $453 in 1990-2000,False,True
6,What will Winter 2022 be like?,False,True
7,Will Summer 2023 be scorcher?,False,True
8,Devastating hurricane seasons included 2020-2022,False,True
9,The single worst season was 2022,False,True


In [None]:
## contains yyyy pattern


```str.contains(regex_pattern, flags = re.I)```

You might also encounter: ```str.contains(regex_pattern, case = False)```

If ```case = True```, then case sensitive but if ```case = False``` not case sensitive.

The ```flags = ``` option is more versatile since you can use any regex flags.

In [12]:
## contains Season yyyy pattern
## if 'case = True', then case sensitive

df["contains_seasons"] = df["possible_period"].str.contains(r"(winter|summer|fall|spring)\s\d{4}", flags = re.I)
df

  df["contains_seasons"] = df["possible_period"].str.contains(r"(winter|summer|fall|spring)\s\d{4}", flags = re.I)


Unnamed: 0,possible_period,match,contains,contains_seasons
0,2019 was pre-pandemic,True,True,False
1,2020 was a shocker,False,True,False
2,the first 234 days of 2019,False,True,False
3,"2020, in just 334",False,True,False
4,info about 2019-20,False,True,False
5,lost $453 in 1990-2000,False,True,False
6,What will Winter 2022 be like?,False,True,True
7,Will Summer 2023 be scorcher?,False,True,True
8,Devastating hurricane seasons included 2020-2022,False,True,False
9,The single worst season was 2022,False,True,False


## ```.str.extract()```

Notice how with ```.str.contains()``` or ```.str.match()``` the result only tells us if something matched ())(()) a pattern.

To actually capture the characters that matched our pattern we use ```.str.extract()```.

In [16]:
### YYYY
df["yyyy-extract"] = df["possible_period"].str.extract(r"(\d{4})")
df

Unnamed: 0,possible_period,match,contains,contains_seasons,yyyy-extract
0,2019 was pre-pandemic,True,True,False,2019
1,2020 was a shocker,False,True,False,2020
2,the first 234 days of 2019,False,True,False,2019
3,"2020, in just 334",False,True,False,2020
4,info about 2019-20,False,True,False,2019
5,lost $453 in 1990-2000,False,True,False,1990
6,What will Winter 2022 be like?,False,True,True,2022
7,Will Summer 2023 be scorcher?,False,True,True,2023
8,Devastating hurricane seasons included 2020-2022,False,True,False,2020
9,The single worst season was 2022,False,True,False,2022


In [23]:
### YYYY-YY
df["hyphen-extract"] = df["possible_period"].str.extract(r"(\d{4}-\d{2,4})")
df

Unnamed: 0,possible_period,match,contains,contains_seasons,yyyy-extract,w_extract,seasons-extract,hyphen-extract
0,2019 was pre-pandemic,True,True,False,2019,,,
1,2020 was a shocker,False,True,False,2020,,,
2,the first 234 days of 2019,False,True,False,2019,,,
3,"2020, in just 334",False,True,False,2020,,,
4,info about 2019-20,False,True,False,2019,,,2019-20
5,lost $453 in 1990-2000,False,True,False,1990,,,1990-2000
6,What will Winter 2022 be like?,False,True,True,2022,Winter 2022,Winter 2022,
7,Will Summer 2023 be scorcher?,False,True,True,2023,,Summer 2023,
8,Devastating hurricane seasons included 2020-2022,False,True,False,2020,,,2020-2022
9,The single worst season was 2022,False,True,False,2022,,,


In [19]:
### find Winter YYYY
df["w_extract"] = df["possible_period"].str.extract(r"(winter\s\d{4})")
df

Unnamed: 0,possible_period,match,contains,contains_seasons,yyyy-extract,w_extract
0,2019 was pre-pandemic,True,True,False,2019,
1,2020 was a shocker,False,True,False,2020,
2,the first 234 days of 2019,False,True,False,2019,
3,"2020, in just 334",False,True,False,2020,
4,info about 2019-20,False,True,False,2019,
5,lost $453 in 1990-2000,False,True,False,1990,
6,What will Winter 2022 be like?,False,True,True,2022,
7,Will Summer 2023 be scorcher?,False,True,True,2023,
8,Devastating hurricane seasons included 2020-2022,False,True,False,2020,
9,The single worst season was 2022,False,True,False,2022,


In [20]:
### find Winter YYYY
## add a flag
df["w_extract"] = df["possible_period"].str.extract(r"(winter\s\d{4})", flags = re.I)
df

Unnamed: 0,possible_period,match,contains,contains_seasons,yyyy-extract,w_extract
0,2019 was pre-pandemic,True,True,False,2019,
1,2020 was a shocker,False,True,False,2020,
2,the first 234 days of 2019,False,True,False,2019,
3,"2020, in just 334",False,True,False,2020,
4,info about 2019-20,False,True,False,2019,
5,lost $453 in 1990-2000,False,True,False,1990,
6,What will Winter 2022 be like?,False,True,True,2022,Winter 2022
7,Will Summer 2023 be scorcher?,False,True,True,2023,
8,Devastating hurricane seasons included 2020-2022,False,True,False,2020,
9,The single worst season was 2022,False,True,False,2022,


In [22]:
### find both Summer YYYY and Winter YYYY
## add a flag
df["seasons-extract"] = df["possible_period"]\
.str.extract(r"((?:winter|summer|fall|spring)\s\d{4})", flags = re.I)
df

Unnamed: 0,possible_period,match,contains,contains_seasons,yyyy-extract,w_extract,seasons-extract
0,2019 was pre-pandemic,True,True,False,2019,,
1,2020 was a shocker,False,True,False,2020,,
2,the first 234 days of 2019,False,True,False,2019,,
3,"2020, in just 334",False,True,False,2020,,
4,info about 2019-20,False,True,False,2019,,
5,lost $453 in 1990-2000,False,True,False,1990,,
6,What will Winter 2022 be like?,False,True,True,2022,Winter 2022,Winter 2022
7,Will Summer 2023 be scorcher?,False,True,True,2023,,Summer 2023
8,Devastating hurricane seasons included 2020-2022,False,True,False,2020,,
9,The single worst season was 2022,False,True,False,2022,,


In [24]:
## convert to dataframe with possible_period as column header
df = years.to_frame()
df.columns = (["possible_period"])
df

Unnamed: 0,possible_period
0,2019 was pre-pandemic
1,2020 was a shocker
2,the first 234 days of 2019
3,"2020, in just 334"
4,info about 2019-20
5,lost $453 in 1990-2000
6,What will Winter 2022 be like?
7,Will Summer 2023 be scorcher?
8,Devastating hurricane seasons included 2020-2022
9,The single worst season was 2022


In [25]:
## Let's pull all the patterns one
## pattern to capture summer and winter followed by the year but not other text
regex_pat = r"((\d{4}-\d{2,4})|(\d{4})|((winter|summer|fall|spring)\s\d{4}))"
regex_pat

'((\\d{4}-\\d{2,4})|(\\d{4})|((winter|summer|fall|spring)\\s\\d{4}))'

In [26]:
## run pattern on possible_period
final = df["possible_period"].str.extract(regex_pat, flags = re.I)
final

Unnamed: 0,0,1,2,3,4
0,2019,,2019.0,,
1,2020,,2020.0,,
2,2019,,2019.0,,
3,2020,,2020.0,,
4,2019-20,2019-20,,,
5,1990-2000,1990-2000,,,
6,Winter 2022,,,Winter 2022,Winter
7,Summer 2023,,,Summer 2023,Summer
8,2020-2022,2020-2022,,,
9,2022,,2022.0,,


In [27]:
## call first column
final[[0]]

Unnamed: 0,0
0,2019
1,2020
2,2019
3,2020
4,2019-20
5,1990-2000
6,Winter 2022
7,Summer 2023
8,2020-2022
9,2022


## Pattern hard to read:

```r'((\d{4}-\d{4})|(\d{4}-\d{2}\b)|(\d{4})|((winter|summer)\s\d{4}))'```

Let's try another approach:

In [38]:
first= "sandeep"
last = "junnarkar"
space = " "

In [39]:
first + last

'sandeepjunnarkar'

In [40]:
first + "****" + last

'sandeep****junnarkar'

In [41]:
first + space + last

'sandeep junnarkar'

In [28]:
## Let's pull all the patterns into a list
## pattern to capture summer and winter followed by the year but not other text
regex_pat = [r'((?:winter|summer|fall|spring)\s\d{4})',
             r'(\d{4}-\d{2,4})',
             r'\d{4}'
    
]

regex_pat

['((?:winter|summer|fall|spring)\\s\\d{4})', '(\\d{4}-\\d{2,4})', '\\d{4}']

### ```str.extract('|'.join(regex))```

This allows us to iterate through our list of patterns with ```|``` denoting alternates.

In [30]:
## step in the right direction:
df["possible_period"].str.extract("|".join(regex_pat))

Unnamed: 0,0,1
0,,
1,,
2,,
3,,
4,,2019-20
5,,1990-2000
6,,
7,,
8,,2020-2022
9,,


### Missing items:

- flags
- capture grouping

In [32]:
## create a single capture group with correct flags
df["possible_period"].str.extract("("+"|".join(regex_pat)+")", flags = re.I)

Unnamed: 0,0,1,2
0,2019,,
1,2020,,
2,2019,,
3,2020,,
4,2019-20,,2019-20
5,1990-2000,,1990-2000
6,Winter 2022,Winter 2022,
7,Summer 2023,Summer 2023,
8,2020-2022,,2020-2022
9,2022,,


In [42]:
## store it into a variable:

final = df["possible_period"].str.extract("("+"|".join(regex_pat)+")", flags = re.I)
final

Unnamed: 0,0,1,2
0,2019,,
1,2020,,
2,2019,,
3,2020,,
4,2019-20,,2019-20
5,1990-2000,,1990-2000
6,Winter 2022,Winter 2022,
7,Summer 2023,Summer 2023,
8,2020-2022,,2020-2022
9,2022,,


In [45]:
final[[0]]

Unnamed: 0,0
0,2019
1,2020
2,2019
3,2020
4,2019-20
5,1990-2000
6,Winter 2022
7,Summer 2023
8,2020-2022
9,2022


In [43]:
df

Unnamed: 0,possible_period
0,2019 was pre-pandemic
1,2020 was a shocker
2,the first 234 days of 2019
3,"2020, in just 334"
4,info about 2019-20
5,lost $453 in 1990-2000
6,What will Winter 2022 be like?
7,Will Summer 2023 be scorcher?
8,Devastating hurricane seasons included 2020-2022
9,The single worst season was 2022


In [46]:
## add the first column into our main df
## drop all the intermediate demo columns
## call our final df
df["target_periods"] = final[0]
df

Unnamed: 0,possible_period,target_periods
0,2019 was pre-pandemic,2019
1,2020 was a shocker,2020
2,the first 234 days of 2019,2019
3,"2020, in just 334",2020
4,info about 2019-20,2019-20
5,lost $453 in 1990-2000,1990-2000
6,What will Winter 2022 be like?,Winter 2022
7,Will Summer 2023 be scorcher?,Summer 2023
8,Devastating hurricane seasons included 2020-2022,2020-2022
9,The single worst season was 2022,2022
