# Imports

In [2]:
## imports
import pandas as pd
import re
import numpy as np

## print multiple things from same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load data and show examples

In [5]:
# Read the exercise dataset; the path is relative to this notebook's directory.
schools_df = pd.read_csv("../../../public_data/schools_df.csv")
# Preview the first rows: the school name column plus the three exercise flags.
schools_df.head()

Unnamed: 0,schoolname,individualispjune2020,participatingincepsy2021,is_elem_exercise,is_charter_exercise,is_highschool_exercise
0,stove prairie elementary school,0.0,N,True,False,False
1,stewart county elementary school,0.7603,Y,True,False,False
2,desert springs elementary school,,N,True,False,False
3,saunemin elem school,0.3893999999999999,N,True,False,False
4,fifth district elementary,0.0275,N,True,False,False


# 1. re.sub illustration

**Task**: 

- Use the dataset `schools_df` and filter to rows where `is_elem_exercise` == True 
- Using the `schoolname` field, replace the different varieties of elementary school in the data with `elemschool` 

## Incorrect approach 

Returns incorrect results that we'll see below

In [6]:
# Keep only the rows flagged for the elementary exercise; .copy() gives an
# independent frame to add the cleaned-name columns to.
elem_ex = schools_df[schools_df.is_elem_exercise].copy()
# Deliberately flawed pattern. Alternatives are tried left to right, so
# "elementary" or "elem" matches first and the later alternatives never get a
# chance: the trailing " school" or "." is left behind in the result.
# Also note that in a raw string `\\.` means a literal backslash followed by
# any character, not a literal period.
elem_pattern = r"elementary|elem|elem\\.|elementary school"

# Apply the substitution to every school name.
new_schools = [re.sub(elem_pattern, "elemschool", school) 
               for school in elem_ex.schoolname]

# Store the first attempt next to the original name for comparison.
elem_ex['cleaned_name_try1'] = new_schools
elem_ex[["schoolname", "cleaned_name_try1"]]

Unnamed: 0,schoolname,cleaned_name_try1
0,stove prairie elementary school,stove prairie elemschool school
1,stewart county elementary school,stewart county elemschool school
2,desert springs elementary school,desert springs elemschool school
3,saunemin elem school,saunemin elemschool school
4,fifth district elementary,fifth district elemschool
5,paint branch elementary,paint branch elemschool
6,oak hill elem.,oak hill elemschool.
7,lewis and clark elem.,lewis and clark elemschool.
8,linden elementary school,linden elemschool school
9,winchester avenue elementary school,winchester avenue elemschool school


## A correct approach

Addresses issues above with `elementary school` and `elem.`

In [7]:
# Match the whole "elementary school" phrase in one go:
#   \belem(?:entary)?\b  -> the word "elem" or "elementary"; the word
#                           boundaries keep it from firing inside longer words
#   \.?                  -> optional abbreviation period ("elem.")
#   (?:\s+school\b)?     -> optional following word "school"
# Unlike a greedy "elem.*", the match stops at the end of the phrase, so any
# text that follows it in a school name is preserved.
elem_pattern_try2 = r"\belem(?:entary)?\b\.?(?:\s+school\b)?"

# Apply the substitution to every school name.
new_schools_try2 = [re.sub(elem_pattern_try2, "elemschool", school)
                    for school in elem_ex.schoolname]

# Store the second attempt and show it next to the first one.
elem_ex['cleaned_name_try2'] = new_schools_try2
elem_ex[["schoolname", "cleaned_name_try1", "cleaned_name_try2"]]

Unnamed: 0,schoolname,cleaned_name_try1,cleaned_name_try2
0,stove prairie elementary school,stove prairie elemschool school,stove prairie elemschool
1,stewart county elementary school,stewart county elemschool school,stewart county elemschool
2,desert springs elementary school,desert springs elemschool school,desert springs elemschool
3,saunemin elem school,saunemin elemschool school,saunemin elemschool
4,fifth district elementary,fifth district elemschool,fifth district elemschool
5,paint branch elementary,paint branch elemschool,paint branch elemschool
6,oak hill elem.,oak hill elemschool.,oak hill elemschool
7,lewis and clark elem.,lewis and clark elemschool.,lewis and clark elemschool
8,linden elementary school,linden elemschool school,linden elemschool
9,winchester avenue elementary school,winchester avenue elemschool school,winchester avenue elemschool


# 2. re.findall and re.search illustrations

**Task**: 

- Filter to `is_charter_exercise` == True; note that this contains a mix of schools with charter in the name and schools without
- Construct a pattern that, for charter schools, gets the school name prior to appearance of the word charter. School names without charter will not have matches (so Bethesda Charter becomes Bethesda; Bethesda High stays Bethesda High)


## re.findall 

In [11]:
## rows flagged for the charter exercise (independent copy of the slice)
charter_ex = schools_df.loc[schools_df.is_charter_exercise].copy()

## pattern groups: (1) text before " charter", (2) the word "charter",
## (3) optional whitespace after it, (4) optional next word
charter_pattern = r"(.*)\s+(charter)(\s+)?(\w+)?"

## one findall result per school name: a list of group tuples,
## empty when the pattern does not occur
test_charter_findall = list(
    map(lambda school: re.findall(charter_pattern, school),
        charter_ex.schoolname)
)

## display the result
test_charter_findall

[[],
 [('life source international', 'charter', '', '')],
 [],
 [("children's community", 'charter', '', '')],
 [],
 [('thomas edison', 'charter', ' ', 'academy')],
 [('moving everest', 'charter', ' ', 'school')],
 [],
 [],
 [('south valley academy', 'charter', ' ', 'school')],
 [('brighter choice', 'charter', ' ', 'school')],
 [('buffalo collegiate', 'charter', ' ', 'school')],
 [('neighborhood', 'charter', ' ', 'school')],
 [],
 [],
 []]

In [9]:
## show example of one: [1] is the second school (the first with a match),
## [0] is its only match tuple, and the last [0] is the first capture group
## (the text before "charter")
print(test_charter_findall[1][0][0])

life source international


## re.search

In [12]:
## one re.search result per school name: a Match object,
## or None when the pattern does not occur
test_charter_search = list(
    map(lambda school: re.search(charter_pattern, school),
        charter_ex.schoolname)
)

test_charter_search


[None,
 <re.Match object; span=(0, 33), match='life source international charter'>,
 None,
 <re.Match object; span=(0, 28), match="children's community charter">,
 None,
 <re.Match object; span=(0, 29), match='thomas edison charter academy'>,
 <re.Match object; span=(0, 29), match='moving everest charter school'>,
 None,
 None,
 <re.Match object; span=(0, 35), match='south valley academy charter school'>,
 <re.Match object; span=(0, 30), match='brighter choice charter school'>,
 <re.Match object; span=(0, 33), match='buffalo collegiate charter school'>,
 <re.Match object; span=(0, 27), match='neighborhood charter school'>,
 None,
 None,
 None]

In [14]:
## extract matches

### here, we're focusing on index 5 of the list: the sixth entry, which is
### the third non-None match (thomas edison charter academy)
### and we're getting the first group from that match
thomas_match = test_charter_search[5]
thomas_match

### example where we're just getting the first group
### (name of school before charter)
thomas_firstgroup = thomas_match.group(1)
thomas_firstgroup


<re.Match object; span=(0, 29), match='thomas edison charter academy'>

'thomas edison'

In [15]:
### walk from group 0 (the whole match) through the last capture group,
### printing a label line followed by the group's text
for i in range(1 + len(thomas_match.groups())):
    print(f"Group {i} is: ", thomas_match.group(i), sep="\n")

## see error if we go beyond actual number of 
## groups thomas_match.group(5)

Group 0 is: 
thomas edison charter academy
Group 1 is: 
thomas edison
Group 2 is: 
charter
Group 3 is: 
 
Group 4 is: 
academy


In [16]:
## can also extract the groups as a tuple
## example- want to return group 1 and group 2 and paste together
## note: .groups() holds only the capture groups (1, 2, ...); group 0,
## the whole match, is not part of the tuple
thomas_groups_all = thomas_match.groups()
thomas_groups_all

## slice the tuple: positions 0 and 1 are capture groups 1 and 2
thomas_groups_all[0:2]


('thomas edison', 'charter', ' ', 'academy')

('thomas edison', 'charter')

In [17]:
## can generalize to the full list with ifelse
def get_precharter_name(one_matchobj):
    """Return the first capture group of a search result, or "" if no match.

    Parameters
    ----------
    one_matchobj : re.Match or None
        Result of a regex search; a falsy value (e.g. None) means no match.

    Returns
    -------
    str
        ``one_matchobj.group(1)`` when a match object is given, otherwise
        the empty string.
    """
    # Guard clause: nothing matched, so there is no name to extract.
    if not one_matchobj:
        return ""
    return one_matchobj.group(1)

## pre-charter name for every search result ("" where there was no match)
all_charter_match = list(map(get_precharter_name, test_charter_search))

all_charter_match

['',
 'life source international',
 '',
 "children's community",
 '',
 'thomas edison',
 'moving everest',
 '',
 '',
 'south valley academy',
 'brighter choice',
 'buffalo collegiate',
 'neighborhood',
 '',
 '',
 '']

# 3. Practice for you 

- Filter the data to rows where `is_highschool_exercise` is True 
- You want to find the names of high schools. Try out some patterns to standardize the high school names (e.g., `high school` and `high` could both become `highschool`). Make sure, for instance, that for this high school name - `kings county office of education highland facility`- the pattern doesn't replace highland with high school. Don't worry about junior versus senior distinctions
- Then, using some example results, try writing a regex pattern and using re.match to get the name of the school that precedes the `highschool` part of the name (e.g., `new trier highschool` -> `new trier`)


### Standardizing high school name

In [18]:
# Keep only the rows flagged for the high-school exercise (independent copy).
hs_examples = schools_df[schools_df.is_highschool_exercise].copy()

# Look at the raw names before cleaning.
hs_examples.schoolname

# First attempt. `high(?!\w)` only matches "high" when no word character
# follows, which keeps "highland" untouched. Flaw: the " hs" alternative
# includes the leading space but the replacement has none, so "temple hs"
# comes out as "templehighschool".
hs_pat = r" hs|highschool|high\s+school|high(?!\w)"
hs_clean = [re.sub(hs_pat, "highschool", one_school)
           for one_school in hs_examples.schoolname]
hs_clean

# Second attempt. Every alternative starts with whitespace and the replacement
# puts a space back, so "temple hs" becomes "temple highschool".
# NOTE: `\shs.*` is greedy and consumes everything after " hs" up to the end
# of the name.
hs_pat2 = r"((\shigh\s|\shigh$|\shs.*)(\s+)?(school)?)"
hs_clean2 = [re.sub(hs_pat2, " highschool", one_school)
           for one_school in hs_examples.schoolname]
hs_clean2


26                             mount pleasant area jshs
27                                    huron high school
28                                  thomson high school
29    kings county office of education highland faci...
30                                     clovis east high
31                               camden jr. high school
32                                  jackson junior high
33                            emmett junior high school
34                                          atkins high
35                                lexington senior high
36                                            temple hs
37                              forest hill high school
38                                      pittsfield high
39                                 matanzas high school
40                                  pontiac high school
Name: schoolname, dtype: object

['mount pleasant area jshs',
 'huron highschool',
 'thomson highschool',
 'kings county office of education highland facility',
 'clovis east highschool',
 'camden jr. highschool',
 'jackson junior highschool',
 'emmett junior highschool',
 'atkins highschool',
 'lexington senior highschool',
 'templehighschool',
 'forest hill highschool',
 'pittsfield highschool',
 'matanzas highschool',
 'pontiac highschool']

['mount pleasant area jshs',
 'huron highschool',
 'thomson highschool',
 'kings county office of education highland facility',
 'clovis east highschool',
 'camden jr. highschool',
 'jackson junior highschool',
 'emmett junior highschool',
 'atkins highschool',
 'lexington senior highschool',
 'temple highschool',
 'forest hill highschool',
 'pittsfield highschool',
 'matanzas highschool',
 'pontiac highschool']

<class 'list'>


In [24]:
# for now, ignoring jr/senior distinction
## and matching on high school, high, and hs

## to avoid matching things like highland, "high" or "hs" must be followed
## by whitespace or the end of the string. This is written as a lookahead,
## (?=\s|$), so that:
##   - the check is mandatory for both "high" and "hs" (an optional check
##     would let " hs" match the start of a longer word), and
##   - the following space is not consumed (consuming it would glue the next
##     word directly onto "highschool").
## An immediately following whole word "school" is folded into the match.
## The leading \s is consumed, so the replacement puts a space back.
hs_sub_pattern = r"\s(?:high|hs)(?=\s|$)(?:\s+school(?=\s|$))?"
test_pat_examples = [re.sub(hs_sub_pattern, " highschool", example)
                     for example in hs_examples.schoolname]

test_pat_examples

['mount pleasant area jshs',
 'huron highschool',
 'thomson highschool',
 'kings county office of education highland facility',
 'clovis east highschool',
 'camden jr. highschool',
 'jackson junior highschool',
 'emmett junior highschool',
 'atkins highschool',
 'lexington senior highschool',
 'temple highschool',
 'forest hill highschool',
 'pittsfield highschool',
 'matanzas highschool',
 'pontiac highschool']

In [25]:
### clean every school name in the frame and keep the result as a new column
### each value is passed through str() first because the column is object dtype
hs_clean_all = list(
    map(lambda oneschool: re.sub(hs_sub_pattern, " highschool", str(oneschool)),
        hs_examples.schoolname.to_list())
)


### assign as col
hs_examples['school_cleanhs'] = hs_clean_all

hs_examples[["schoolname", "school_cleanhs"]]

Unnamed: 0,schoolname,school_cleanhs
26,mount pleasant area jshs,mount pleasant area jshs
27,huron high school,huron highschool
28,thomson high school,thomson highschool
29,kings county office of education highland faci...,kings county office of education highland faci...
30,clovis east high,clovis east highschool
31,camden jr. high school,camden jr. highschool
32,jackson junior high,jackson junior highschool
33,emmett junior high school,emmett junior highschool
34,atkins high,atkins highschool
35,lexington senior high,lexington senior highschool


### With some examples, pulling out name of school before high

In [26]:
## using the test_pat_examples and want to get things like huron, thomson, clovis east
## group 1 captures everything before the whitespace that precedes "highschool"
prehs_pattern = r"(.*)\s+(highschool)(\s+)?(\w+)?"

## one search result per cleaned name (None when "highschool" is absent)
schoolname_preh_matchobj = list(
    map(lambda school: re.search(prehs_pattern, school), test_pat_examples)
)

schoolname_preh_matchobj

## first group when there is a match; empty string otherwise
schoolname_preh = list(
    map(lambda obj: obj.group(1) if obj else "", schoolname_preh_matchobj)
)
schoolname_preh

[None,
 <re.Match object; span=(0, 16), match='huron highschool'>,
 <re.Match object; span=(0, 18), match='thomson highschool'>,
 None,
 <re.Match object; span=(0, 22), match='clovis east highschool'>,
 <re.Match object; span=(0, 21), match='camden jr. highschool'>,
 <re.Match object; span=(0, 25), match='jackson junior highschool'>,
 <re.Match object; span=(0, 24), match='emmett junior highschool'>,
 <re.Match object; span=(0, 17), match='atkins highschool'>,
 <re.Match object; span=(0, 27), match='lexington senior highschool'>,
 <re.Match object; span=(0, 17), match='temple highschool'>,
 <re.Match object; span=(0, 22), match='forest hill highschool'>,
 <re.Match object; span=(0, 21), match='pittsfield highschool'>,
 <re.Match object; span=(0, 19), match='matanzas highschool'>,
 <re.Match object; span=(0, 18), match='pontiac highschool'>]

['',
 'huron',
 'thomson',
 '',
 'clovis east',
 'camden jr.',
 'jackson junior',
 'emmett junior',
 'atkins',
 'lexington senior',
 'temple',
 'forest hill',
 'pittsfield',
 'matanzas',
 'pontiac']