# Imports

In [1]:
## imports
import pandas as pd
import re
import numpy as np

## print multiple things from same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load data and show examples

In [2]:
# Read the school-level table (path is relative to this notebook's folder)
# and preview the first five rows.
schools_df = pd.read_csv(filepath_or_buffer="../public_data/schools_df.csv")
schools_df.head(n=5)

Unnamed: 0,schoolname,individualispjune2020,participatingincepsy2021,is_elem_exercise,is_charter_exercise,is_highschool_exercise
0,stove prairie elementary school,0.0,N,True,False,False
1,stewart county elementary school,0.7603,Y,True,False,False
2,desert springs elementary school,,N,True,False,False
3,saunemin elem school,0.3893999999999999,N,True,False,False
4,fifth district elementary,0.0275,N,True,False,False


# re.sub illustration

**Task**: 

- Use the `schools_df` dataset and filter to `is_elem_exercise` == True
- Using the `schoolname` field, replace the different varieties of elementary school in the data with `elemschool` 

## Incorrect approach 

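This pattern returns incorrect results, as the output below shows: regex alternation tries the options from left to right, so the shorter `elementary` and `elem` alternatives match before `elementary school` and `elem\.` are ever tried, leaving a stray ` school` or `.` behind.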

In [3]:
# Keep only the rows flagged for the elementary-school exercise. The .copy()
# makes elem_ex an independent frame, so columns added to it later do not
# touch schools_df.
is_elem_row = schools_df["is_elem_exercise"]
elem_ex = schools_df.loc[is_elem_row].copy()
elem_ex.head()

Unnamed: 0,schoolname,individualispjune2020,participatingincepsy2021,is_elem_exercise,is_charter_exercise,is_highschool_exercise
0,stove prairie elementary school,0.0,N,True,False,False
1,stewart county elementary school,0.7603,Y,True,False,False
2,desert springs elementary school,,N,True,False,False
3,saunemin elem school,0.3893999999999999,N,True,False,False
4,fifth district elementary,0.0275,N,True,False,False


In [None]:
# Deliberately flawed pattern (kept as the teaching example): alternatives are
# tried left to right, so `elementary` / `elem` win before `elementary school`
# / `elem\.` are reached, leaving a stray " school" or "." in the result.
elem_pattern = r"elementary|elem|elem\.|elementary school"

elem_regex = re.compile(elem_pattern)
new_schools = [elem_regex.sub("elemschool", name)
               for name in elem_ex["schoolname"]]

# Store the first attempt next to the original name for side-by-side comparison.
elem_ex['cleaned_name_try1'] = new_schools
elem_ex[["schoolname", "cleaned_name_try1"]]

## A correct approach

Addresses issues above with `elementary school` and `elem.`

In [None]:
# Match the elementary-school token itself rather than using a greedy `elem.*`,
# which swallows everything after "elem" up to the end of the string
# (e.g. "x elementary school annex" would collapse to "x elemschool").
#   \belem(?:entary)?\b  -> "elem" or "elementary" as a whole word
#   \.?                  -> optional period of the abbreviation "elem."
#   (?:\s+school\b)?     -> optional following word "school"
elem_pattern_try2 = r"\belem(?:entary)?\b\.?(?:\s+school\b)?"

new_schools_try2 = [re.sub(elem_pattern_try2, "elemschool", school)
                    for school in elem_ex.schoolname]

# Show both attempts next to the original name for comparison.
elem_ex['cleaned_name_try2'] = new_schools_try2
elem_ex[["schoolname", "cleaned_name_try1", "cleaned_name_try2"]]

# re.findall and re.search illustrations

**Task**: 

- Filter to `is_charter_exercise` == True; note that this contains a mix of schools with charter in the name and schools without
- Construct a pattern that, for charter schools, gets the school name prior to appearance of the word charter. School names without charter will not have matches (so Hanover Charter becomes Hanover; Hanover High stays Hanover High)


## re.findall 

In [None]:
## filter to charter exercise (copy so later edits do not touch schools_df)
charter_mask = schools_df["is_charter_exercise"]
charter_ex = schools_df.loc[charter_mask].copy()
charter_ex.head(n=6)

In [None]:
## charter pattern
##   group 1: everything before "charter"
##   group 2: the word "charter"
##   group 3: optional whitespace after it
##   group 4: optional word that follows
charter_pattern = r"(.*)\s+(charter)(\s+)?(\w+)?"

## findall: one list per school, holding a tuple of groups for each match
## (an empty list when the pattern does not match)
charter_regex = re.compile(charter_pattern)
test_charter_findall = [charter_regex.findall(name)
                        for name in charter_ex["schoolname"]]

## print result
test_charter_findall

In [None]:
## show example of one: second school -> its first match -> first group
second_school_matches = test_charter_findall[1]
first_match_groups = second_school_matches[0]
print(first_match_groups[0])

## re.search

In [None]:
## get matches: one Match object per school, or None when the pattern does not match
search_charter = re.compile(charter_pattern).search
test_charter_search = [search_charter(name)
                       for name in charter_ex["schoolname"]]

test_charter_search


In [None]:
## extract matches

### pick the 6th entry of the search results (index 5), which is the 3rd
### actual match (thomas edison charter academy)
thomas_match = test_charter_search[5]
thomas_match

### pull only the first group from that match
### (name of school before charter); match[1] is equivalent to match.group(1)
thomas_firstgroup = thomas_match[1]
thomas_firstgroup


In [None]:
### iterate over all groups and print
### (group 0 is the whole match; groups 1..n are the parenthesized captures)
n_groups = thomas_match.re.groups
for group_idx in range(n_groups + 1):
    print(f"Group {group_idx} is: ")
    print(thomas_match.group(group_idx))

## asking for a group beyond the number defined in the pattern raises an
## error, e.g. thomas_match.group(5)

In [None]:
## the captured groups are also available as one tuple
## example: we want group 1 and group 2 so we can paste them together
thomas_groups_all = thomas_match.groups()
thomas_groups_all

## slice the tuple (first two groups)
thomas_groups_all[:2]


In [None]:
## generalize to the full list with an if/else that handles schools without a match
def get_precharter_name(one_matchobj):
    """Return group 1 of a match object, or "" when there is no match.

    Parameters
    ----------
    one_matchobj : re.Match or None
        Result of a regex search; None (or any falsy value) means no match.

    Returns
    -------
    str
        The text captured by the first group, or an empty string.
    """
    # No match: nothing to extract
    if not one_matchobj:
        return ""

    return one_matchobj.group(1)

# Apply the helper to every search result; non-matching schools become "".
all_charter_match = list(map(get_precharter_name, test_charter_search))

all_charter_match

# Group activity

## Part 1: Subsetting
Filter the data to only those rows where `is_highschool_exercise` is True.

In [16]:
# Keep only the rows flagged for the high-school exercise (as an independent copy).
hs_mask = schools_df["is_highschool_exercise"]
high_ex = schools_df.loc[hs_mask].copy()
high_ex.head()

# variants seen in the names: hs, high, high school
# name to watch out for: "kings county office of education highland faci..."

Unnamed: 0,schoolname,individualispjune2020,participatingincepsy2021,is_elem_exercise,is_charter_exercise,is_highschool_exercise
26,mount pleasant area jshs,,N,False,False,True
27,huron high school,0.2867,N,False,False,True
28,thomson high school,0.4065,Y,False,False,True
29,kings county office of education highland faci...,,N,False,False,True
30,clovis east high,0.2634956587391468,N,False,False,True


## Part 2: Standardizing names
To find the names of high schools, try out some regex patterns that standardize the high school names (e.g., 'high school' and 'high' could both become 'highschool'). In other words, every variant should end up as 'highschool'.

**Hint:** Look at the school names for hints on what to avoid matching--e.g., 'highland facility'. To avoid things like this, after 'high' or 'hs', have your pattern look for a space (`\s`) or the end of the string (`$`). 

In [56]:
# Standardize every high-school variant to the single token "highschool".
#   \b                  -> start on a word boundary, so "high"/"hs" inside a
#                          longer word (e.g. "lehigh") is left alone
#   high(?:\s+school)?  -> "high" or "high school"
#   (?:js)?hs           -> "hs" or "jshs"
#   (?=\s|$)            -> must be followed by whitespace or end of string;
#                          a lookahead checks this WITHOUT consuming the space,
#                          so "highland" is untouched and "high point" becomes
#                          "highschool point" rather than "highschoolpoint"
hs_sub_pattern = r"\b(?:high(?:\s+school)?|(?:js)?hs)(?=\s|$)"

new_schools = [re.sub(hs_sub_pattern, "highschool", school)
               for school in high_ex.schoolname]

high_ex["hs_cleaned_name"] = new_schools

# Compare the original and standardized names side by side.
high_ex[["schoolname", "hs_cleaned_name"]]


Unnamed: 0,schoolname,hs_cleaned_name
26,mount pleasant area jshs,mount pleasant area highschool
27,huron high school,huron highschool
28,thomson high school,thomson highschool
29,kings county office of education highland faci...,kings county office of education highland faci...
30,clovis east high,clovis east highschool
31,camden jr. high school,camden jr. highschool
32,jackson junior high,jackson junior highschool
33,emmett junior high school,emmett junior highschool
34,atkins high,atkins highschool
35,lexington senior high,lexington senior highschool


In [None]:
# your code here to standardize high school names

## Part 3: Match schools
Using some example results, try writing a regex pattern and using `re.match` to get the name of the school that precedes the 'highschool' part of the name (e.g., 'new trier highschool' -> 'new trier')

In [57]:
# Capture the school name that precedes the standardized "highschool" token.
#   (.*)          -> group 1: everything before the token
#   \s+           -> whitespace separating the name from the token
#   (highschool)  -> group 2: the token itself
hs_before_pattern = r"(.*)\s+(highschool)"

# re.match returns None for names without a "highschool" token (e.g. the
# "highland faci..." row), so keep the match objects in a plain list and
# guard against None before reading group 1.
before_hs_names = [re.match(hs_before_pattern, one_str)
                   for one_str in high_ex.hs_cleaned_name]
before_hs_list = [one_match.group(1) if one_match else ""
                  for one_match in before_hs_names]

high_ex["before_highschool_name"] = before_hs_list
high_ex[["hs_cleaned_name", "before_highschool_name"]].head()

In [None]:
# NOTE(review): prehs_pattern and test_pat_examples were not defined anywhere
# visible in this notebook, so they are defined here to let the cell run on a
# fresh kernel -- adjust if they are meant to come from elsewhere.

# Group 1 is the school name that precedes the standardized "highschool" token.
prehs_pattern = r"(.*)\s+highschool"

# A few standardized names, plus one without "highschool" to show the no-match case.
test_pat_examples = ["new trier highschool",
                     "huron highschool",
                     "clovis east highschool",
                     "highland facility"]

# re.match gives a Match object, or None when the pattern does not match.
schoolname_preh_matchobj = [re.match(prehs_pattern, school)
                            for school in test_pat_examples]
schoolname_preh_matchobj

# Pull group 1 from each match; fall back to "" when there was no match.
schoolname_preh = [obj.group(1) if obj else ""
                   for obj in schoolname_preh_matchobj]
schoolname_preh