In [127]:
import pandas as pd
import re
import numpy as np

## show the value of every expression statement in a cell,
## not just the last one (lets one cell display multiple things)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load data and show examples

In [4]:
## CEP database export (Excel workbook hosted on frac.org), read over HTTP
cep_url = "https://frac.org/wp-content/uploads/2021SY-CEP_Database_Export.xlsx"
cep_optin = pd.read_excel(cep_url)


['state',
 'leaidwhereavailable',
 'leaname',
 'schoolidwhereavailable',
 'schoolname',
 'individualispjune2020',
 'proxyispoctober2019',
 'claimingispifparticipating',
 'participatingincepsy2021',
 'studentenrollmentwhereavailable',
 'unnamed10']

In [66]:
## clean colnames: lowercase each header and drop every
## character that is not a letter or digit
new_colnames = [re.sub("[^A-Za-z0-9]+", "", col.lower()) for col in cep_optin.columns]

## add back
cep_optin.columns = new_colnames

cep_optin.head()

## lowercase copy of the school name so the patterns below
## do not need to handle capitalization
cep_optin['schoolname_lower'] = cep_optin.schoolname.str.lower()

## check whether name of school contains
## space followed by elem*
## (raw string: in a normal string literal "\s" is an invalid Python
## escape sequence and triggers a SyntaxWarning on recent Python versions)
cep_optin['is_elem'] = np.where(cep_optin.schoolname_lower.str.contains(r"\s+elem", regex = True),
                True, False)


## draw a reproducible sample of 30 elementary-looking school names
## NOTE(review): `&` binds tighter than `|`, so `is_elem` only gates the
## `elem\.` test; the remaining conditions are OR-ed in on their own.
## Left as-is so the sample (and the outputs shown below) stay the same.
test_schools = cep_optin.loc[(cep_optin.is_elem) &
               (cep_optin.schoolname_lower.str.contains("elem\\.")) |
               (cep_optin.schoolname_lower.str.contains("elem")) |
               (cep_optin.schoolname_lower.str.contains("elementary")) |
               (cep_optin.schoolname_lower.str.contains("esd")),
            'schoolname_lower'].sample(n = 30, random_state = 470)

## keep ten of the sampled names to use as running examples
test_schools_show = test_schools.iloc[13:23]

test_schools_show

33114                paint branch elementary
21654       stewart county elementary school
15127        stove prairie elementary school
75308    winchester avenue elementary school
41410                         oak hill elem.
41527                  lewis and clark elem.
25420                   saunemin elem school
22656       desert springs elementary school
32269              fifth district elementary
59726               linden elementary school
Name: schoolname_lower, dtype: object

# Re.sub illustrations

**Task**: in the `schoolname` field, replace the different ways of writing "elementary school" (e.g., `elementary`, `elem.`, `elem school`) with the single token `elemschool`

## Incorrect approach 

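This pattern returns incorrect results, as the output below shows: `elementary school` becomes `elemschool school` and `elem.` becomes `elemschool.`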

In [140]:
## Deliberately naive pattern: alternatives are tried left to right, so
## `elementary` / `elem` match before `elementary school` is ever considered,
## leaving a trailing " school" or "." behind in the cleaned name
elem_pattern = r"elementary|elem|elem\\.|elementary school"

naive_regex = re.compile(elem_pattern)
new_schools = [naive_regex.sub("elemschool", name) for name in test_schools_show]

## original and cleaned names side by side
old_and_new = pd.DataFrame({'orig_name': test_schools_show,
                           'cleaned_name': new_schools})

#print(old_and_new.to_latex(index = False))
old_and_new

Unnamed: 0,orig_name,cleaned_name
33114,paint branch elementary,paint branch elemschool
21654,stewart county elementary school,stewart county elemschool school
15127,stove prairie elementary school,stove prairie elemschool school
75308,winchester avenue elementary school,winchester avenue elemschool school
41410,oak hill elem.,oak hill elemschool.
41527,lewis and clark elem.,lewis and clark elemschool.
25420,saunemin elem school,saunemin elemschool school
22656,desert springs elementary school,desert springs elemschool school
32269,fifth district elementary,fifth district elemschool
59726,linden elementary school,linden elemschool school


## Correct approach

Addresses issues above with `elementary school` and `elem.`

In [141]:
## Match "elem", "elem." or "elementary" as a whole word, optionally followed
## by whitespace and the word "school".
## A greedy `elem.*` would swallow everything after "elem" up to the end of
## the string (dropping any words that follow) and would also fire inside
## longer words; the word boundaries (\b) and explicit alternatives avoid both.
elem_pattern_try2 = r"\belem(?:entary)?\b\.?(?:\s+school\b)?"

new_schools_try2 = [re.sub(elem_pattern_try2, "elemschool", school)
                   for school in test_schools_show]


## original and cleaned names side by side
old_and_new_try2 = pd.DataFrame({'orig_name': test_schools_show,
                           'cleaned_name': new_schools_try2})

#print(old_and_new_try2.to_latex(index = False))
old_and_new_try2

Unnamed: 0,orig_name,cleaned_name
33114,paint branch elementary,paint branch elemschool
21654,stewart county elementary school,stewart county elemschool
15127,stove prairie elementary school,stove prairie elemschool
75308,winchester avenue elementary school,winchester avenue elemschool
41410,oak hill elem.,oak hill elemschool
41527,lewis and clark elem.,lewis and clark elemschool
25420,saunemin elem school,saunemin elemschool
22656,desert springs elementary school,desert springs elemschool
32269,fifth district elementary,fifth district elemschool
59726,linden elementary school,linden elemschool


# re.findall and re.search illustrations

**Task**: we want to create a pattern that, for charter schools, lets us extract the part of the school name that appears before the word `charter`. School names that do not contain `charter` will have no match

## re.findall 

In [98]:

## toy strings: lowercase letters followed by one or more digits
test_patterns = [
    "rebeccajohnson8",
    "rebeccajohnson88",
    "rebeccajohnson796",
]

## findall returns a list of matches per string; keep the first one
letters_then_digits = re.compile(r"[a-z]+\d+")
[letters_then_digits.findall(handle)[0] for handle in test_patterns]

['rebeccajohnson8', 'rebeccajohnson88', 'rebeccajohnson796']

In [139]:
## flag rows whose lowercase name (cast to str) contains "charter"
mentions_charter = cep_optin.schoolname_lower.astype(str).str.contains("charter")

## pull some charter examples and other examples
## (8 of each, with a fixed seed so the sample is reproducible)
charter_examples = (cep_optin.schoolname_lower[mentions_charter]
                    .sample(n = 8, random_state = 422)
                    .to_list())
other_examples = (cep_optin.schoolname_lower[~mentions_charter]
                  .sample(n = 8, random_state = 422)
                  .to_list())


## charter names first, then the non-charter names
combined_examples = charter_examples + other_examples
combined_examples


['buffalo collegiate charter school',
 'thomas edison charter academy                                   ',
 'moving everest charter school',
 'life source international charter',
 'south valley academy charter school',
 'neighborhood charter school of harle',
 'brighter choice charter school-girls',
 "children's community charter",
 'frontier elementary school',
 'columbus humanities, arts and technology academy',
 'okemos public montessori-central',
 'pawhuska es',
 'east valley senior high',
 'glenpool es',
 'number 27',
 'south fork elementary']

In [120]:
## charter pattern, four capture groups:
##   1. everything before "charter" (the part of the name we want)
##   2. the word "charter"
##   3. optional whitespace after it
##   4. optional next word
charter_pattern = r"(.*)\s+(charter)(\s+)?(\w+)?"

## findall: one list per school; each match is a tuple with the four groups,
## and schools without "charter" get an empty list
charter_regex = re.compile(charter_pattern)
test_charter_findall = list(map(charter_regex.findall, combined_examples))

## print result
test_charter_findall



[[('buffalo collegiate', 'charter', ' ', 'school')],
 [('thomas edison', 'charter', ' ', 'academy')],
 [('moving everest', 'charter', ' ', 'school')],
 [('life source international', 'charter', '', '')],
 [('south valley academy', 'charter', ' ', 'school')],
 [('neighborhood', 'charter', ' ', 'school')],
 [('brighter choice', 'charter', ' ', 'school')],
 [("children's community", 'charter', '', '')],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [118]:
## show example of one:
## first school -> its first match tuple -> first group of that tuple
first_school_matches = test_charter_findall[0]
print(first_school_matches[0][0])

buffalo collegiate


## re.search

In [121]:
## get matches: search gives a Match object where the pattern is found
## and None where it is not
search_charter = re.compile(charter_pattern).search
test_charter_search = list(map(search_charter, combined_examples))

test_charter_search


[<re.Match object; span=(0, 33), match='buffalo collegiate charter school'>,
 <re.Match object; span=(0, 29), match='thomas edison charter academy'>,
 <re.Match object; span=(0, 29), match='moving everest charter school'>,
 <re.Match object; span=(0, 33), match='life source international charter'>,
 <re.Match object; span=(0, 35), match='south valley academy charter school'>,
 <re.Match object; span=(0, 27), match='neighborhood charter school'>,
 <re.Match object; span=(0, 30), match='brighter choice charter school'>,
 <re.Match object; span=(0, 28), match="children's community charter">,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [137]:
## extract matches

### index 1 is the second search result (thomas edison charter academy);
### keep that Match object so its groups can be inspected
thomas_match = test_charter_search[1]
thomas_match

### group(1) returns the first capture group of the match
### (name of school before charter)
thomas_firstgroup = thomas_match.group(1)
thomas_firstgroup


<re.Match object; span=(0, 29), match='thomas edison charter academy'>

'thomas edison'

In [136]:
### walk from group 0 (the whole match) through the last capture group
n_groups = len(thomas_match.groups())
for i in range(n_groups + 1):
    print(f"Group {i} is: ")
    print(thomas_match.group(i))

## asking for an index beyond the last group raises an error,
## e.g. thomas_match.group(5)

Group 0 is: 
thomas edison charter academy
Group 1 is: 
thomas edison
Group 2 is: 
charter
Group 3 is: 
 
Group 4 is: 
academy


In [138]:
## can generalize to the full list with ifelse
def get_precharter_name(one_matchobj):
    """Return the first capture group of a match, or "" when there is no match.

    Parameters
    ----------
    one_matchobj : re.Match or None
        Result of a regex search on one school name.

    Returns
    -------
    str
        Text of group 1 if a match object was passed, otherwise the empty string.
    """
    # a failed search yields None, which is falsy
    if not one_matchobj:
        return ""
    return one_matchobj.group(1)

## apply the helper to every search result, in order
all_charter_match = list(map(get_precharter_name, test_charter_search))

all_charter_match

['buffalo collegiate',
 'thomas edison',
 'moving everest',
 'life source international',
 'south valley academy',
 'neighborhood',
 'brighter choice',
 "children's community",
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

# Group activity

- Return to the full list of school names in the original data
- You want to find the names of high schools. Try out some patterns to standardize the high school names (e.g., `high school` and `high` could both become `highschool`)
- Then, using some example results, try writing a regex pattern and using re.match to get the name of the school that precedes the `highschool` part of the name (e.g., `new trier highschool` -> `new trier`)

