In [1]:
import pandas as pd
import re
import numpy as np

## print multiple things from same cell:
## with ast_node_interactivity set to "all", every bare expression in a cell
## is displayed, not only the last one (later cells rely on this)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load data and show examples

In [2]:
## read the 2021SY CEP database export directly from the frac.org URL
## (requires network access; presumably also an Excel engine such as
## openpyxl for pandas -- TODO confirm in the environment)
cep_optin = pd.read_excel("https://frac.org/wp-content/uploads/2021SY-CEP_Database_Export.xlsx")


In [3]:
## clean colnames: lowercase and drop every character that is not a letter or digit
new_colnames = [re.sub(r"[^A-Za-z0-9]+", "", col.lower()) for col in cep_optin.columns]

## add back
cep_optin.columns = new_colnames

cep_optin.head()

cep_optin['schoolname_lower'] = cep_optin.schoolname.str.lower()

## check whether name of school contains
## whitespace followed by elem*
## - raw string, so \s reaches the regex engine without Python's
##   invalid-escape-sequence warning
## - na = False, so a missing school name is flagged False instead of
##   propagating NaN (which the old np.where(..., True, False) turned into True)
cep_optin['is_elem'] = cep_optin.schoolname_lower.str.contains(r"\s+elem", regex = True, na = False)

## build the mask used to sample example names.
## NOTE: & binds tighter than |, so is_elem only constrains the first
## (elem\.) condition; the parentheses below spell out that grouping
## without changing which rows are selected.
## na = False keeps every piece of the mask strictly boolean
## (assumes missing names should never be selected -- TODO confirm)
elem_like_mask = ((cep_optin.is_elem &
                   cep_optin.schoolname_lower.str.contains(r"elem\.", na = False)) |
                  cep_optin.schoolname_lower.str.contains("elem", na = False) |
                  cep_optin.schoolname_lower.str.contains("elementary", na = False) |
                  cep_optin.schoolname_lower.str.contains("esd", na = False))

## fixed random_state so the same 30 names are drawn on every run
test_schools = cep_optin.loc[elem_like_mask,
            'schoolname_lower'].sample(n = 30, random_state = 470)

## keep 10 of the sampled names to display and to test patterns on
test_schools_show = test_schools.iloc[13:23]

test_schools_show

Unnamed: 0,state,leaidwhereavailable,leaname,schoolidwhereavailable,schoolname,individualispjune2020,proxyispoctober2019,claimingispifparticipating,participatingincepsy2021,studentenrollmentwhereavailable,unnamed10
0,Alabama,105-0000,Anniston City Board of Education,105-0010,Anniston High School,0.7873,,,N,457,
1,Alabama,105-0000,Anniston City Board of Education,105-0015,Anniston Middle School,0.7873,,,N,340,
2,Alabama,105-0000,Anniston City Board of Education,105-0025,Cobb Pre-K Academy,0.7873,,,N,139,
3,Alabama,105-0000,Anniston City Board of Education,105-0060,Golden Springs Elem School,0.7873,,,N,371,
4,Alabama,105-0000,Anniston City Board of Education,105-0110,Randolph Park Elem School,0.7873,,,N,320,


33114                paint branch elementary
21654       stewart county elementary school
15127        stove prairie elementary school
75308    winchester avenue elementary school
41410                         oak hill elem.
41527                  lewis and clark elem.
25420                   saunemin elem school
22656       desert springs elementary school
32269              fifth district elementary
59726               linden elementary school
Name: schoolname_lower, dtype: object

# Re.sub illustrations

**Task**: for the `schoolname` field, replace the different varieties of elementary school with `elemschool` in the field

## Incorrect approach 

Returns incorrect results that we'll see below

In [4]:
## deliberately flawed pattern (this cell shows the incorrect approach):
## - alternation is tried left to right, so "elementary" wins before
##   "elementary school" is ever considered
## - inside a raw string, \\. means a literal backslash followed by any
##   character, so that branch does not match "elem."
elem_pattern = r"elementary|elem|elem\\.|elementary school"

## compile once, then substitute in every example name
elem_regex = re.compile(elem_pattern)
new_schools = [elem_regex.sub("elemschool", name) for name in test_schools_show]

## original and cleaned names side by side
name_columns = {'orig_name': test_schools_show,
                'cleaned_name': new_schools}
old_and_new = pd.DataFrame(name_columns)

#print(old_and_new.to_latex(index = False))
old_and_new

Unnamed: 0,orig_name,cleaned_name
33114,paint branch elementary,paint branch elemschool
21654,stewart county elementary school,stewart county elemschool school
15127,stove prairie elementary school,stove prairie elemschool school
75308,winchester avenue elementary school,winchester avenue elemschool school
41410,oak hill elem.,oak hill elemschool.
41527,lewis and clark elem.,lewis and clark elemschool.
25420,saunemin elem school,saunemin elemschool school
22656,desert springs elementary school,desert springs elemschool school
32269,fifth district elementary,fifth district elemschool
59726,linden elementary school,linden elemschool school


### Question in class: would it work to change order of OR statement?


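Answer: it gets closer (e.g., stewart county and stove prairie are fixed!), but we still have an issue with names containing `elem.` (the period is left behind) and `elem school` (the trailing `school` is left behind). Note that inside a raw string, `elem\\.` means `elem` + a literal backslash + any character rather than `elem` + a period, which is why the period survives.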

In [6]:
## same alternatives as before, longest first.
## NOTE: in a raw string \\. is a literal backslash followed by any character,
## so the third branch does not match "elem." and those names fall through
## to the plain "elem" branch (leaving the period behind)
elem_pattern_difforder = r"elementary school|elementary|elem\\.|elem"

difforder_regex = re.compile(elem_pattern_difforder)
new_schools_difforder = [difforder_regex.sub("elemschool", name) for name in test_schools_show]

new_schools_difforder

['paint branch elemschool',
 'stewart county elemschool',
 'stove prairie elemschool',
 'winchester avenue elemschool',
 'oak hill elemschool.',
 'lewis and clark elemschool.',
 'saunemin elemschool school',
 'desert springs elemschool',
 'fifth district elemschool',
 'linden elemschool']

## A correct approach

Addresses issues above with `elementary school` and `elem.`

In [23]:
## "elem" or "elementary" as a whole word (\b on both sides, so words such as
## "element" are left alone), then an optional period, then an optional
## following "school".
## Unlike a greedy elem.*, this stops at the end of the phrase, so any text
## after it (e.g. "... elementary school of the arts") is kept
elem_pattern_try2 = r"\belem(?:entary)?\b\.?(?:\s+school)?"
    
## run the second-attempt pattern over the same example names
try2_regex = re.compile(elem_pattern_try2)
new_schools_try2 = [try2_regex.sub("elemschool", name)
                    for name in test_schools_show]

## original and cleaned names side by side
try2_columns = {'orig_name': test_schools_show,
                'cleaned_name': new_schools_try2}
old_and_new_try2 = pd.DataFrame(try2_columns)

#print(old_and_new_try2.to_latex(index = False))
old_and_new_try2

Unnamed: 0,orig_name,cleaned_name
33114,paint branch elementary,paint branch elemschool
21654,stewart county elementary school,stewart county elemschool
15127,stove prairie elementary school,stove prairie elemschool
75308,winchester avenue elementary school,winchester avenue elemschool
41410,oak hill elem.,oak hill elemschool
41527,lewis and clark elem.,lewis and clark elemschool
25420,saunemin elem school,saunemin elemschool
22656,desert springs elementary school,desert springs elemschool
32269,fifth district elementary,fifth district elemschool
59726,linden elementary school,linden elemschool


## Question from class - how do we tell re.something to ignore the case?

Answer: optional argument inside re: `flags = re.IGNORECASE` to ignore the case

In [29]:
## sample names in their original capitalization
orig_case_schools = cep_optin.schoolname.sample(n = 10, random_state = 54)

orig_case_schools

## same pattern as before, compiled with re.IGNORECASE
try2_anycase_regex = re.compile(elem_pattern_try2, flags=re.IGNORECASE)
orig_case_schools_sub = [try2_anycase_regex.sub("elemschool", name)
                         for name in orig_case_schools]

## matches things like Elementary regardless of capitalization;
## the rest of the name keeps its original case, only the matched
## part is replaced
orig_case_schools_sub

## example also shows we may want to modify pattern to capture things like El

44664                      Madison Ave
51821            Jackson Co Sch of Alt
64908                     Oak Grove El
51553              Bessemer Elementary
27008           St. Joan of Arc School
26039          Clarksville High School
5334            WEST STREET ELEMENTARY
2163     Queen Creek Elementary School
49202           P.S. 8 Shirlee Solomon
29013                 Troy High School
Name: schoolname, dtype: object

['Madison Ave',
 'Jackson Co Sch of Alt',
 'Oak Grove El',
 'Bessemer elemschool',
 'St. Joan of Arc School',
 'Clarksville High School',
 'WEST STREET elemschool',
 'Queen Creek elemschool',
 'P.S. 8 Shirlee Solomon',
 'Troy High School']

# re.findall and re.search illustrations

**Task**: want to create pattern that, for charter schools, allows us to extract the school name prior to the appearance of charter. School names without charter will not have matches

## re.findall 

In [9]:

test_patterns = ["rebeccajohnson8", "rebeccajohnson88", "rebeccajohnson796"]

## lowercase letters followed by one or more digits;
## findall returns a list of matches, keep the first one per string
letters_then_digits = re.compile(r"[a-z]+\d+")
[letters_then_digits.findall(candidate)[0] for candidate in test_patterns]

['rebeccajohnson8', 'rebeccajohnson88', 'rebeccajohnson796']

In [10]:
## pull some charter examples and other examples
## (names are cast to str before searching for "charter")
lower_names = cep_optin.schoolname_lower
mentions_charter = lower_names.astype(str).str.contains("charter")

## 8 names with "charter" and 8 without, same seed for both draws
charter_examples = lower_names[mentions_charter].sample(n = 8, random_state = 422).to_list()
other_examples = lower_names[~mentions_charter].sample(n = 8, random_state = 422).to_list()

## charter names first, then the others
combined_examples = charter_examples + other_examples
combined_examples


['buffalo collegiate charter school',
 'thomas edison charter academy                                   ',
 'moving everest charter school',
 'life source international charter',
 'south valley academy charter school',
 'neighborhood charter school of harle',
 'brighter choice charter school-girls',
 "children's community charter",
 'frontier elementary school',
 'columbus humanities, arts and technology academy',
 'okemos public montessori-central',
 'pawhuska es',
 'east valley senior high',
 'glenpool es',
 'number 27',
 'south fork elementary']

In [11]:
## charter pattern, four groups:
## 1. everything before the whitespace that precedes "charter"
## 2. the word charter
## 3. optional whitespace after it
## 4. optional next word
charter_pattern = r"(.*)\s+(charter)(\s+)?(\w+)?"

## findall: one list of group tuples per name (empty list when no match)
charter_regex = re.compile(charter_pattern)
test_charter_findall = [charter_regex.findall(name) for name in combined_examples]

## print result
test_charter_findall



[[('buffalo collegiate', 'charter', ' ', 'school')],
 [('thomas edison', 'charter', ' ', 'academy')],
 [('moving everest', 'charter', ' ', 'school')],
 [('life source international', 'charter', '', '')],
 [('south valley academy', 'charter', ' ', 'school')],
 [('neighborhood', 'charter', ' ', 'school')],
 [('brighter choice', 'charter', ' ', 'school')],
 [("children's community", 'charter', '', '')],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [12]:
## show example of one:
## first name -> its first match tuple -> group 1 of that tuple
first_name_matches = test_charter_findall[0]
first_match_groups = first_name_matches[0]
print(first_match_groups[0])

buffalo collegiate


## re.search

In [13]:
## get matches: re.search gives a match object per name, or None when
## the pattern is not found
charter_search_regex = re.compile(charter_pattern)
test_charter_search = [charter_search_regex.search(name) for name in combined_examples]

test_charter_search


[<re.Match object; span=(0, 33), match='buffalo collegiate charter school'>,
 <re.Match object; span=(0, 29), match='thomas edison charter academy'>,
 <re.Match object; span=(0, 29), match='moving everest charter school'>,
 <re.Match object; span=(0, 33), match='life source international charter'>,
 <re.Match object; span=(0, 35), match='south valley academy charter school'>,
 <re.Match object; span=(0, 27), match='neighborhood charter school'>,
 <re.Match object; span=(0, 30), match='brighter choice charter school'>,
 <re.Match object; span=(0, 28), match="children's community charter">,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [14]:
## extract matches

### take the match object for the 2nd example
### (thomas edison charter academy)
thomas_match = test_charter_search[1]
thomas_match

### group 1 of that match is the part of the
### name that comes before charter
thomas_firstgroup = thomas_match.group(1)
thomas_firstgroup


<re.Match object; span=(0, 29), match='thomas edison charter academy'>

'thomas edison'

In [15]:
### iterate over all groups and print
### (group 0 is the whole match; groups() holds groups 1..n)
every_group = (thomas_match.group(0),) + thomas_match.groups()
for group_idx, group_text in enumerate(every_group):
    print(f"Group {group_idx} is: ")
    print(group_text)

## asking for a group beyond the ones the pattern defines,
## e.g. thomas_match.group(5), raises an error

Group 0 is: 
thomas edison charter academy
Group 1 is: 
thomas edison
Group 2 is: 
charter
Group 3 is: 
 
Group 4 is: 
academy


## Question from class - is there a way to pull multiple matched groups at once by feeding .group() something like a list of indices?

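Response: if you call `object.groups()` with no index, it returns a tuple of all the groups. You can then slice/subset that tuple using indices. (`.group()` also accepts several indices at once, e.g. `object.group(1, 2)`, and then returns a tuple of those groups.)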

In [22]:
## example- want to return group 1 and group 2 and paste together
## groups() returns all groups (1..n) as a tuple
thomas_groups_all = thomas_match.groups()
thomas_groups_all

## slice the tuple: positions 0 and 1 are groups 1 and 2
thomas_groups_all[:2]

## do in one step
thomas_groups_12 = thomas_match.groups()[:2]
thomas_groups_12

('thomas edison', 'charter', ' ', 'academy')

('thomas edison', 'charter')

('thomas edison', 'charter')

In [138]:
## can generalize to the full list with ifelse
def get_precharter_name(one_matchobj):
    """Return group 1 of a match object, or "" when there is no match.

    one_matchobj: a match object, or None when the search found nothing
    returns: the text captured by the first group, else the empty string
    """
    # no match -> nothing to extract
    if not one_matchobj:
        return ""
    return one_matchobj.group(1)

## apply the helper to every search result (match object or None)
all_charter_match = list(map(get_precharter_name, test_charter_search))

all_charter_match

['buffalo collegiate',
 'thomas edison',
 'moving everest',
 'life source international',
 'south valley academy',
 'neighborhood',
 'brighter choice',
 "children's community",
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

# Group activity

- Return to the full list of school names in the original data
- You want to find the names of high schools. Try out some patterns to standardize the high school names (e.g., `high school` and `high` could both become `highschool`)
- Then, using some example results, try writing a regex pattern and using re.match to get the name of the school that precedes the `highschool` part of the name (e.g., `new trier highschool` -> `new trier`)



### Standardizing high school name

In [32]:

### first pull out some examples to test on:
### names (cast to str) that contain "high" or "hs"
names_as_text = cep_optin.schoolname_lower.astype(str)
mentions_hs = names_as_text.str.contains("high|hs")

hs_examples = cep_optin.schoolname_lower[mentions_hs].sample(n = 15, random_state = 422).to_list()

hs_examples


['mount pleasant area jshs',
 'huron high school',
 'thomson high school',
 'kings county office of education highland facility',
 'clovis east high',
 'camden jr. high school',
 'jackson junior high',
 'emmett junior high school',
 'atkins high',
 'lexington senior high',
 'temple hs',
 'forest hill high school',
 'pittsfield high',
 'matanzas high school',
 'pontiac high school']

In [39]:

## for now, ignoring jr/senior distinction
## and matching on high school, high, and hs

## to avoid matching things like highland (or hsieh),
## after high or hs, the lookahead (?=\s|$) requires
## either whitespace or the end of the string to follow.
## A lookahead checks without consuming, so the space after
## high/hs stays in the name and the next word is not glued on
## (e.g. "x high academy" -> "x highschool academy").
## The leading \s means the word must be preceded by whitespace,
## which is why things like jshs are left alone
hs_sub_pattern = r"\s(?:high|hs)(?=\s|$)(?:\s+school)?"
## standardize the high school part of each example name
hs_regex = re.compile(hs_sub_pattern)
test_pat_examples = [hs_regex.sub(" highschool", name) for name in hs_examples]

test_pat_examples

['mount pleasant area jshs',
 'huron highschool',
 'thomson highschool',
 'kings county office of education highland facility',
 'clovis east highschool',
 'camden jr. highschool',
 'jackson junior highschool',
 'emmett junior highschool',
 'atkins highschool',
 'lexington senior highschool',
 'temple highschool',
 'forest hill highschool',
 'pittsfield highschool',
 'matanzas highschool',
 'pontiac highschool']

In [42]:
### apply over every name in the original df;
### each value is passed through str() first because the
### column is object dtype
all_names_regex = re.compile(hs_sub_pattern)
hs_clean_all = [all_names_regex.sub(" highschool", str(name))
                for name in cep_optin.schoolname_lower.to_list()]

### store the cleaned names as a new column
cep_optin['school_cleanhs'] = hs_clean_all

### With some examples, pulling out name of school before high

In [47]:
## using the test_pat_examples and want to get things like huron, thomson, clovis east
## group 1 = everything before the whitespace that precedes highschool
prehs_pattern = r"(.*)\s+(highschool)(\s+)?(\w+)?"

prehs_regex = re.compile(prehs_pattern)
schoolname_preh_matchobj = [prehs_regex.search(name) for name in test_pat_examples]

schoolname_preh_matchobj

## empty string when there was no match; otherwise the first group
schoolname_preh = ["" if not found else found.group(1)
                   for found in schoolname_preh_matchobj]
schoolname_preh

[None,
 <re.Match object; span=(0, 16), match='huron highschool'>,
 <re.Match object; span=(0, 18), match='thomson highschool'>,
 None,
 <re.Match object; span=(0, 22), match='clovis east highschool'>,
 <re.Match object; span=(0, 21), match='camden jr. highschool'>,
 <re.Match object; span=(0, 25), match='jackson junior highschool'>,
 <re.Match object; span=(0, 24), match='emmett junior highschool'>,
 <re.Match object; span=(0, 17), match='atkins highschool'>,
 <re.Match object; span=(0, 27), match='lexington senior highschool'>,
 <re.Match object; span=(0, 17), match='temple highschool'>,
 <re.Match object; span=(0, 22), match='forest hill highschool'>,
 <re.Match object; span=(0, 21), match='pittsfield highschool'>,
 <re.Match object; span=(0, 19), match='matanzas highschool'>,
 <re.Match object; span=(0, 18), match='pontiac highschool'>]

['',
 'huron',
 'thomson',
 '',
 'clovis east',
 'camden jr.',
 'jackson junior',
 'emmett junior',
 'atkins',
 'lexington senior',
 'temple',
 'forest hill',
 'pittsfield',
 'matanzas',
 'pontiac']