# Clean and link filings



## 0. Imports and functions

In [262]:
from tabula import read_pdf
import os
import pandas as pd
import pickle
import re
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## profiling
import time

## plotting
from plotnine import *



## first, clean case type
def process_type(one_row):
    
    ## some dates so convert to string
    one_string = str(one_row)
    
    ## clean for expedited discipline
    clean_exp_1 = re.sub(r'(Exped(i)?(t)?(e)?|Discip)\s+', r'\1', one_string)
    clean_exp_2 = re.sub(r'(Exped(i)?(t)?(e)?|Discip)\s+', r'\1', clean_exp_1)
    
    ## clean for lea
    clean_lea = re.sub(r'(Aga(i)?(n)?)\s+', r'\1', clean_exp_2)
    
    return(clean_lea)

def process_schoolname(one_name):
    
    ## uppercase
    name_str = str(one_name)
    name_upper = name_str.upper()

    ## clean up schools
    clean_school= re.sub(r'(SCHOO)\s+', r'\1', name_upper)
    clean_middle = re.sub(r'(MIDD)\s+', r'\1', clean_school)
    clean_ed = re.sub(r'(EDUCAT)\s+', r'\1', clean_middle)
    
    ## concat whitespace
    replace_middle = re.sub(r'M(\s)?I(\s)?D(\s)?D(\s)?L(\s)?E', r"MIDDLE", clean_ed)
    replace_elem = re.sub(r'E(\s)?L(\s)?E(\s)?M(\s)?E(\s)?N(\s)?T(\s)?A(\s)?R(\s)?Y', r"ELEMENTARY", replace_middle)
    replace_school = re.sub(r'SCHOOI', "SCHOOL", replace_elem)
    replace_campus = re.sub(r'C(\s)?A(\s)?M(\s)?P(\s)?U(\s)?S', r"CAMPUS", replace_school)
    replace_education = re.sub(r'E(\s)?D(\s)?U(\s)?C(\s)?A(\s)?T(\s)?I(\s)?O(\s)?N', r"EDUCATION", 
                               replace_campus)
    
    ## 

    return(replace_education)

def replace_schooltype(one_string):
    
    es = re.sub(r'ES$|ELEMENTARY$', r'ELEMENTARY SCHOOL', one_string)
    ec = re.sub(r'EC$', r'ELEMENTARY CAMPUS', es)
    ms = re.sub(r'MS$|MIDDLE$', r'MIDDLE SCHOOL', ec)
    hs = re.sub(r'HS$|HIGH$', r'HIGH SCHOOL', ms)
    
    return(hs)


    

def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

from sklearn.feature_extraction.text import TfidfVectorizer


def find_fuzzy_namematches(one_name: str, all_names: list, 
                           score_cutoff):
    
    ## extract matches above cutoff
    all_abovecutoff = process.extractBests(one_name, all_names, score_cutoff = score_cutoff,
                                          limit = 1)
    
    ## make into a dataframe (will thus only capture ones with matches)
    all_abovecutoff_df = pd.DataFrame(list(all_abovecutoff), columns = ['matched_name', 'score'])
    all_abovecutoff_df['original_name'] = one_name
    return(all_abovecutoff_df)

## resource-- package installation issues: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html



def replace_missing_nces(one_val):
    
    if one_val.isdigit():
        final_val = one_val
    else:
        final_val = np.nan
    return(final_val)



In [4]:
## constants
base_path = "/Users/raj2/Dropbox/dph_hearing_decisions/"

'/Users/raj2/Dropbox/dph_hearing_decisions/'

## 1. Load and do prelim cleaning of filings data

In [7]:
dc_filings_init = pd.read_csv(base_path + "data/dc/intermediate/processed_filings.csv")

dc_filings_init['failed_parse'] = np.where(dc_filings_init.eq(dc_filings_init.iloc[:, 0], 
                                axis=0).all(1), 1, 0)


## get row number of those that failed parse to reprocess
rownums_failedparse = pd.DataFrame({'missing_info':
                dc_filings_init.loc[dc_filings_init.failed_parse == 1].index.tolist()})

## write those and go back to process tables, pulling all cols for those rows
rownums_failedparse.to_pickle(base_path + "data/dc/intermediate/rownums_failedparse.pickle")


## subset to ones that parsed
dc_filings = dc_filings_init.loc[dc_filings_init.failed_parse == 0, ].copy()

"""After removing those that failed to parse, go from {} filings to {} filings.
""".format(dc_filings_init.shape[0],
          dc_filings.shape[0])



'After removing those that failed to parse, go from 7949 filings to 7752 filings.\n'

In [8]:
dc_filings['casetype_clean_init'] = [process_type(one_type) for one_type in dc_filings.casetype.tolist()]
dc_filings['casetype_isdigits'] = ["digits" if re.match(r'[0-9]+', one_str) is not None  else "no_digits" 
        for one_str in dc_filings.casetype_clean_init]

## by subsetting to those, see that year is still in the case so don't need to use for that
dc_filings['casetype_final'] = np.where((dc_filings.casetype_clean_init.str.contains("Discip")) |
                                        (dc_filings.casetype_clean_init.str.contains("Expedited")),
                                        "Expedited Discipline",
                                np.where((dc_filings.casetype_clean_init.str.contains("LEA")) & 
                                         (dc_filings.casetype_clean_init != "By LEA"), "Against LEA",
                                np.where(dc_filings.casetype_clean_init == "By LEA", "By LEA",
                                np.where(dc_filings.casetype_clean_init.str.contains("Against SE"),
                                        "Against SEA",
                                        "Other/failed to parse"))))


## write the failed to parse ones
## write those and go back to process tables, pulling the rows manually
dc_filings[dc_filings.casetype_final == "Other/failed to parse"].to_csv(base_path + "data/dc/intermediate/missing_casetype.csv")


## get range of dates of the filings
dc_filings['year_init'] = [re.sub(r'^(20[1-2][0-9]).*', r'\1', str(one_string)) for one_string in 
                      dc_filings.case_no]
year_range = [str(i) for i in np.arange(2012, 2020).tolist()]
dc_filings['year'] = np.where(dc_filings.year_init.isin(year_range), dc_filings.year_init,
                             'failed_toparse')
dc_filings.year.value_counts() # half the year in 2019

## 

2012              1651
2013              1459
2014              1023
2015              1001
2018               834
2017               702
2016               626
2019               269
failed_toparse     187
Name: year, dtype: int64

## 2. Merge in district demographic data

### 2.1 Create name-nces ID crosswalk

In [259]:
## 75-col limit in export-- first 75 cols
cc_data_1 = pd.read_csv(base_path + "data/dc/intermediate/dc_ccd.csv")

## 
cc_data_2 = pd.read_csv(base_path + "data/dc/intermediate/dc_ccd_pull2.csv",
                       encoding= 'unicode_escape')

## find overlapping cols
cc_data_1_topull = cc_data_1.columns.difference(cc_data_2.columns).tolist() + \
                    ["School ID - NCES Assigned [Public School] Latest available year"]


In [260]:
## merge excluding
cc_data_merged = pd.merge(cc_data_1[cc_data_1_topull], 
                          cc_data_2, 
                          on = "School ID - NCES Assigned [Public School] Latest available year",
                         how = "left")



In [261]:
cc_cleancols = [re.sub("\\s+|\\[|\\]|\\(|\\)", "_", x).upper() for x in cc_data_merged.columns]
cc_data_merged.columns = cc_cleancols


In [328]:


## create crosswalk to do matching
cc_crosswalk = cc_data_merged[['SCHOOL_NAME', 
                        'SCHOOL_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR']].copy().drop_duplicates()


cc_crosswalk['name_tocompare'] = [replace_schooltype(one_school) for one_school in cc_crosswalk.SCHOOL_NAME]
cc_crosswalk.head()

Unnamed: 0,SCHOOL_NAME,SCHOOL_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR,name_tocompare
0,ACADEMY OF HOPE ADULT PCS,110009000000.0,ACADEMY OF HOPE ADULT PCS
1,COMMUNITY COLLEGE PREPARATORY ACADEMY PCS,110009000000.0,COMMUNITY COLLEGE PREPARATORY ACADEMY PCS
2,DC SCHOLARS PCS,110009000000.0,DC SCHOLARS PCS
3,INGENUITY PREP PCS,110009000000.0,INGENUITY PREP PCS
4,LAYC CAREER ACADEMY PCS,110009000000.0,LAYC CAREER ACADEMY PCS


## 2.2 Clean school name and fuzzy matching to IDs

In [245]:
## preprocess school to clean
## and do fuzzy matching
dc_filings_tomatch = dc_filings[['case_no', 'dcps_school_against', 'year', 'casetype_final']].drop_duplicates(subset = 
                                                ['case_no',
                                                'dcps_school_against'])



dc_filings_tomatch['school_against_cleaned_1'] = [process_schoolname(one_name) 
                                                for one_name in dc_filings_tomatch.dcps_school_against]
dc_filings_tomatch['school_against_cleaned'] = [replace_schooltype(one_name)
                                               for one_name in dc_filings_tomatch.school_against_cleaned_1]



## generate tf-idf representation
filings_crosswalk = dc_filings_tomatch[['school_against_cleaned']].drop_duplicates()
filings_crosswalk['id'] = filings_crosswalk.index+1


## write to intermediate
cc_crosswalk.to_csv(base_path + "data/dc/intermediate/nces_schoolnames.csv")
filings_crosswalk.to_csv(base_path + "data/dc/intermediate/filings_names.csv")


id_rename_dict = {'SCHOOL_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR': 
                      'nces_id'}



In [476]:
## Ran script: 03helper_fuzzymatch_nces.csv

## Matche 

filings_crosswalk.head()

Unnamed: 0,school_against_cleaned,id
0,NAN,1
1,PROSPECT LC,2
5,HEIGHTS EDUCATION CENTER HIGH SCHOOL,6
7,JOHNSON MIDDLE SCHOOL,8
9,EASTERN HIGH SCHOOL,10


### 2.3 merge in fuzzy match results

In [485]:
## load results of fmatch
fm_cc_filings = pd.read_csv(base_path + "data/dc/intermediate/nces_filings_fuzzymatch.csv")



## merge back using original name
filings_crosswalk_wmatch = pd.merge(filings_crosswalk, 
                                   fm_cc_filings[['original_name', 'matched_name', 'score']],
                                   left_on = 'school_against_cleaned',
                                   right_on = 'original_name',
                                   how = "left")

filings_crosswalk_wmatch['matched'] = np.where(filings_crosswalk_wmatch.score.notnull(), 1, 0)

## write the non-matched ones to csv and deal with later
## filtered out "other" and "non-attending"
filings_crosswalk_wmatch.loc[filings_crosswalk_wmatch.matched == 0].to_csv(base_path + "data/dc/intermediate/nonmatch_schoolname.csv")

## manually matched them 

manualmatch = pd.read_csv(base_path + "data/dc/intermediate/manual_nonmatch_dc.csv")
manualmatch['matched_manually'] = np.where(manualmatch.matched_name.notnull(), 1, 0)
manualmatch.matched_manually.value_counts()

manualmatch_ccd = manualmatch.loc[(manualmatch.source == "ccd") & 
                                 (manualmatch.multiple == 0) & 
                                 (manualmatch.matched_manually == 1)].copy().drop_duplicates() # filters out 
                                    # ones that matched to crdc and multi-campus pcs
manualmatch_ccd['score'] = np.nan

## merge with filings 
filings_crosswalk_wmanualmatch = pd.merge(filings_crosswalk, 
                                   manualmatch_ccd,
                                   left_on = 'school_against_cleaned',
                                   right_on = 'original_name',
                                   how = "left")

filings_crosswalk_wmanualmatch['matched_manually'] = np.where(filings_crosswalk_wmanualmatch.matched_name.notnull(),
                                        1, 0)

filings_crosswalk_wmanualmatch_matched = filings_crosswalk_wmanualmatch.loc[filings_crosswalk_wmanualmatch.matched_manually == 1].copy()
filings_crosswalk_wmanualmatch_matched.head()


## rowbind into one crosswalk
filings_crosswalk_fuzzy_matched = filings_crosswalk_wmatch.loc[filings_crosswalk_wmatch.matched == 1].copy()

## Combined crosswalk
filings_crosswalk_both = pd.concat([filings_crosswalk_fuzzy_matched[['school_against_cleaned',
                                                                    'original_name',
                                                                    'matched_name','score']],
                                   filings_crosswalk_wmanualmatch_matched[['school_against_cleaned',
                                                                    'original_name',
                                                                    'matched_name','score']]])

filings_crosswalk_both['type_match'] = np.where(filings_crosswalk_both.score.notnull(),
                                               "fuzzy",
                                               "manual")

filings_crosswalk_both.head()

## write filings not in crosswalk
filings_crosswalk_notmatched = filings_crosswalk.loc[~filings_crosswalk.school_against_cleaned.isin(fm_cc_filings.original_name.tolist() +
                            manualmatch.original_name[manualmatch.matched_manually == 1].tolist())].copy()

#print(filings_crosswalk_notmatched[['school_against_cleaned']].sort_values(by = 
 #                                       "school_against_cleaned").to_latex(index = False))


1    249
0    109
Name: matched_manually, dtype: int64

Unnamed: 0,school_against_cleaned,id,original_name,matched_name,source,multiple,matched_manually,score
10,MONROE SCHOOL,22,MONROE SCHOOL,BRUCE MONROE ES AT PARK VIEW,ccd,0.0,1,
11,SPINGARN SENIOR HIGH SCHOOL,24,SPINGARN SENIOR HIGH SCHOOL,SPINGARN SHIGH SCHOOL,ccd,0.0,1,
22,BENJAM IN BANNEKER ACADEM IC HIGH SCHOOL,66,BENJAM IN BANNEKER ACADEM IC HIGH SCHOOL,BENJAMIN BANNEKER HIGH SCHOOL,ccd,0.0,1,
27,SIMON EIE ME N TA RY SCHOOL,76,SIMON EIE ME N TA RY SCHOOL,SIMON ELEMENTARY SCHOOL,ccd,0.0,1,
32,WOODSON ACADEMY@,88,WOODSON ACADEMY@,RON BROWN MIDDLE SCHOOL,ccd,0.0,1,


Unnamed: 0,school_against_cleaned,original_name,matched_name,score,type_match
1,PROSPECT LC,PROSPECT LC,PROSPECT LC,100.0,fuzzy
3,JOHNSON MIDDLE SCHOOL,JOHNSON MIDDLE SCHOOL,JOHNSON MIDDLE SCHOOL,100.0,fuzzy
4,EASTERN HIGH SCHOOL,EASTERN HIGH SCHOOL,EASTERN HIGH SCHOOL,100.0,fuzzy
5,EASTE RN HIGH SCHOOL,EASTE RN HIGH SCHOOL,EASTERN HIGH SCHOOL,97.0,fuzzy
6,DUNBAR HIGH SCHOOL,DUNBAR HIGH SCHOOL,DUNBAR HIGH SCHOOL,100.0,fuzzy


In [486]:


## merge in nces ID based on matched name
filings_crosswalk_wid = pd.merge(filings_crosswalk_both,
                                cc_crosswalk,
                                 left_on = 'matched_name',
                                 right_on = 'name_tocompare',
                                 how = "left").drop_duplicates()

filings_crosswalk_wid.rename(columns = id_rename_dict, inplace = True)
filings_crosswalk_tomerge = filings_crosswalk_wid[['nces_id', 'school_against_cleaned',
                                                  'name_tocompare']].copy()
filings_crosswalk_tomerge.rename(columns = {'name_tocompare': 'nces_name'},
                                inplace = True)


### 2.4 merge with main case file

In [487]:
## merge back on to main data

dc_filings_wid = pd.merge(dc_filings_tomatch[['case_no',
                                             'dcps_school_against',
                                             'year',
                                             'school_against_cleaned',
                                             'casetype_final']],
                         filings_crosswalk_tomerge,
                         on = 'school_against_cleaned',
                         how = 'left')
dc_filings_wid.info()

## only able to match 60% to an nces id; seems most important troubleshooting is 
## improving crosswalk
dc_filings_wid['caseid_stripyear'] = dc_filings_wid.case_no.replace(r'20[1-9][0-9]\-(\-)?', '', regex = True)
dc_filings_wid['caseid_firstfiled'] = dc_filings_wid.caseid_stripyear.replace(r'\-(-)?.*', '', regex = True)
dc_filings_wid['caseid'] = dc_filings_wid.caseid_firstfiled.str.replace(' ', '')
dc_filings_wid_obsid = dc_filings_wid.loc[dc_filings_wid.nces_id.notnull()].copy()

# 86% match (still need to do multiple match ones etc)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7295 entries, 0 to 7294
Data columns (total 7 columns):
case_no                   7293 non-null object
dcps_school_against       7294 non-null object
year                      7295 non-null object
school_against_cleaned    7295 non-null object
casetype_final            7295 non-null object
nces_id                   6258 non-null float64
nces_name                 6258 non-null object
dtypes: float64(1), object(6)
memory usage: 455.9+ KB


In [488]:


## remove missing and aggregate
dc_filings_valid = dc_filings_wid_obsid[['nces_id',
                                'caseid',
                                 'year',
                                'nces_name',
                                'casetype_final']].drop_duplicates()

dc_filings_valid.shape
dc_filings_valid['id_foragg'] = dc_filings_valid.caseid + dc_filings_valid.year

dc_filings_totalbyschool = dc_filings_valid.groupby(['nces_name',
                                                    'casetype_final'])['id_foragg'].nunique().reset_index().sort_values(by =
                    'nces_name')

dc_filings_totalbyschool.rename(columns = {'id_foragg': 'count_filings'}, inplace = True)
dc_filings_totalbyschool.head()

## add nces id
dc_filings_wnces = pd.merge(dc_filings_totalbyschool,
                           cc_crosswalk[['name_tocompare'] + id_col],
                           left_on = 'nces_name',
                           right_on = 'name_tocompare',
                           how = 'left').drop_duplicates()
dc_filings_wnces.rename(columns = id_rename_dict, inplace = True)

dc_filings_wnces.head()
dc_filings_totalbyschool.shape




(4526, 5)

Unnamed: 0,nces_name,casetype_final,count_filings
0,ACHIEVEMENT PREPARATORY PCS ELEMENTARY SCHOOL,Against LEA,7
1,ACHIEVEMENT PREPARATORY PCS ELEMENTARY SCHOOL,Expedited Discipline,1
2,ADAMS ELEMENTARY SCHOOL ...,Against LEA,5
3,ADVANCED PATH ACADEMY,Against LEA,4
4,AITON ELEMENTARY SCHOOL,Against LEA,43


Unnamed: 0,nces_name,casetype_final,count_filings,name_tocompare,nces_id
0,ACHIEVEMENT PREPARATORY PCS ELEMENTARY SCHOOL,Against LEA,7,ACHIEVEMENT PREPARATORY PCS ELEMENTARY SCHOOL,110007000000.0
1,ACHIEVEMENT PREPARATORY PCS ELEMENTARY SCHOOL,Expedited Discipline,1,ACHIEVEMENT PREPARATORY PCS ELEMENTARY SCHOOL,110007000000.0
2,ADAMS ELEMENTARY SCHOOL ...,Against LEA,5,ADAMS ELEMENTARY SCHOOL ...,110003000000.0
3,ADVANCED PATH ACADEMY,Against LEA,4,ADVANCED PATH ACADEMY,110003000000.0
4,AITON ELEMENTARY SCHOOL,Against LEA,43,AITON ELEMENTARY SCHOOL,110003000000.0


(369, 3)

# 3. Clean demographics

### 3.1-- common core data

In [156]:
id_col = ['SCHOOL_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR']
enrollment_vars = [col for col in cc_data_merged if 'TOTAL_STUDENTS' in col]
base_name_raw = 'TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___PUBLIC_SCHOOL_'


def create_aggregation(var_pattern, varname_clean, id_col = id_col, 
                       cc_data_merged = cc_data_merged,
                      enrollment_vars = enrollment_vars, base_name_raw = base_name_raw):
    
    dem_vars = [col for col in cc_data_merged if var_pattern in col]
    long_df = pd.melt(cc_data_merged[dem_vars + enrollment_vars + id_col],
                       id_vars = id_col)
    
    ## create year versus dem col
    long_df['clean_value_1'] = [replace_missing_nces(val) for val in long_df.value]
    long_df['clean_value'] = pd.to_numeric(long_df.clean_value_1)
    long_df['which_var'] = long_df.variable.astype(str).str.replace("\\_20.*", "")
    replace_pattern = "|".join(long_df.which_var.unique())
    long_df['which_year'] = [re.sub(replace_pattern, "", one_var) for one_var in long_df.variable]
    long_toagg = long_df[id_col + ['which_var', 'which_year',
                                           'clean_value']].reset_index()

    ## do the aggregation 
    index_cols = id_col + ["which_year"]
    df_wide = long_toagg.pivot_table(index  = index_cols,
                                             values = 'clean_value',
                                             columns = 'which_var').reset_index()

    ## do the division
    rate_varname = varname_clean + '_rate'
    df_wide[rate_varname] = df_wide[var_pattern]/df_wide[base_name_raw]
    #print(df_wide.head())
    
    ## return cleaned data
    return(df_wide[id_col + [rate_varname] + ['which_year']])
    
    
    

In [160]:
frpl_rate = create_aggregation(var_pattern = "FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL_",
                              varname_clean = "frpl_eligible")


In [158]:
race_enrollment_vars = [col for col in cc_data_merged.columns if "TOTAL_RACE" in col]
black_rate = create_aggregation(var_pattern = "BLACK_STUDENTS__PUBLIC_SCHOOL_",
                              varname_clean = "black",
                               enrollment_vars = race_enrollment_vars,
                               base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_")
white_rate = create_aggregation(var_pattern = "WHITE_STUDENTS__PUBLIC_SCHOOL_",
                              varname_clean = "white",
                               enrollment_vars = race_enrollment_vars,
                               base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_")
hisp_rate = create_aggregation(var_pattern = "HISPANIC_STUDENTS__PUBLIC_SCHOOL_",
                              varname_clean = "hispanic",
                               enrollment_vars = race_enrollment_vars,
                               base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_")

In [159]:
## merge into one df
dfs = [df.set_index(id_col + ['which_year']) for df in [frpl_rate, black_rate, white_rate, hisp_rate]]

cc_dem_rates = pd.concat(dfs, axis=1).reset_index()
cc_dem_rates.rename(columns = {'SCHOOL_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR': 
                      'nces_id'}, inplace = True)
cc_dem_rates.head()

## use demographics at baseline-ish
## maybe exclude 2012



which_var,nces_id,which_year,frpl_eligible_rate,black_rate,white_rate,hispanic_rate
0,110000000000.0,_2012-13,0.934,0.972,0.0,0.025
1,110000000000.0,_2013-14,0.992,0.976,0.0,0.024
2,110000000000.0,_2014-15,,,,
3,110000000000.0,_2015-16,,,,
4,110000000000.0,_2016-17,,,,


### 3.2 CCD data on ieps


In [141]:
## merge in on agency id

['HISPANIC_STUDENTS__PUBLIC_SCHOOL__2017-18',
 'HISPANIC_STUDENTS__PUBLIC_SCHOOL__2016-17',
 'HISPANIC_STUDENTS__PUBLIC_SCHOOL__2015-16',
 'HISPANIC_STUDENTS__PUBLIC_SCHOOL__2014-15',
 'HISPANIC_STUDENTS__PUBLIC_SCHOOL__2013-14',
 'HISPANIC_STUDENTS__PUBLIC_SCHOOL__2012-13']

['TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2017-18',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2016-17',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2015-16',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2014-15',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2013-14',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2012-13']

In [113]:
## clean demographics

## two sets of variables:
## frpl
## race
frpl_vars = [col for col in cc_data_merged if 'FREE_LUNCH_ELIGIBLE' in col]

frpl_vars

['FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL__2012-13',
 'FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL__2013-14',
 'FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL__2014-15',
 'FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL__2015-16',
 'FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL__2016-17',
 'FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL__2017-18']