# Clean and link filings



## 0. Imports and functions

In [710]:
from tabula import read_pdf
import os
import pandas as pd
import pickle
import re
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## profiling
import time

## plotting
from plotnine import *



## first, clean case type
def process_type(one_row):
    
    ## some dates so convert to string
    one_string = str(one_row)
    
    ## clean for expedited discipline
    clean_exp_1 = re.sub(r'(Exped(i)?(t)?(e)?|Discip)\s+', r'\1', one_string)
    clean_exp_2 = re.sub(r'(Exped(i)?(t)?(e)?|Discip)\s+', r'\1', clean_exp_1)
    
    ## clean for lea
    clean_lea = re.sub(r'(Aga(i)?(n)?)\s+', r'\1', clean_exp_2)
    
    return(clean_lea)

def process_schoolname(one_name):
    
    ## uppercase
    name_str = str(one_name)
    name_upper = name_str.upper()

    ## clean up schools
    clean_school= re.sub(r'(SCHOO)\s+', r'\1', name_upper)
    clean_middle = re.sub(r'(MIDD)\s+', r'\1', clean_school)
    clean_ed = re.sub(r'(EDUCAT)\s+', r'\1', clean_middle)
    
    ## concat whitespace
    replace_middle = re.sub(r'M(\s)?I(\s)?D(\s)?D(\s)?L(\s)?E', r"MIDDLE", clean_ed)
    replace_elem = re.sub(r'E(\s)?L(\s)?E(\s)?M(\s)?E(\s)?N(\s)?T(\s)?A(\s)?R(\s)?Y', r"ELEMENTARY", replace_middle)
    replace_school = re.sub(r'SCHOOI', "SCHOOL", replace_elem)
    replace_campus = re.sub(r'C(\s)?A(\s)?M(\s)?P(\s)?U(\s)?S', r"CAMPUS", replace_school)
    replace_education = re.sub(r'E(\s)?D(\s)?U(\s)?C(\s)?A(\s)?T(\s)?I(\s)?O(\s)?N', r"EDUCATION", 
                               replace_campus)
    
    ## 

    return(replace_education)

def replace_schooltype(one_string):
    
    es = re.sub(r'ES$|ELEMENTARY$', r'ELEMENTARY SCHOOL', one_string)
    ec = re.sub(r'EC$', r'ELEMENTARY CAMPUS', es)
    ms = re.sub(r'MS$|MIDDLE$', r'MIDDLE SCHOOL', ec)
    hs = re.sub(r'HS$|HIGH$', r'HIGH SCHOOL', ms)
    
    return(hs)


    

def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

from sklearn.feature_extraction.text import TfidfVectorizer


def find_fuzzy_namematches(one_name: str, all_names: list, 
                           score_cutoff):
    
    ## extract matches above cutoff
    all_abovecutoff = process.extractBests(one_name, all_names, score_cutoff = score_cutoff,
                                          limit = 1)
    
    ## make into a dataframe (will thus only capture ones with matches)
    all_abovecutoff_df = pd.DataFrame(list(all_abovecutoff), columns = ['matched_name', 'score'])
    all_abovecutoff_df['original_name'] = one_name
    return(all_abovecutoff_df)

## resource-- package installation issues: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html



def replace_missing_nces(one_val):
    
    if one_val.isdigit():
        final_val = one_val
    else:
        final_val = np.nan
    return(final_val)

def aggregate_crdc(var_col, value_col, 
                  data = crdc,
                  id_col = "school_name",
                  year_chosen = 2013, format = "long"):
    
    
    ## shape from long to wide
    if(format == "long"):
        df_wide = pd.pivot_table(data.loc[data.year == year_chosen,
                    [id_col,
                    var_col, 
                    value_col]],
                    index  = id_col, 
                    values = value_col,
                    columns = var_col).reset_index()

    
    else:
        df_wide = data.loc[data.year == year_chosen].copy()
        
    ## standardize columns
    df_wide.columns = [re.sub("\s+", "_", col.upper()) 
                           for col in df_wide.columns]
        
    ## generate rates
    sub_cols = set(df_wide.columns).difference(["SCHOOL_NAME", "TOTAL", 'YEAR'])
    
    ## 
    for col in sub_cols:
        df_wide[col] = pd.to_numeric(df_wide[col])
        df_wide['TOTAL'] = pd.to_numeric(df_wide['TOTAL'])
        df_wide['{}_rate'.format(col)] = df_wide[col]/df_wide['TOTAL']
    
    ## return
    return(df_wide)
    
def aggregate_nces(var_pattern, varname_clean, id_col = id_col, 
                       cc_data_merged = cc_data_merged,
                      enrollment_vars = enrollment_vars, base_name_raw = base_name_raw):
    
    dem_vars = [col for col in cc_data_merged if var_pattern in col]
    long_df = pd.melt(cc_data_merged[dem_vars + enrollment_vars + id_col],
                       id_vars = id_col)
    
    ## create year versus dem col
    long_df['clean_value_1'] = [replace_missing_nces(val) for val in long_df.value]
    long_df['clean_value'] = pd.to_numeric(long_df.clean_value_1)
    long_df['which_var'] = long_df.variable.astype(str).str.replace("\\_20.*", "")
    replace_pattern = "|".join(long_df.which_var.unique())
    long_df['which_year'] = [re.sub(replace_pattern, "", one_var) for one_var in long_df.variable]
    long_toagg = long_df[id_col + ['which_var', 'which_year',
                                           'clean_value']].reset_index()

    ## do the aggregation 
    index_cols = id_col + ["which_year"]
    df_wide = long_toagg.pivot_table(index  = index_cols,
                                             values = 'clean_value',
                                             columns = 'which_var').reset_index()

    ## do the division
    rate_varname = varname_clean + '_rate'
    df_wide[rate_varname] = df_wide[var_pattern]/df_wide[base_name_raw]
    #print(df_wide.head())
    
    ## return cleaned data
    return(df_wide[id_col + [rate_varname] + ['which_year']])
        

In [4]:
## constants
base_path = "/Users/raj2/Dropbox/dph_hearing_decisions/"

'/Users/raj2/Dropbox/dph_hearing_decisions/'

## 1. Load and do prelim cleaning of filings data

In [7]:
dc_filings_init = pd.read_csv(base_path + "data/dc/intermediate/processed_filings.csv")

dc_filings_init['failed_parse'] = np.where(dc_filings_init.eq(dc_filings_init.iloc[:, 0], 
                                axis=0).all(1), 1, 0)


## get row number of those that failed parse to reprocess
rownums_failedparse = pd.DataFrame({'missing_info':
                dc_filings_init.loc[dc_filings_init.failed_parse == 1].index.tolist()})

## write those and go back to process tables, pulling all cols for those rows
rownums_failedparse.to_pickle(base_path + "data/dc/intermediate/rownums_failedparse.pickle")


## subset to ones that parsed
dc_filings = dc_filings_init.loc[dc_filings_init.failed_parse == 0, ].copy()

"""After removing those that failed to parse, go from {} filings to {} filings.
""".format(dc_filings_init.shape[0],
          dc_filings.shape[0])



'After removing those that failed to parse, go from 7949 filings to 7752 filings.\n'

In [8]:
dc_filings['casetype_clean_init'] = [process_type(one_type) for one_type in dc_filings.casetype.tolist()]
dc_filings['casetype_isdigits'] = ["digits" if re.match(r'[0-9]+', one_str) is not None  else "no_digits" 
        for one_str in dc_filings.casetype_clean_init]

## by subsetting to those, see that year is still in the case so don't need to use for that
dc_filings['casetype_final'] = np.where((dc_filings.casetype_clean_init.str.contains("Discip")) |
                                        (dc_filings.casetype_clean_init.str.contains("Expedited")),
                                        "Expedited Discipline",
                                np.where((dc_filings.casetype_clean_init.str.contains("LEA")) & 
                                         (dc_filings.casetype_clean_init != "By LEA"), "Against LEA",
                                np.where(dc_filings.casetype_clean_init == "By LEA", "By LEA",
                                np.where(dc_filings.casetype_clean_init.str.contains("Against SE"),
                                        "Against SEA",
                                        "Other/failed to parse"))))


## write the failed to parse ones
## write those and go back to process tables, pulling the rows manually
dc_filings[dc_filings.casetype_final == "Other/failed to parse"].to_csv(base_path + "data/dc/intermediate/missing_casetype.csv")


## get range of dates of the filings
dc_filings['year_init'] = [re.sub(r'^(20[1-2][0-9]).*', r'\1', str(one_string)) for one_string in 
                      dc_filings.case_no]
year_range = [str(i) for i in np.arange(2012, 2020).tolist()]
dc_filings['year'] = np.where(dc_filings.year_init.isin(year_range), dc_filings.year_init,
                             'failed_toparse')
dc_filings.year.value_counts() # half the year in 2019

## 

2012              1651
2013              1459
2014              1023
2015              1001
2018               834
2017               702
2016               626
2019               269
failed_toparse     187
Name: year, dtype: int64

## 2. Merge in district demographic data

### 2.1 Create name-nces ID crosswalk

In [624]:
## 75-col limit in export-- first 75 cols
cc_data_1 = pd.read_csv(base_path + "data/dc/intermediate/dc_ccd.csv")

## 
cc_data_2 = pd.read_csv(base_path + "data/dc/intermediate/dc_ccd_pull2.csv",
                       encoding= 'unicode_escape')

## find overlapping cols
cc_data_1_topull = cc_data_1.columns.difference(cc_data_2.columns).tolist() + \
                    ["School Name", 
                     "School ID - NCES Assigned [Public School] Latest available year"]


In [628]:
## merge excluding
cc_data_merged = pd.merge(cc_data_1[cc_data_1_topull], 
                          cc_data_2, 
                          on = "School Name",
                         how = "left")


291

In [633]:
cc_cleancols = [re.sub("\\s+|\\[|\\]|\\(|\\)", "_", x).upper() for x in cc_data_merged.columns]
cc_data_merged.columns = cc_cleancols


Index(['AGENCY_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR',
       'AGENCY_NAME__PUBLIC_SCHOOL__2012-13',
       'AGENCY_NAME__PUBLIC_SCHOOL__2013-14',
       'AGENCY_NAME__PUBLIC_SCHOOL__2014-15',
       'AGENCY_NAME__PUBLIC_SCHOOL__2015-16',
       'AGENCY_NAME__PUBLIC_SCHOOL__2016-17',
       'AGENCY_NAME__PUBLIC_SCHOOL__2017-18',
       'DIRECT_CERTIFICATION__PUBLIC_SCHOOL__2016-17',
       'DIRECT_CERTIFICATION__PUBLIC_SCHOOL__2017-18',
       'FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL__2012-13',
       ...
       'PUPIL/TEACHER_RATIO__PUBLIC_SCHOOL__2015-16',
       'PUPIL/TEACHER_RATIO__PUBLIC_SCHOOL__2014-15',
       'PUPIL/TEACHER_RATIO__PUBLIC_SCHOOL__2013-14',
       'PUPIL/TEACHER_RATIO__PUBLIC_SCHOOL__2012-13',
       'FULL-TIME_EQUIVALENT__FTE__TEACHERS__PUBLIC_SCHOOL__2017-18',
       'FULL-TIME_EQUIVALENT__FTE__TEACHERS__PUBLIC_SCHOOL__2016-17',
       'FULL-TIME_EQUIVALENT__FTE__TEACHERS__PUBLIC_SCHOOL__2015-16',
       'FULL-TIME_EQUIVALENT__FTE__TEACHERS__PUBL

In [634]:


## create crosswalk to do matching
cc_crosswalk = cc_data_merged[['SCHOOL_NAME', 
                        'AGENCY_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR']].copy().drop_duplicates()


cc_crosswalk['name_tocompare'] = [replace_schooltype(one_school) for one_school in cc_crosswalk.SCHOOL_NAME]



## 2.2 Clean school name and fuzzy matching to IDs

In [636]:
## preprocess school to clean
## and do fuzzy matching
dc_filings_tomatch = dc_filings[['case_no', 'dcps_school_against', 'year', 'casetype_final']].drop_duplicates(subset = 
                                                ['case_no',
                                                'dcps_school_against'])



dc_filings_tomatch['school_against_cleaned_1'] = [process_schoolname(one_name) 
                                                for one_name in dc_filings_tomatch.dcps_school_against]
dc_filings_tomatch['school_against_cleaned'] = [replace_schooltype(one_name)
                                               for one_name in dc_filings_tomatch.school_against_cleaned_1]



## generate tf-idf representation
filings_crosswalk = dc_filings_tomatch[['school_against_cleaned']].drop_duplicates()
filings_crosswalk['id'] = filings_crosswalk.index+1


## write to intermediate
cc_crosswalk.to_csv(base_path + "data/dc/intermediate/nces_schoolnames.csv")
filings_crosswalk.to_csv(base_path + "data/dc/intermediate/filings_names.csv")


id_rename_dict = {'AGENCY_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR': 
                      'nces_id'}



In [476]:
## Ran script: 03helper_fuzzymatch_nces.csv


Unnamed: 0,school_against_cleaned,id
0,NAN,1
1,PROSPECT LC,2
5,HEIGHTS EDUCATION CENTER HIGH SCHOOL,6
7,JOHNSON MIDDLE SCHOOL,8
9,EASTERN HIGH SCHOOL,10


### 2.3 merge in fuzzy match results

In [637]:
## load results of fmatch
fm_cc_filings = pd.read_csv(base_path + "data/dc/intermediate/nces_filings_fuzzymatch.csv")



## merge back using original name
filings_crosswalk_wmatch = pd.merge(filings_crosswalk, 
                                   fm_cc_filings[['original_name', 'matched_name', 'score']],
                                   left_on = 'school_against_cleaned',
                                   right_on = 'original_name',
                                   how = "left")

filings_crosswalk_wmatch['matched'] = np.where(filings_crosswalk_wmatch.score.notnull(), 1, 0)

## write the non-matched ones to csv and deal with later
## filtered out "other" and "non-attending"
filings_crosswalk_wmatch.loc[filings_crosswalk_wmatch.matched == 0].to_csv(base_path + "data/dc/intermediate/nonmatch_schoolname.csv")

## manually matched them 

manualmatch = pd.read_csv(base_path + "data/dc/intermediate/manual_nonmatch_dc.csv")
manualmatch['matched_manually'] = np.where(manualmatch.matched_name.notnull(), 1, 0)
manualmatch.matched_manually.value_counts()

manualmatch_ccd = manualmatch.loc[(manualmatch.source == "ccd") & 
                                 (manualmatch.multiple == 0) & 
                                 (manualmatch.matched_manually == 1)].copy().drop_duplicates() # filters out 
                                    # ones that matched to crdc and multi-campus pcs
manualmatch_ccd['score'] = np.nan

## merge with filings 
filings_crosswalk_wmanualmatch = pd.merge(filings_crosswalk, 
                                   manualmatch_ccd,
                                   left_on = 'school_against_cleaned',
                                   right_on = 'original_name',
                                   how = "left")

filings_crosswalk_wmanualmatch['matched_manually'] = np.where(filings_crosswalk_wmanualmatch.matched_name.notnull(),
                                        1, 0)

filings_crosswalk_wmanualmatch_matched = filings_crosswalk_wmanualmatch.loc[filings_crosswalk_wmanualmatch.matched_manually == 1].copy()


## rowbind into one crosswalk
filings_crosswalk_fuzzy_matched = filings_crosswalk_wmatch.loc[filings_crosswalk_wmatch.matched == 1].copy()

## Combined crosswalk
filings_crosswalk_both = pd.concat([filings_crosswalk_fuzzy_matched[['school_against_cleaned',
                                                                    'original_name',
                                                                    'matched_name','score']],
                                   filings_crosswalk_wmanualmatch_matched[['school_against_cleaned',
                                                                    'original_name',
                                                                    'matched_name','score']]])

filings_crosswalk_both['type_match'] = np.where(filings_crosswalk_both.score.notnull(),
                                               "fuzzy",
                                               "manual")


## write filings not in crosswalk
filings_crosswalk_notmatched = filings_crosswalk.loc[~filings_crosswalk.school_against_cleaned.isin(fm_cc_filings.original_name.tolist() +
                            manualmatch.original_name[manualmatch.matched_manually == 1].tolist())].copy()

#print(filings_crosswalk_notmatched[['school_against_cleaned']].sort_values(by = 
 #                                       "school_against_cleaned").to_latex(index = False))
    



1    249
0    109
Name: matched_manually, dtype: int64

In [644]:


## merge in nces ID based on matched name
filings_crosswalk_wid = pd.merge(filings_crosswalk_both,
                                cc_crosswalk,
                                 left_on = 'matched_name',
                                 right_on = 'name_tocompare',
                                 how = "left").drop_duplicates()

filings_crosswalk_wid.head()
filings_crosswalk_wid.rename(columns = id_rename_dict, inplace = True)
filings_crosswalk_tomerge = filings_crosswalk_wid[['nces_id', 'school_against_cleaned',
                                                  'name_tocompare', 
                                                  'SCHOOL_NAME']].copy()
filings_crosswalk_tomerge.rename(columns = {'SCHOOL_NAME': 'nces_name'},
                                inplace = True)


Unnamed: 0,school_against_cleaned,original_name,matched_name,score,type_match,SCHOOL_NAME,AGENCY_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR,name_tocompare
0,PROSPECT LC,PROSPECT LC,PROSPECT LC,100.0,fuzzy,PROSPECT LC,1100030.0,PROSPECT LC
1,JOHNSON MIDDLE SCHOOL,JOHNSON MIDDLE SCHOOL,JOHNSON MIDDLE SCHOOL,100.0,fuzzy,JOHNSON MS,1100030.0,JOHNSON MIDDLE SCHOOL
2,EASTERN HIGH SCHOOL,EASTERN HIGH SCHOOL,EASTERN HIGH SCHOOL,100.0,fuzzy,EASTERN HS,1100030.0,EASTERN HIGH SCHOOL
3,EASTE RN HIGH SCHOOL,EASTE RN HIGH SCHOOL,EASTERN HIGH SCHOOL,97.0,fuzzy,EASTERN HS,1100030.0,EASTERN HIGH SCHOOL
4,DUNBAR HIGH SCHOOL,DUNBAR HIGH SCHOOL,DUNBAR HIGH SCHOOL,100.0,fuzzy,DUNBAR HS,1100030.0,DUNBAR HIGH SCHOOL


In [645]:
filings_crosswalk_tomerge.head()

Unnamed: 0,nces_id,school_against_cleaned,name_tocompare,nces_name
0,1100030.0,PROSPECT LC,PROSPECT LC,PROSPECT LC
1,1100030.0,JOHNSON MIDDLE SCHOOL,JOHNSON MIDDLE SCHOOL,JOHNSON MS
2,1100030.0,EASTERN HIGH SCHOOL,EASTERN HIGH SCHOOL,EASTERN HS
3,1100030.0,EASTE RN HIGH SCHOOL,EASTERN HIGH SCHOOL,EASTERN HS
4,1100030.0,DUNBAR HIGH SCHOOL,DUNBAR HIGH SCHOOL,DUNBAR HS


### 2.4 merge with main case file

In [646]:
## merge back on to main data

dc_filings_wid = pd.merge(dc_filings_tomatch[['case_no',
                                             'dcps_school_against',
                                             'year',
                                             'school_against_cleaned',
                                             'casetype_final']],
                         filings_crosswalk_tomerge,
                         on = 'school_against_cleaned',
                         how = 'left')
dc_filings_wid.info()

## only able to match 60% to an nces id; seems most important troubleshooting is 
## improving crosswalk
dc_filings_wid['caseid_stripyear'] = dc_filings_wid.case_no.replace(r'20[1-9][0-9]\-(\-)?', '', regex = True)
dc_filings_wid['caseid_firstfiled'] = dc_filings_wid.caseid_stripyear.replace(r'\-(-)?.*', '', regex = True)
dc_filings_wid['caseid'] = dc_filings_wid.caseid_firstfiled.str.replace(' ', '')
dc_filings_wid_obsid = dc_filings_wid.loc[dc_filings_wid.nces_name.notnull()].copy()

# 86% match (still need to do multiple match ones etc)

n_original = dc_filings_wid.shape[0]
n_matched = dc_filings_wid_obsid.shape[0]

"""Of the {} unique school names in filings, {}, or {} proportion, were matched with an NCES ID
""".format(n_original, 
          n_matched,
          n_matched/n_original)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7295 entries, 0 to 7294
Data columns (total 8 columns):
case_no                   7293 non-null object
dcps_school_against       7294 non-null object
year                      7295 non-null object
school_against_cleaned    7295 non-null object
casetype_final            7295 non-null object
nces_id                   6258 non-null float64
name_tocompare            6258 non-null object
nces_name                 6258 non-null object
dtypes: float64(1), object(7)
memory usage: 512.9+ KB


'Of the 7295 unique school names in filings, 6258, or 0.8578478409869774 proportion, were matched with an NCES ID\n'

In [647]:
## remove missing and aggregate
dc_filings_valid = dc_filings_wid_obsid[['nces_id',
                                'caseid',
                                 'year',
                                'nces_name',
                                'casetype_final']].drop_duplicates()

dc_filings_valid['id_foragg'] = dc_filings_valid.caseid + dc_filings_valid.year



In [652]:
dc_filings_totalbyschool = dc_filings_valid.groupby(['nces_name',
                                                    'casetype_final', 
                                                    'year'])['id_foragg'].nunique().reset_index().sort_values(by =
                    'nces_name')

dc_filings_totalbyschool.rename(columns = {'id_foragg': 'count_filings'}, inplace = True)



NameError: name 'head' is not defined

# 3. Clean demographics

### 3.1: common core data

In [661]:

enrollment_vars = [col for col in cc_data_merged if 'TOTAL_STUDENTS' in col]
base_name_raw = 'TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___PUBLIC_SCHOOL_'

id_col = "SCHOOL_NAME"

cc_data_merged.SCHOOL_NAME.head()

    
    

0                            ACADEMY OF HOPE ADULT PCS
1               ACHIEVEMENT PREPARATORY PCS ELEMENTARY
2            ACHIEVEMENT PREPARATORY PCS MIDDLE SCHOOL
3    ADAMS ELEMENTARY SCHOOL                       ...
4                                ADVANCED PATH ACADEMY
Name: SCHOOL_NAME, dtype: object

In [663]:
frpl_rate = aggregate_nces(var_pattern = "FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL_",
                              varname_clean = "frpl_eligible",
                          id_col = ['SCHOOL_NAME'])


which_var,SCHOOL_NAME,frpl_eligible_rate,which_year
1741,YOUTHBUILD PCS,0.964,_2013-14
1742,YOUTHBUILD PCS,0.78,_2014-15
1743,YOUTHBUILD PCS,0.377,_2015-16
1744,YOUTHBUILD PCS,,_2016-17
1745,YOUTHBUILD PCS,,_2017-18


In [666]:
race_enrollment_vars = [col for col in cc_data_merged.columns if "TOTAL_RACE" in col]
black_rate = aggregate_nces(var_pattern = "BLACK_STUDENTS__PUBLIC_SCHOOL_",
                              varname_clean = "black",
                               enrollment_vars = race_enrollment_vars,
                               base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_",
                           id_col = ['SCHOOL_NAME'])
white_rate = aggregate_nces(var_pattern = "WHITE_STUDENTS__PUBLIC_SCHOOL_",
                              varname_clean = "white",
                               enrollment_vars = race_enrollment_vars,
                               base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_",
                           id_col = ['SCHOOL_NAME'])
hisp_rate = aggregate_nces(var_pattern = "HISPANIC_STUDENTS__PUBLIC_SCHOOL_",
                              varname_clean = "hispanic",
                               enrollment_vars = race_enrollment_vars,
                               base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_",
                          id_col = ['SCHOOL_NAME'])

In [670]:
## merge into one df

dfs = [df.set_index(['SCHOOL_NAME', 
                     'which_year']) for df in [frpl_rate, black_rate, white_rate, hisp_rate]]

cc_dem_rates = pd.concat(dfs, axis=1).reset_index()
cc_dem_rates.rename(columns = {'SCHOOL_NAME': 
                      'nces_name'}, inplace = True)


## use demographics at baseline-ish
## maybe exclude 2012

## for each id, could how 



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1746 entries, 0 to 1745
Data columns (total 6 columns):
nces_name             1746 non-null object
which_year            1746 non-null object
frpl_eligible_rate    1124 non-null float64
black_rate            1090 non-null float64
white_rate            1014 non-null float64
hispanic_rate         1085 non-null float64
dtypes: float64(4), object(2)
memory usage: 81.9+ KB


### 3.2 CCD data on ieps


In [519]:
## read in iep data 
crdc = pd.read_csv(base_path + "data/dc/intermediate/EducationDataPortal_03.07.2020_disability.csv")

In [706]:
## aggregate
iep_summary = aggregate_crdc(var_col = "disability", 
                            value_col = "enrollment_crdc")


In [None]:
## read and clean discipline data

In [534]:
crdc_largerpull = pd.read_csv(base_path + "data/dc/intermediate/EducationDataPortal_03.08.2020_schools.csv")


## fill NA with 0
crdc_largerpull_fill = crdc_largerpull.fillna(0)


Index(['year', 'ncessch', 'school_name', 'state_name', 'lea_name',
       'school_level', 'school_type', 'charter', 'free_lunch',
       'reduced_price_lunch', 'free_or_reduced_price_lunch', 'enrollment',
       'direct_certification', 'enrollment_crdc', 'students_susp_in_sch',
       'students_susp_out_sch_single', 'students_susp_out_sch_multiple',
       'expulsions_no_ed_serv', 'expulsions_with_ed_serv',
       'expulsions_zero_tolerance', 'students_corporal_punish',
       'students_arrested', 'students_referred_law_enforce',
       'students_mech_restraint', 'students_phys_restraint',
       'students_seclusion'],
      dtype='object')

In [557]:
## discipline
discipline_cols = [col for col in crdc_largerpull_fill.columns if 
                  "susp" in col or "expulsions" in col or "corporal" in col]
restr_secl_cols = [col for col in crdc_largerpull_fill.columns if 
                  "restraint" in col or "seclusion" in col]

crdc_largerpull_fill['total_discipline'] = crdc_largerpull_fill[discipline_cols].sum(axis = 1)
crdc_largerpull_fill['total_restraint_seclude'] = crdc_largerpull_fill[restr_secl_cols].sum(axis = 1)

In [711]:
crdc_disc_foragg = crdc_largerpull_fill.loc[~crdc_largerpull_fill.enrollment.isin(['0',
                                        'Not applicable']),
                                        ['school_name', 'year',
                                        'enrollment',
                                        'total_discipline']].copy()
crdc_disc_foragg.rename(columns = {'enrollment':
            'total'}, inplace = True)

crdc_disc_foragg.head()


Unnamed: 0,school_name,year,total,total_discipline
0,Cesar Chavez PCS for Public Policy Capitol Hill,2016,332,0.0
1,Cesar Chavez PCS for Public Policy Parkside Mi...,2016,278,0.0
2,Cesar Chavez PCS for Public Policy Chavez Prep,2016,306,0.0
3,Cesar Chavez PCS for Public Policy Parkside High,2016,359,0.0
4,Friendship PCS Collegiate Academy,2016,751,0.0


In [727]:
disc_summary = aggregate_crdc(data = crdc_disc_foragg,
                             var_col = "total_discipline", 
                            value_col = "total_discipline",
                             format = "wide")

crdc_res_foragg = crdc_largerpull_fill.loc[~crdc_largerpull_fill.enrollment.isin(['0',
                                        'Not applicable']),
                                        ['school_name', 'year',
                                        'enrollment',
                                        'total_restraint_seclude']].copy()
crdc_res_foragg.rename(columns = {'enrollment':
            'total'}, inplace = True)

res_summary = aggregate_crdc(data = crdc_res_foragg,
                             var_col = "total_restraint_seclude", 
                            value_col = "total_restraint_seclude",
                             format = "wide")



res_summary.head()



Unnamed: 0,SCHOOL_NAME,YEAR,TOTAL,TOTAL_RESTRAINT_SECLUDE,TOTAL_RESTRAINT_SECLUDE_rate
751,ARTS AND TECHNOLOGY PCS,2013,618,0.0,0.0
752,BOOKER T WASHINGTON PCS,2013,177,0.0,0.0
754,CESAR CHAVEZ FOR PUBLIC POLICY CAPITOL HILL PCS,2013,389,0.0,0.0
755,CESAR CHAVEZ PCS FOR PUBLIC POLICY-PARKSIDE HS,2013,305,0.0,0.0
756,CESAR CHAVEZ PCS FOR PUBLIC POLICY CHAVEZ PREP,2013,322,0.0,0.0


### 3.3 merge the diff crdc data

In [730]:
res_summary.rename(columns = {'TOTAL': 'total_students_ressec_data'},
                  inplace = True)

disc_summary.rename(columns = {'TOTAL': 'total_students_disc_data'},
                  inplace = True)

iep_summary.rename(columns = {'TOTAL': 'total_students_iep_data'},
                  inplace = True)

res_exclude_year = [col for col in res_summary.columns if col != "YEAR"]
disc_exclude_year = [col for col in disc_summary.columns if col != "YEAR"]




In [732]:

## merge all three crdc 
dfs_crdc = [df.set_index(['SCHOOL_NAME']) for df in [iep_summary, 
                                                    res_summary[res_exclude_year],
                                                    disc_summary[disc_exclude_year]]]

crdc_rates_tomerge  = pd.concat(dfs_crdc, axis=1).reset_index().rename(columns = {'index': 'nces_name'})



Unnamed: 0,nces_name,STUDENTS_WITH_DISABILITIES_SERVED_UNDER_IDEA,STUDENTS_WITH_DISABILITIES_SERVED_UNDER_SECTION_504,total_students_iep_data,STUDENTS_WITH_DISABILITIES_SERVED_UNDER_SECTION_504_rate,STUDENTS_WITH_DISABILITIES_SERVED_UNDER_IDEA_rate,total_students_ressec_data,TOTAL_RESTRAINT_SECLUDE,TOTAL_RESTRAINT_SECLUDE_rate,total_students_disc_data,TOTAL_DISCIPLINE,TOTAL_DISCIPLINE_rate
0,ACHIEVEMENT PREPARATORY ACADEMY PCS,,7.0,355.0,0.02,,382.0,0.0,0.0,382.0,205.0,0.537
1,ACHIEVEMENT PREPARATORY PCS-ELEMENTARY,,2.0,222.0,0.009,,233.0,0.0,0.0,233.0,20.0,0.086
2,AITON ES,26.0,4.0,249.0,0.016,0.104,247.0,0.0,0.0,247.0,57.0,0.231
3,AMIDON BOWEN ES,50.0,2.0,341.0,0.006,0.147,342.0,0.0,0.0,342.0,69.0,0.202
4,ANACOSTIA SHS,224.0,7.0,766.0,0.009,0.292,751.0,0.0,0.0,751.0,338.0,0.45


## 4. Aggregate and merge with complaints data

Count of complaints 2014 onwards --- 2013-2014 demographics

In [695]:
years_agg = ["2014", "2015", "2016", "2017", "2018", "2019"]
dc_filings_postdem = dc_filings_totalbyschool.loc[dc_filings_totalbyschool.year.isin(years_agg)].groupby(['nces_name',
                                        'casetype_final']).agg({'count_filings': 'sum'}).reset_index()



cc_dem_rates_20132014 = cc_dem_rates.loc[cc_dem_rates.which_year == "_2013-14"].copy()


## reshape filings to wide
dc_filings_postdem_wide = pd.pivot_table(dc_filings_postdem,
                                        index = ['nces_name'],
                                        columns = ['casetype_final'],
                                        values = 'count_filings').reset_index()

dc_filings_postdem_wide.columns = [re.sub("\s", "_", col.upper())
                                  for col in dc_filings_postdem_wide.columns]




In [743]:
## na indicates there were not cases of a particular type
## so filling in with 0
dc_filings_postdem_tomerge = dc_filings_postdem_wide[['NCES_NAME',
                                                    'AGAINST_LEA',
                                                    'AGAINST_SEA',
                                                    'BY_LEA',
                                                     'EXPEDITED_DISCIPLINE']].fillna(0)

## left join with common core demographics
cc_dem_rates_wcase = pd.merge(cc_dem_rates_20132014,
                             dc_filings_postdem_tomerge,
                             left_on = 'nces_name',
                             right_on = 'NCES_NAME',
                             how = "left")

## 0 = indicates no cases
case_vars = [col for col in cc_dem_rates_wcase.columns if 
            "LEA" in col or "AGAINST" in col or "EXPEDITED" in col]

cc_dem_rates_wcase[case_vars] = cc_dem_rates_wcase[case_vars].fillna(0)
cc_dem_rates_wcase['in_filings'] = np.where(cc_dem_rates_wcase.NCES_NAME.isnull(), 
                                           0, 1)


## look at overlap
names_shared = set(cc_dem_rates_wcase.nces_name).intersection(crdc_rates_tomerge.nces_name)
names_ccd_notcr = set(cc_dem_rates_wcase.nces_name).difference(crdc_rates_tomerge.nces_name)
#names_ccd_notcr

names_cr_notccd = set(crdc_rates_tomerge.nces_name).difference(cc_dem_rates_wcase.nces_name)

## do fuzzy matching and then merge in ccd demographics
## after that merge
## for now, just use frpl data

cc_notmatched = cc_dem_rates_wcase.loc[~cc_dem_rates_wcase.nces_name.isin(names_ccd_notcr)].copy()
cc_notmatched.to_csv(base_path + "data/dc/intermediate/commoncore_tomatch.csv",
                    index = False)



cr_notmatched = crdc_rates_tomerge.loc[~crdc_rates_tomerge.nces_name.isin(names_cr_notccd)].copy()
cr_notmatched.to_csv(base_path + "data/dc/intermediate/ccd_pool.csv",
                    index = False)


cc_notmatched.nces_name.head()

5                                             AITON ES
10    APPLETREE EARLY LEARNING CENTER PCS LINCOLN PARK
15                             ARTS AND TECHNOLOGY PCS
19                                          BARNARD ES
20                                        BASIS DC PCS
Name: nces_name, dtype: object

In [None]:
## where things left off:
## - fuzzy matching nces and ccd
## next steps:
## - load in matched data
## - get better ids
## - do left join on nces and hopefully more schools also have ccd data