## 0. Imports and functions

In [1]:
from tabula import read_pdf
import os
import pandas as pd
import pickle
import re
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## profiling
import time

## plotting
from plotnine import *

## dates
from dateutil.relativedelta import relativedelta



## first, clean case type
def process_type(one_row):
    """Re-join case-type keywords that were split by stray whitespace.

    The value is coerced to ``str`` first (some cells hold dates). Whitespace
    directly after ``Exped``/``Expedi``/``Expedit``/``Expedite``-style prefixes,
    after ``Discip``, and after ``Aga``/``Agai``/``Again`` is removed.

    Returns the repaired string.
    """
    text = str(one_row)

    ## expedited discipline: run the same substitution twice, because closing
    ## one gap can expose a longer prefix followed by another gap
    ## (e.g. "Exped i ted" -> "Expedi ted" -> "Expedited")
    for _ in range(2):
        text = re.sub(r'(Exped(i)?(t)?(e)?|Discip)\s+', r'\1', text)

    ## lea: close the gap after the leading fragment "Aga"/"Agai"/"Again"
    return re.sub(r'(Aga(i)?(n)?)\s+', r'\1', text)

def process_schoolname(one_name):
    """Upper-case a school name and repair words broken up by whitespace.

    The input is coerced to ``str`` and upper-cased, then a fixed sequence of
    regex substitutions is applied in order:

    * close the gap after the fragments SCHOO, MIDD and EDUCAT;
    * collapse MIDDLE, ELEMENTARY, CAMPUS and EDUCATION when single whitespace
      characters sit between their letters;
    * rewrite the misspelling SCHOOI as SCHOOL.

    Returns the cleaned, upper-cased name.
    """
    ## (pattern, replacement) pairs; order matters and mirrors the cleaning steps
    repairs = (
        (r'(SCHOO)\s+', r'\1'),
        (r'(MIDD)\s+', r'\1'),
        (r'(EDUCAT)\s+', r'\1'),
        (r'M(\s)?I(\s)?D(\s)?D(\s)?L(\s)?E', r"MIDDLE"),
        (r'E(\s)?L(\s)?E(\s)?M(\s)?E(\s)?N(\s)?T(\s)?A(\s)?R(\s)?Y', r"ELEMENTARY"),
        (r'SCHOOI', "SCHOOL"),
        (r'C(\s)?A(\s)?M(\s)?P(\s)?U(\s)?S', r"CAMPUS"),
        (r'E(\s)?D(\s)?U(\s)?C(\s)?A(\s)?T(\s)?I(\s)?O(\s)?N', r"EDUCATION"),
    )

    cleaned = str(one_name).upper()
    for pattern, replacement in repairs:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def replace_schooltype(one_string):
    """Expand a trailing school-type abbreviation into its full form.

    A final ``ES``/``ELEMENTARY`` becomes ``ELEMENTARY SCHOOL``, ``EC`` becomes
    ``ELEMENTARY CAMPUS``, ``MS``/``MIDDLE`` becomes ``MIDDLE SCHOOL`` and
    ``HS``/``HIGH`` becomes ``HIGH SCHOOL``.

    Each suffix must start at a word boundary. Without that anchor any name
    that merely ends in those letters was rewritten (``JAMES`` turned into
    ``JAMELEMENTARY SCHOOL``, ``WILLIAMS`` into ``WILLIAMIDDLE SCHOOL``).

    Parameters
    ----------
    one_string : str
        School name; matching is case-sensitive, so upper-case it first.

    Returns
    -------
    str
        The name with a standalone trailing abbreviation expanded, otherwise
        the input unchanged.
    """
    es = re.sub(r'\b(?:ES|ELEMENTARY)$', r'ELEMENTARY SCHOOL', one_string)
    ec = re.sub(r'\bEC$', r'ELEMENTARY CAMPUS', es)
    ms = re.sub(r'\b(?:MS|MIDDLE)$', r'MIDDLE SCHOOL', ec)
    hs = re.sub(r'\b(?:HS|HIGH)$', r'HIGH SCHOOL', ms)

    return hs


    

def ngrams(string, n=3):
    """Return the overlapping character n-grams of ``string``.

    Before slicing, commas, hyphens, periods, slashes and any
    whitespace-plus-"BD" sequence are removed. A string shorter than ``n``
    (after cleaning) yields an empty list.
    """
    stripped = re.sub(r'[,-./]|\sBD',r'', string)

    ## no windows exist for a non-positive width
    if n <= 0:
        return []

    ## slide a window of width n one character at a time
    return [stripped[start:start + n]
            for start in range(len(stripped) - n + 1)]

from sklearn.feature_extraction.text import TfidfVectorizer


def find_fuzzy_namematches(one_name: str, all_names: list,
                           score_cutoff):
    """Look up the single best fuzzy match for ``one_name`` in ``all_names``.

    Calls ``process.extractBests`` with ``limit = 1`` and the given
    ``score_cutoff``.

    Returns a DataFrame with columns ``matched_name``, ``score`` and
    ``original_name``; it has no rows when nothing is returned by the matcher.
    """
    ## at most one candidate comes back because limit = 1
    best_hits = process.extractBests(one_name, all_names,
                                     limit = 1,
                                     score_cutoff = score_cutoff)

    ## tabulate the hit(s) and tag each row with the name that was searched for
    match_df = pd.DataFrame(list(best_hits), columns = ['matched_name', 'score'])
    return match_df.assign(original_name = one_name)

## resource-- package installation issues: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html



def replace_missing_nces(one_val):
    """Map a raw cell to a usable count, or NaN when it is a placeholder.

    Strings are kept only when they consist solely of digits; any other
    string (placeholder symbols, decimals, empty text) becomes NaN.

    Non-string input used to raise ``AttributeError`` on ``.isdigit()``, which
    happens as soon as a column holds a float NaN for a blank cell. Now:
    real numbers (including NaN) are passed through unchanged, and anything
    else (``None``, booleans, other objects) becomes NaN.

    Parameters
    ----------
    one_val : object
        A single cell value.

    Returns
    -------
    str, number or float
        The digit string or number unchanged, or ``np.nan``.
    """
    if isinstance(one_val, str):
        return one_val if one_val.isdigit() else np.nan

    ## bool is an int subclass but is not a count, so exclude it explicitly
    is_number = (isinstance(one_val, (int, float, np.number))
                 and not isinstance(one_val, bool))
    return one_val if is_number else np.nan

def aggregate_crdc(var_col, value_col, 
                  data,
                  id_col = "school_name",
                  year_chosen = 2013, format = "long"):
    """Build a one-row-per-id table of counts and their share of TOTAL.

    Rows of ``data`` are first restricted to ``data.year == year_chosen``.
    With ``format == "long"`` the rows are pivoted so each distinct value of
    ``var_col`` becomes a column holding ``value_col`` (``pd.pivot_table``
    default aggregation); with any other ``format`` the filtered rows are
    used as they are.

    Column names are then upper-cased with whitespace runs replaced by "_",
    and for every column other than the id column, ``SCHOOL_NAME``, ``TOTAL``
    and ``YEAR`` a ``<COLUMN>_rate`` column equal to ``COLUMN / TOTAL`` is
    appended.

    Changes from the earlier version: the column named by ``id_col`` is now
    excluded from rate computation (previously only the literal
    ``SCHOOL_NAME`` was, so any other id column was pushed through
    ``pd.to_numeric``); rate columns are appended in column order rather
    than arbitrary set order; ``TOTAL`` is converted once instead of once
    per column; non-string column labels are stringified before upper-casing.

    Parameters
    ----------
    var_col, value_col : str
        Category and value columns; only read when ``format == "long"``.
    data : pandas.DataFrame
        Must contain a ``year`` column, plus ``id_col`` in long format.
    id_col : str
        Identifier column used as the pivot index.
    year_chosen : int
        Year to keep.
    format : str
        ``"long"`` to pivot, anything else to use the rows unchanged.

    Returns
    -------
    pandas.DataFrame
        The wide table with the added ``*_rate`` columns.
    """
    def _standardize(label):
        ## upper-case and replace whitespace runs with "_"
        return re.sub(r"\s+", "_", str(label).upper())

    year_rows = data.loc[data.year == year_chosen]

    ## shape from long to wide
    if format == "long":
        df_wide = pd.pivot_table(year_rows[[id_col, var_col, value_col]],
                                 index = id_col,
                                 values = value_col,
                                 columns = var_col).reset_index()
    else:
        df_wide = year_rows.copy()

    ## standardize columns
    df_wide.columns = [_standardize(col) for col in df_wide.columns]

    ## columns that are identifiers or the denominator, never numerators
    non_rate_cols = {"SCHOOL_NAME", "TOTAL", "YEAR", _standardize(id_col)}
    sub_cols = [col for col in df_wide.columns if col not in non_rate_cols]

    ## generate rates; TOTAL is only touched when there is something to divide
    if sub_cols:
        df_wide['TOTAL'] = pd.to_numeric(df_wide['TOTAL'])
    for col in sub_cols:
        df_wide[col] = pd.to_numeric(df_wide[col])
        df_wide['{}_rate'.format(col)] = df_wide[col]/df_wide['TOTAL']

    ## return
    return(df_wide)
    
def aggregate_nces(var_pattern, varname_clean, id_col, 
                       cc_data_merged,
                      enrollment_vars, base_name_raw):
    """Compute a per-id, per-year rate from year-suffixed count columns.

    Columns whose name contains ``var_pattern`` are the numerators and
    ``enrollment_vars`` are the denominators. Both are melted to long form,
    cleaned with ``replace_missing_nces``, split into a variable stem and a
    year suffix at the first ``_20`` in the column name, pivoted back to one
    column per stem, and divided.

    Changes from the earlier version: the stem is stripped with an explicit
    ``regex = True`` (pandas 2.0 made literal matching the default for
    ``Series.str.replace``, which left the names untouched); the year suffix
    is taken as "whatever follows the stem" rather than by substituting an
    unescaped regex alternation of all stems.

    Parameters
    ----------
    var_pattern : str
        Substring selecting numerator columns; it must also equal the stem
        left after the year suffix is removed, since it is used to index the
        pivoted table.
    varname_clean : str
        Prefix of the output column, which is named ``<varname_clean>_rate``.
    id_col : list of str
        Identifier column(s).
    cc_data_merged : pandas.DataFrame
        Wide table holding the id, numerator and denominator columns.
    enrollment_vars : list of str
        Denominator column names.
    base_name_raw : str
        Stem of the denominator columns (year suffix removed).

    Returns
    -------
    pandas.DataFrame
        Columns ``id_col`` + ``[<varname_clean>_rate, 'which_year']``.
    """
    dem_vars = [col for col in cc_data_merged if var_pattern in col]
    long_df = pd.melt(cc_data_merged[dem_vars + enrollment_vars + id_col],
                       id_vars = id_col)

    ## placeholders -> NaN, then everything to numeric
    cleaned_values = [replace_missing_nces(val) for val in long_df.value]
    long_df['clean_value'] = pd.to_numeric(pd.Series(cleaned_values,
                                                     index = long_df.index))

    ## split "<stem>_20xx..." into the stem and the year suffix
    variable_names = long_df.variable.astype(str)
    long_df['which_var'] = variable_names.str.replace(r"_20.*", "", regex = True)
    long_df['which_year'] = [name[len(stem):] for name, stem
                             in zip(variable_names, long_df.which_var)]
    long_toagg = long_df[id_col + ['which_var', 'which_year', 'clean_value']]

    ## do the aggregation: one row per id/year, one column per stem
    index_cols = id_col + ["which_year"]
    df_wide = long_toagg.pivot_table(index  = index_cols,
                                     values = 'clean_value',
                                     columns = 'which_var').reset_index()

    ## do the division
    rate_varname = varname_clean + '_rate'
    df_wide[rate_varname] = df_wide[var_pattern]/df_wide[base_name_raw]

    ## return cleaned data
    return(df_wide[id_col + [rate_varname] + ['which_year']])


def clean_next_row(one_row):
    """Pull an id out of a cell and left-pad five-character ids with "0".

    The cell is coerced to ``str``. If it starts with "-", the id is the rest
    of the text. Otherwise, if it contains "(", the digits inside parentheses
    are extracted (the text is left as-is when the parenthesised part is not
    purely digits). Any other cell yields ``np.nan``.
    """
    text = str(one_row)

    if text.startswith('-'):
        ## leading dash marks the id: keep everything after it
        isd = text[1:]
    elif '(' in text:
        ## id is written in parentheses somewhere in the text
        isd = re.sub(r'.*\(([0-9]+)\).*', r'\1', text)
    else:
        return np.nan

    ## pad 0's
    return "0" + isd if len(isd) == 5 else isd
        

  from pandas.core import datetools


In [4]:
## constants
## root folder for all inputs/outputs; later cells concatenate relative paths
## onto this string, so it must keep its trailing "/"
base_path = "/Users/raj2/Dropbox/dph_hearing_decisions/"

## 1. Load and do prelim cleaning of filings data

In [97]:
## raw filings spreadsheet; the summary below treats each row as one filing
ohio_filings_init = pd.read_excel(base_path + "data/ohio/raw_filings/ohio_raw.xlsx")

## standardize headers: lowercase, whitespace runs and "/" replaced by "_"
## (raw string: "\s" and "\/" are invalid escapes in a plain literal and
## trigger a SyntaxWarning on recent Python versions)
name_cols = [re.sub(r"\s+|/", "_", col.lower()) for col in ohio_filings_init.columns]
ohio_filings_init.columns = name_cols

## get
"""Out of {} total filings, there are {} unique districts
""".format(ohio_filings_init.shape[0],
          len(ohio_filings_init.district_community_school.unique()))

## next steps:
## nces export
## fuzzy matching of district name
ohio_filings_init.head()

## get school names (upper-cased so they can be compared with AGENCY_NAME)
ohio_filings_init['district_name_cap'] = ohio_filings_init.district_community_school.astype(str).str.upper()


'Out of 859 total filings, there are 265 unique districts\n'

Unnamed: 0,case_#,status,status_reason,received,closed,complainant,student,district_community_school
0,SE 3782-2019,Closed,Complainant Withdrew (CW),6/13/2019,2019-06-27 00:00:00,,,Cleveland Municipal
1,SE 3775-2019,Closed,Complainant Withdrew (CW),5/31/2019,2019-06-27 00:00:00,,,Cleveland Municipal
2,SE 3771-2019,Closed,Dismissed by IHO (CDIS),5/24/2019,2019-07-08 00:00:00,,,Cleveland Municipal
3,SE 3766-2019,Closed,Dismissed by IHO (CDIS),5/14/2019,2019-06-12 00:00:00,,,Berkshire Local
4,SE 3759-2019,Closed,Withdrawn After Resolution Meeting (CR),4/16/2019,2019-04-26 00:00:00,,,Mansfield City


Unnamed: 0,case_#,status,status_reason,received,closed,complainant,student,district_community_school,district_name_cap
0,SE 3782-2019,Closed,Complainant Withdrew (CW),6/13/2019,2019-06-27 00:00:00,,,Cleveland Municipal,CLEVELAND MUNICIPAL
1,SE 3775-2019,Closed,Complainant Withdrew (CW),5/31/2019,2019-06-27 00:00:00,,,Cleveland Municipal,CLEVELAND MUNICIPAL
2,SE 3771-2019,Closed,Dismissed by IHO (CDIS),5/24/2019,2019-07-08 00:00:00,,,Cleveland Municipal,CLEVELAND MUNICIPAL
3,SE 3766-2019,Closed,Dismissed by IHO (CDIS),5/14/2019,2019-06-12 00:00:00,,,Berkshire Local,BERKSHIRE LOCAL
4,SE 3759-2019,Closed,Withdrawn After Resolution Meeting (CR),4/16/2019,2019-04-26 00:00:00,,,Mansfield City,MANSFIELD CITY


Unnamed: 0,case_#,status,status_reason,received,closed,complainant,student,district_community_school,district_name_cap
153,SE 3604-2018,Closed,Withdrawn After Resolution Meeting (CR),5/2/2018,2018-05-14 00:00:00,,,Global Ambassadors Language Academy,GLOBAL AMBASSADORS LANGUAGE ACADEMY


# 2. Merge with NCES crosswalk

- Next steps-- look at overlap

- Troubleshoot non-overlap

- Merge with crosswalk then with nces district-level demographics 

In [115]:
## CCD extract; its AGENCY_NAME / agency-id columns serve as the crosswalk
ohio_ccd = pd.read_csv(base_path + "data/ohio/intermediate/ohio_ccd.csv",
                      encoding= 'unicode_escape')


## column names: whitespace runs and each bracket/parenthesis -> "_", then upper-case
cc_cleancols = [re.sub("\\s+|\\[|\\]|\\(|\\)", "_", x).upper() for x in ohio_ccd.columns]
ohio_ccd.columns = cc_cleancols


## create crosswalk
ccd_crosswalk = ohio_ccd[['AGENCY_NAME',
                            'AGENCY_ID_-_NCES_ASSIGNED__DISTRICT__LATEST_AVAILABLE_YEAR']]

## look at direct overlap
## (ohio_exact is for inspection; it is not used again in this cell)
ohio_exact = set(ccd_crosswalk.AGENCY_NAME).intersection(ohio_filings_init.district_name_cap.unique())

## first, merge crosswalk using exact match
## NOTE(review): a left merge repeats a filing if AGENCY_NAME is duplicated
## in the crosswalk -- confirm names are unique
ohio_filings_exact = pd.merge(ohio_filings_init,
                             ccd_crosswalk,
                              left_on = "district_name_cap",
                             right_on = "AGENCY_NAME",
                             how = "left")
## filings whose district name found no crosswalk row (id is null after the merge)
ohio_filings_nonmatch = ohio_filings_exact.loc[ohio_filings_exact['AGENCY_ID_-_NCES_ASSIGNED__DISTRICT__LATEST_AVAILABLE_YEAR'].isnull()].copy()


## write to match manually
## (template of unmatched district names; no write call appears in this cell)
ohio_filings_nonmatch_districts = pd.DataFrame({'filing_district': ohio_filings_nonmatch.district_name_cap.unique(),
                                               'nces_name': ""})


## read in matches
ohio_manual = pd.read_csv(base_path + "/data/ohio/intermediate/ohio_manualmatch.csv")
## rows whose nces_id text contains "average" are set aside and excluded from
## the usable matches below
districts_toaverage = ohio_manual.loc[ohio_manual.nces_id.astype(str).str.contains("average"),
                                     "nces_name"]
districts_matched = ohio_manual.loc[(ohio_manual.nces_id.notnull()) & 
                                   (~ohio_manual.nces_name.isin(districts_toaverage))].copy()



## keep the exact matches (id present) as they are
ohio_filings_exact_matched =  ohio_filings_exact.loc[ohio_filings_exact['AGENCY_ID_-_NCES_ASSIGNED__DISTRICT__LATEST_AVAILABLE_YEAR'].notnull()].copy()
id_col = 'AGENCY_ID_-_NCES_ASSIGNED__DISTRICT__LATEST_AVAILABLE_YEAR'
## drop the all-null id column from the non-matches before attaching manual ids
ohio_filings_nonmatch.drop(columns = id_col, inplace = True)

## merge using district name
## (the filing's upper-cased district name is compared with nces_name in the
## manual-match file)
ohio_filings_nonmatch_wid = pd.merge(ohio_filings_nonmatch,
                                   districts_matched,
                                    left_on = "district_name_cap",
                                    right_on = "nces_name",
                                    how = "left")

## rename column and row bind
ohio_filings_nonmatch_wid.rename(columns = {'nces_id': id_col}, inplace = True)
ohio_filings_nonmatch_wid_tobind = ohio_filings_nonmatch_wid.drop(columns = "nces_name", inplace = False)

## rowbind
## (original row labels are kept, so the index of the result may contain repeats)
ohio_filings_tomerge = pd.concat([ohio_filings_exact_matched,
                                 ohio_filings_nonmatch_wid_tobind])



# 3. Clean nces demographics

### 3.1 Read in and clean colnames

In [133]:
## rename id col
ohio_ccd.rename(columns = {'AGENCY_ID_-_NCES_ASSIGNED__DISTRICT__LATEST_AVAILABLE_YEAR':
                                'nces_id'}, inplace = True)
## second CCD extract, read with the same encoding as the first
ohio_ccd_2 = pd.read_csv(base_path + "data/ohio/intermediate/ohio_ccd_2.csv",
                        encoding= 'unicode_escape')

## same column-name cleaning as for ohio_ccd, then rename the id column
## (id_col is the long agency-id column name set in the crosswalk cell)
cc_cleancols = [re.sub("\\s+|\\[|\\]|\\(|\\)", "_", x).upper() for x in ohio_ccd_2.columns]
cc_clean2 = ["nces_id" if col == id_col else col for col in cc_cleancols]
ohio_ccd_2.columns = cc_clean2


Unnamed: 0,AGENCY_NAME,STATE_NAME__DISTRICT__LATEST_AVAILABLE_YEAR,nces_id,LATITUDE__DISTRICT__2013-14,LONGITUDE__DISTRICT__2013-14,FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL__2013-14
0,A+ ARTS ACADEMY,Ohio,3900305,39.967,-82.9052,0.0
1,A+ CHILDREN'S ACADEMY,Ohio,3901480,39.8823,-82.9979,87.0
2,A.B. GRAHAM ACADEMY,Ohio,3901358,40.1277,-83.9532,0.0
3,ACADEMY FOR URBAN SCHOLARS YOUNGSTOWN,Ohio,3901472,41.1182,-80.652,0.0
4,ACADEMY OF COLUMBUS,Ohio,3900438,40.0629,-82.9663,


In [146]:
## combine both CCD extracts on nces_id (default inner join); AGENCY_NAME is
## dropped from the second extract so it is not carried twice
## NOTE(review): other columns present in both extracts would get _x/_y suffixes
cc_data_merged = pd.merge(ohio_ccd,
                          ohio_ccd_2.drop(columns = ["AGENCY_NAME"]),
                          on = "nces_id")


### 3.2: calculate enrollment percentages

In [139]:

## denominator columns: every column whose name contains TOTAL_STUDENTS
enrollment_vars = [col for col in cc_data_merged if 'TOTAL_STUDENTS' in col]
## stem of those columns with the year suffix removed, as passed to aggregate_nces
base_name_raw = 'TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___DISTRICT_'



In [140]:
## FREE_LUNCH_ELIGIBLE count divided by total enrollment, per nces_id and year
frpl_rate = aggregate_nces(var_pattern = "FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL_",
                              varname_clean = "frpl_eligible",
                          id_col = ['nces_id'],
                          cc_data_merged = cc_data_merged,
                          enrollment_vars = enrollment_vars,
                          base_name_raw = base_name_raw)


In [141]:
## denominator columns for the race/ethnicity rates
race_enrollment_vars = [col for col in cc_data_merged.columns if "TOTAL_RACE" in col]
## display only: list the columns found
race_enrollment_vars

## display only: peek at the columns whose name contains BLACK
cc_data_merged[[col for col in cc_data_merged.columns if "BLACK" in col]].head()

['TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2013-14']

Unnamed: 0,BLACK_STUDENTS__DISTRICT__2013-14
0,348.0
1,34.0
2,1.0
3,73.0
4,


In [90]:
## display only: (rows, columns) of the merged CCD table
cc_data_merged.shape

(1328, 145)

In [142]:
## race/ethnicity shares: the three calls differ only in the numerator column
## and output name; all use the TOTAL_RACE/ETHNICITY columns as denominator
black_rate = aggregate_nces(var_pattern = "BLACK_STUDENTS__DISTRICT_",
                              varname_clean = "black",
                               enrollment_vars = race_enrollment_vars,
                               base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_",
                           id_col = ['nces_id'],
                           cc_data_merged = cc_data_merged)
white_rate = aggregate_nces(var_pattern = "WHITE_STUDENTS__DISTRICT_",
                              varname_clean = "white",
                               enrollment_vars = race_enrollment_vars,
                               base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_",
                           id_col = ['nces_id'],
                           cc_data_merged = cc_data_merged)
hisp_rate = aggregate_nces(var_pattern = "HISPANIC_STUDENTS__DISTRICT_",
                              varname_clean = "hispanic",
                               enrollment_vars = race_enrollment_vars,
                               base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_",
                          id_col = ['nces_id'],
                          cc_data_merged = cc_data_merged)

In [143]:
## display only: find the exact name of the IEP count column
[col for col in cc_data_merged.columns if "INDIVIDUALIZED" in col]

['INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2013-14']

In [144]:
## IEP student count divided by total enrollment, per nces_id and year
iep_rate = aggregate_nces(var_pattern = "INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT_",
                              varname_clean = "iep",
                          id_col = ['nces_id'],
                          cc_data_merged = cc_data_merged,
                          enrollment_vars = enrollment_vars,
                          base_name_raw = base_name_raw)

In [145]:
## merge into one df

## index every rate table by (nces_id, which_year) so that the column-wise
## concat below lines the rows up on that key
dfs = [df.set_index(['nces_id', 
                     'which_year']) for df in [frpl_rate, black_rate, white_rate, hisp_rate,
                                              iep_rate]]

## one column per rate; the key goes back to ordinary columns
cc_dem_rates = pd.concat(dfs, axis=1).reset_index()


## 4. Aggregate and merge with complaints data

Counts of complaints filed from 2014 onwards are merged with the 2013-14 demographics.

In [166]:
## give the filings table the same short id column name (nces_id) as the CCD tables
ohio_filings_tomerge.columns = ["nces_id" if col == id_col else col for col in ohio_filings_tomerge.columns]

In [167]:
## year of the request, as a 4-digit string (NaN when no year can be found).
## The year is extracted from the text form of the cell instead of stripping
## a leading "m/d/": that substitution skipped zero-padded dates such as
## "06/13/2019" and raised TypeError on any non-string cell (a datetime or NaN).
## .tolist() assigns by position, so repeated row labels cannot misalign values.
ohio_filings_tomerge['year_request'] = (
    ohio_filings_tomerge.received.astype(str)
    .str.extract(r'(?<![0-9])((?:19|20)[0-9]{2})(?![0-9])', expand = False)
    .tolist())


In [169]:
## display only
ohio_filings_tomerge.head()

## only filings that ended up with an nces_id can be aggregated by district
ohio_filings_toagg = ohio_filings_tomerge.loc[ohio_filings_tomerge.nces_id.notnull()].copy()

Unnamed: 0,case_#,status,status_reason,received,closed,complainant,student,district_community_school,district_name_cap,AGENCY_NAME,nces_id,year_request
0,SE 3782-2019,Closed,Complainant Withdrew (CW),6/13/2019,2019-06-27 00:00:00,,,Cleveland Municipal,CLEVELAND MUNICIPAL,CLEVELAND MUNICIPAL,3904378.0,2019
1,SE 3775-2019,Closed,Complainant Withdrew (CW),5/31/2019,2019-06-27 00:00:00,,,Cleveland Municipal,CLEVELAND MUNICIPAL,CLEVELAND MUNICIPAL,3904378.0,2019
2,SE 3771-2019,Closed,Dismissed by IHO (CDIS),5/24/2019,2019-07-08 00:00:00,,,Cleveland Municipal,CLEVELAND MUNICIPAL,CLEVELAND MUNICIPAL,3904378.0,2019
3,SE 3766-2019,Closed,Dismissed by IHO (CDIS),5/14/2019,2019-06-12 00:00:00,,,Berkshire Local,BERKSHIRE LOCAL,BERKSHIRE LOCAL,3904716.0,2019
4,SE 3759-2019,Closed,Withdrawn After Resolution Meeting (CR),4/16/2019,2019-04-26 00:00:00,,,Mansfield City,MANSFIELD CITY,MANSFIELD CITY,3904429.0,2019


In [170]:
## years to count; strings, because year_request is held as text
years_agg = ["2014", "2015", "2016", "2017", "2018"]
## number of distinct case numbers per district and request year
ohio_filings_agg = ohio_filings_toagg.loc[ohio_filings_toagg.year_request.isin(years_agg)].groupby(['nces_id',
                                                                    'year_request']).agg({'case_#': 
                                           lambda x: x.nunique()}).reset_index()




## positional rename: (nces_id, year_request, case_#) -> the names below
ohio_filings_agg.columns = ['nces_id', 'year', 'count_filings']


In [171]:
## display only: first rows of the per-district, per-year filing counts
ohio_filings_agg.head()

Unnamed: 0,nces_id,year,count_filings
0,3900017.0,2017,2
1,3900027.0,2017,1
2,3900032.0,2018,1
3,3900036.0,2016,1
4,3900038.0,2015,2


In [176]:
## display only: first rows of the second CCD extract
ohio_ccd_2.head()

Unnamed: 0,AGENCY_NAME,STATE_NAME__DISTRICT__LATEST_AVAILABLE_YEAR,nces_id,LATITUDE__DISTRICT__2013-14,LONGITUDE__DISTRICT__2013-14,FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL__2013-14
0,A+ ARTS ACADEMY,Ohio,3900305,39.967,-82.9052,0.0
1,A+ CHILDREN'S ACADEMY,Ohio,3901480,39.8823,-82.9979,87.0
2,A.B. GRAHAM ACADEMY,Ohio,3901358,40.1277,-83.9532,0.0
3,ACADEMY FOR URBAN SCHOLARS YOUNGSTOWN,Ohio,3901472,41.1182,-80.652,0.0
4,ACADEMY OF COLUMBUS,Ohio,3900438,40.0629,-82.9663,


In [184]:
## display only: first rows of the first CCD extract
ohio_ccd.head()

Unnamed: 0,AGENCY_NAME,STATE_NAME__DISTRICT__LATEST_AVAILABLE_YEAR,AGENCY_NAME__DISTRICT__2013-14,nces_id,TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___DISTRICT__2013-14,LIMITED_ENGLISH_PROFICIENT__LEP__/_ENGLISH_LANGUAGE_LEARNERS__ELL___DISTRICT__2013-14,INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2013-14,FREE_AND_REDUCED_LUNCH_STUDENTS__PUBLIC_SCHOOL__2013-14,HISPANIC_STUDENTS__DISTRICT__2013-14,BLACK_STUDENTS__DISTRICT__2013-14,WHITE_STUDENTS__DISTRICT__2013-14,TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2013-14
0,A+ ARTS ACADEMY,Ohio,A+ ARTS ACADEMY,3900305,383.0,0.0,32.0,0.0,11.0,348.0,5.0,383.0
1,A+ CHILDREN'S ACADEMY,Ohio,A+ CHILDREN'S ACADEMY,3901480,104.0,0.0,14.0,89.0,0.0,34.0,60.0,104.0
2,A.B. GRAHAM ACADEMY,Ohio,A.B. GRAHAM ACADEMY,3901358,274.0,0.0,24.0,0.0,0.0,1.0,261.0,274.0
3,ACADEMY FOR URBAN SCHOLARS YOUNGSTOWN,Ohio,ACADEMY FOR URBAN SCHOLARS YOUNGSTOWN,3901472,89.0,0.0,1.0,0.0,8.0,73.0,4.0,89.0
4,ACADEMY OF COLUMBUS,Ohio,ACADEMY OF COLUMBUS,3900438,,,,,,,,


In [187]:
## keep the 2013-14 rows; which_year holds the raw column suffix, hence the
## leading underscore in the comparison value
cc_dem_rates_20132014 = cc_dem_rates.loc[cc_dem_rates.which_year == "_2013-14"].copy()

## add latitude and longitude
cc_dem_rates_wlat = pd.merge(cc_dem_rates_20132014,
                            ohio_ccd_2[['nces_id',
                                       'LATITUDE__DISTRICT__2013-14',
                                       'LONGITUDE__DISTRICT__2013-14']],
                            on = "nces_id",
                            how = "left")
## add the raw 2013-14 total enrollment next to the rates
cc_dem_rates_tomerge = pd.merge(cc_dem_rates_wlat, 
                               ohio_ccd[['nces_id',
                                        'TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___DISTRICT__2013-14']],
                               on = 'nces_id',
                               how = "left")



TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___DISTRICT__2013-14    138110  00  0003233331101731792921571659364422...
dtype: object

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1196 entries, 0 to 1195
Data columns (total 10 columns):
nces_id                                                       1196 non-null int64
which_year                                                    1196 non-null object
frpl_eligible_rate                                            1003 non-null float64
black_rate                                                    1003 non-null float64
white_rate                                                    1003 non-null float64
hispanic_rate                                                 1003 non-null float64
iep_rate                                                      1003 non-null float64
LATITUDE__DISTRICT__2013-14                                   1196 non-null object
LONGITUDE__DISTRICT__2013-14                                  1196 non-null object
TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___DISTRICT__2013-14    1196 non-null object
dtypes: float64(5), int64(1), object(4)
memory usage: 102.8

In [178]:
## reshape filings to wide
## (one row per nces_id, one column per year holding count_filings)
ohio_filings_postdem_wide = pd.pivot_table(ohio_filings_agg,
                                        index = ['nces_id'],
                                        columns = ['year'],
                                        values = 'count_filings').reset_index()


## prefix the year columns with "total_filings_"; nces_id is left as it is
ohio_filings_postdem_wide.columns = ["total_filings_" + str(col) if col in years_agg 
                                     else col for col in ohio_filings_postdem_wide.columns]
## display only
ohio_filings_postdem_wide.head()

Unnamed: 0,nces_id,total_filings_2014,total_filings_2015,total_filings_2016,total_filings_2017,total_filings_2018
0,3900017.0,,,,2.0,
1,3900027.0,,,,1.0,
2,3900032.0,,,,,1.0
3,3900036.0,,,1.0,,
4,3900038.0,,2.0,,,


In [179]:
## fill na with 0 since that indicates no filings that year
## (applies to districts that had filings in some years but not others)
ohio_filings_postdem_wide = ohio_filings_postdem_wide.fillna(0)

In [188]:
## left join with common core demographics
## (demographics are the left table, so districts without filings are kept)
## NOTE(review): the displayed nces_id looks integer on the demographics side
## and float on the filings side -- confirm the keys match as intended
cc_dem_rates_wcase = pd.merge(cc_dem_rates_tomerge,
                             ohio_filings_postdem_wide,
                             on = "nces_id",
                             how = "left")


## now fill na's for those cols with 0
## (districts absent from the filings table get NaN from the left join)
filings_cols = [col for col in cc_dem_rates_wcase.columns
                if "filings" in col]
cc_dem_rates_wcase[filings_cols] = cc_dem_rates_wcase[filings_cols].fillna(0)
## display only
cc_dem_rates_wcase.head()


Unnamed: 0,nces_id,which_year,frpl_eligible_rate,black_rate,white_rate,hispanic_rate,iep_rate,LATITUDE__DISTRICT__2013-14,LONGITUDE__DISTRICT__2013-14,TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___DISTRICT__2013-14,total_filings_2014,total_filings_2015,total_filings_2016,total_filings_2017,total_filings_2018
0,3900001,_2013-14,0.428,0.268,0.652,0.043,0.08,40.0648,-83.0043,138.0,0.0,0.0,0.0,0.0,0.0
1,3900002,_2013-14,0.373,0.155,0.782,0.009,0.045,40.0691,-83.0188,110.0,0.0,0.0,0.0,0.0,0.0
2,3900005,_2013-14,,,,,,,,,0.0,0.0,0.0,0.0,0.0
3,3900006,_2013-14,,,,,,,,,0.0,0.0,0.0,0.0,0.0
4,3900008,_2013-14,,,,,,39.9441,-82.0037,0.0,0.0,0.0,0.0,0.0,0.0


In [189]:
## write to csv
## (final table: demographics, coordinates, enrollment and yearly filing
## counts; the row index is not written)
cc_dem_rates_wcase.to_csv(base_path + "data/ohio/cleaned/filings_withdem.csv",
                  index = False)