# Clean and link filings



## 0. Imports and functions

In [1]:
from tabula import read_pdf
import os
import pandas as pd
import pickle
import re
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## profiling
import time

## plotting
from plotnine import *



## first, clean case type
def process_type(one_row):
    """Return a lower-cased case-type string with split keywords re-joined.

    The input is coerced with ``str`` first because some values arrive as
    non-strings (the original note mentions dates).  Whitespace that follows
    the fragments ``Exped``/``Expedi``/``Expedit``/``Expedite``, ``Discip``
    and ``Aga``/``Agai``/``Again`` is removed so the word is glued back
    together with whatever comes next.
    """
    text = str(one_row)

    ## one pass closes a single gap per word, so run the expedited/discipline
    ## clean-up twice to repair words that were split in two places
    for _ in range(2):
        text = re.sub(r'(Exped(i)?(t)?(e)?|Discip)\s+', r'\1', text)

    ## re-join a split "Aga..." fragment (presumably "Against LEA" -- confirm)
    text = re.sub(r'(Aga(i)?(n)?)\s+', r'\1', text)

    return text.lower()

def process_schoolname(one_name):
    """Return an upper-cased school name with common split words repaired.

    The value is coerced with ``str`` and upper-cased, then a fixed series
    of regex substitutions is applied in order:

    * whitespace directly after ``SCHOO``, ``MIDD`` or ``EDUCAT`` is removed;
    * ``MIDDLE``, ``ELEMENTARY``, ``CAMPUS`` and ``EDUCATION`` spelled with
      at most one whitespace character between letters are collapsed;
    * ``SCHOOI`` is rewritten to ``SCHOOL`` (presumably an OCR slip).
    """
    cleaned = str(one_name).upper()

    ## (pattern, replacement) pairs -- order matters, later rules see the
    ## output of earlier ones
    substitutions = (
        ## glue fragments back onto the rest of the word
        (r'(SCHOO)\s+', r'\1'),
        (r'(MIDD)\s+', r'\1'),
        (r'(EDUCAT)\s+', r'\1'),
        ## collapse letter-spaced words
        (r'M(\s)?I(\s)?D(\s)?D(\s)?L(\s)?E', r"MIDDLE"),
        (r'E(\s)?L(\s)?E(\s)?M(\s)?E(\s)?N(\s)?T(\s)?A(\s)?R(\s)?Y', r"ELEMENTARY"),
        (r'SCHOOI', "SCHOOL"),
        (r'C(\s)?A(\s)?M(\s)?P(\s)?U(\s)?S', r"CAMPUS"),
        (r'E(\s)?D(\s)?U(\s)?C(\s)?A(\s)?T(\s)?I(\s)?O(\s)?N', r"EDUCATION"),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def replace_schooltype(one_string):
    """Expand a trailing school-type abbreviation into its full form.

    A final ``ES``/``ELEMENTARY`` becomes ``ELEMENTARY SCHOOL``, ``EC``
    becomes ``ELEMENTARY CAMPUS``, ``MS``/``MIDDLE`` becomes
    ``MIDDLE SCHOOL`` and ``HS``/``HIGH`` becomes ``HIGH SCHOOL``.

    The token must be a whole word: each pattern starts with ``\\b`` so that
    names which merely end in those letters (``JONES``, ``ADAMS``) are left
    alone instead of being turned into ``JONELEMENTARY SCHOOL``.

    Parameters
    ----------
    one_string : str
        School name; matching is case-sensitive, so callers are expected to
        pass an upper-cased name.

    Returns
    -------
    str
        The name with any trailing school-type token expanded.
    """
    expansions = (
        (r'\b(?:ES|ELEMENTARY)$', r'ELEMENTARY SCHOOL'),
        (r'\bEC$', r'ELEMENTARY CAMPUS'),
        (r'\b(?:MS|MIDDLE)$', r'MIDDLE SCHOOL'),
        (r'\b(?:HS|HIGH)$', r'HIGH SCHOOL'),
    )

    expanded = one_string
    for pattern, full_name in expansions:
        expanded = re.sub(pattern, full_name, expanded)

    return expanded


    

def ngrams(string, n=3):
    """Split ``string`` into overlapping character n-grams.

    Commas, hyphens, periods, slashes and any ``BD`` preceded by a
    whitespace character are stripped before slicing.  Returns a list of
    every length-``n`` substring of the cleaned text, in order; the list is
    empty when the cleaned text is shorter than ``n`` or ``n`` is below 1.
    """
    cleaned = re.sub(r'[,-./]|\sBD',r'', string)

    ## no windows exist for a non-positive width
    if n < 1:
        return []

    last_start = len(cleaned) - n
    return [cleaned[start:start + n] for start in range(last_start + 1)]

from sklearn.feature_extraction.text import TfidfVectorizer


def find_fuzzy_namematches(one_name: str, all_names: list, 
                           score_cutoff):
    """Look up the best fuzzy match for ``one_name`` among ``all_names``.

    Delegates to ``process.extractBests`` with ``limit = 1`` and the given
    ``score_cutoff``.  The hits are returned as a DataFrame with columns
    ``matched_name``, ``score`` and ``original_name``; when nothing is
    returned by the matcher the frame has no rows.
    """
    ## ask the matcher for at most one candidate that clears the cutoff
    best_hits = process.extractBests(one_name, all_names,
                                     limit = 1,
                                     score_cutoff = score_cutoff)

    ## tabulate the hits and tag every row with the name that was searched
    hits_df = pd.DataFrame(list(best_hits), columns = ['matched_name', 'score'])
    return hits_df.assign(original_name = one_name)

## resource-- package installation issues: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html



def replace_missing_nces(one_val):
    """Return ``one_val`` if it is a usable count, otherwise ``np.nan``.

    * Strings are kept only when they consist entirely of digits
      (``str.isdigit``); any other string becomes ``np.nan``.
    * Real numbers (Python or NumPy ints/floats) are passed through
      unchanged, so an existing NaN stays NaN.  Melting columns that pandas
      parsed as numeric yields such values, and calling ``.isdigit()`` on
      them would raise ``AttributeError``.
    * Anything else (``None``, booleans, other objects) becomes ``np.nan``.
    """
    if isinstance(one_val, str):
        return one_val if one_val.isdigit() else np.nan

    ## bool is a subclass of int but is never a count
    if isinstance(one_val, bool):
        return np.nan

    if isinstance(one_val, (int, float, np.integer, np.floating)):
        return one_val

    return np.nan

def aggregate_crdc(var_col, value_col, 
                  data,
                  id_col = ["school_name", "ncessch"],
                  year_chosen = 2013, format = "long"):
    """Build a one-year table of counts plus ``<COUNT>_rate`` columns.

    Parameters
    ----------
    var_col : str
        Column whose values become the wide columns (``format == "long"``
        only).
    value_col : str
        Column holding the counts to spread (``format == "long"`` only).
    data : pandas.DataFrame
        Must contain a ``year`` column.  After reshaping and upper-casing
        there must be a ``TOTAL`` column to serve as the denominator.
    id_col : list of str
        Identifier columns used as the pivot index.  The list is copied,
        never modified.
    year_chosen : int
        Only rows with ``data.year == year_chosen`` are used.
    format : str
        ``"long"`` pivots ``data`` with ``pd.pivot_table`` (whose default
        aggregation averages duplicate index/column combinations); any
        other value uses the filtered rows as they are.

    Returns
    -------
    pandas.DataFrame
        Column names are upper-cased with whitespace runs replaced by
        ``_``.  Every column other than ``SCHOOL_NAME``, ``TOTAL``,
        ``YEAR``, ``NCESSCH`` and the identifier columns is converted with
        ``pd.to_numeric`` and gets a ``<COLUMN>_rate`` companion equal to
        the column divided by ``TOTAL``.  Rate columns are appended in the
        order of the underlying count columns.
    """
    ## work on a copy so the shared default list can never be mutated
    id_cols = list(id_col)
    in_year = data.year == year_chosen

    ## shape from long to wide
    if format == "long":
        df_wide = pd.pivot_table(data.loc[in_year,
                                          id_cols + [var_col] + [value_col]],
                                 index = id_cols,
                                 values = value_col,
                                 columns = var_col).reset_index()
    else:
        df_wide = data.loc[in_year].copy()

    ## standardize columns
    df_wide.columns = [re.sub(r"\s+", "_", col.upper())
                       for col in df_wide.columns]

    ## identifiers, the year and the denominator are not counts
    not_counts = {"SCHOOL_NAME", "TOTAL", "YEAR", "NCESSCH"}
    not_counts.update(re.sub(r"\s+", "_", col.upper()) for col in id_cols)

    ## keep the frame's own column order so the output layout is stable
    ## from run to run (a set would iterate in arbitrary order)
    count_cols = [col for col in df_wide.columns if col not in not_counts]

    ## generate rates; the denominator only needs converting once
    if count_cols:
        df_wide['TOTAL'] = pd.to_numeric(df_wide['TOTAL'])
    for col in count_cols:
        df_wide[col] = pd.to_numeric(df_wide[col])
        df_wide['{}_rate'.format(col)] = df_wide[col]/df_wide['TOTAL']

    return df_wide
    
def aggregate_nces(var_pattern, varname_clean, id_col, 
                       cc_data_merged,
                      enrollment_vars, base_name_raw):
    """Compute a per-school, per-year rate from wide NCES columns.

    Parameters
    ----------
    var_pattern : str
        Substring selecting the numerator columns.  It must also equal the
        numerator's column name once the year suffix is removed, because it
        is used to index the pivoted table.
    varname_clean : str
        Prefix of the output column, which is named ``<varname_clean>_rate``.
    id_col : list of str
        Identifier columns kept through the reshape.
    cc_data_merged : pandas.DataFrame
        Wide table with one column per variable and year.
    enrollment_vars : list of str
        Denominator columns (one per year).
    base_name_raw : str
        Denominator column name once the year suffix is removed.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_col + [<varname_clean>_rate, 'which_year']``.
    """
    dem_vars = [col for col in cc_data_merged if var_pattern in col]
    long_df = pd.melt(cc_data_merged[dem_vars + enrollment_vars + id_col],
                      id_vars = id_col)

    ## blank out non-count cells, then make everything numeric
    long_df['clean_value_1'] = [replace_missing_nces(val) for val in long_df.value]
    long_df['clean_value'] = pd.to_numeric(long_df.clean_value_1)

    ## split each melted column name into its variable stem and year suffix:
    ## the stem is everything before the first "_20", the suffix is the rest
    variable_names = long_df.variable.astype(str)
    long_df['which_var'] = variable_names.str.replace("\\_20.*", "", regex = True)

    ## slice the stem off instead of building a regex from the stems: the
    ## names are data, so they may contain regex metacharacters, and one stem
    ## may be a prefix of another
    long_df['which_year'] = [name[len(stem):]
                             for name, stem in zip(variable_names, long_df.which_var)]

    long_toagg = long_df[id_col + ['which_var', 'which_year', 'clean_value']]

    ## do the aggregation: one row per school/year, one column per variable
    index_cols = id_col + ["which_year"]
    df_wide = long_toagg.pivot_table(index = index_cols,
                                     values = 'clean_value',
                                     columns = 'which_var').reset_index()

    ## do the division
    rate_varname = varname_clean + '_rate'
    df_wide[rate_varname] = df_wide[var_pattern]/df_wide[base_name_raw]

    ## return cleaned data
    return df_wide[id_col + [rate_varname] + ['which_year']]
        



# 1. Load demographic data

Two sources right now:

- NCES Common Core --- SY 2012-2013 to SY 2017-2018
- DOE Civil Rights Data Collection (CRDC) --- focusing on SY 2013

## 1.1 common core

In [2]:
## the export is limited to 75 columns per pull, so the Common Core data
## comes in two files: this one holds the first 75 columns
cc_data_1 = pd.read_csv("../../raw_input/dc/dc_ccd.csv")

## second pull with the remaining columns
cc_data_2 = pd.read_csv("../../raw_input/dc/dc_ccd_pull2.csv",
                        encoding = 'unicode_escape')

## from the first pull keep the columns the second pull lacks, plus the
## school name (merge key) and the NCES school id
cc_data_1_topull = cc_data_1.columns.difference(cc_data_2.columns).tolist()
cc_data_1_topull = cc_data_1_topull + [
    "School Name",
    "School ID - NCES Assigned [Public School] Latest available year"]

## left-merge on school name; the second pull's NCES id column is dropped
cc_data_merged = cc_data_1[cc_data_1_topull].merge(
    cc_data_2.loc[:, [col for col in cc_data_2.columns if
        "School ID - NCES Assigned [Public School] Latest available year" not in col]],
    on = ["School Name"],
    how = "left")

## upper-case the names and turn whitespace runs, brackets and parentheses
## into underscores
cc_cleancols = [re.sub("\\s+|\\[|\\]|\\(|\\)", "_", name).upper()
                for name in cc_data_merged.columns]
cc_data_merged.columns = cc_cleancols


# 2. Clean NCES Common Core data

In [3]:
## denominator columns: every total-enrollment column (one per year)
enrollment_vars = [name for name in cc_data_merged.columns
                   if 'TOTAL_STUDENTS' in name]

## enrollment column name with the year suffix removed
base_name_raw = 'TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___PUBLIC_SCHOOL_'

## school identifier column
id_col = "SCHOOL_NAME"

    
    

In [4]:
## free-lunch-eligible students as a share of total enrollment,
## per school and year
frpl_rate = aggregate_nces(
    cc_data_merged = cc_data_merged,
    id_col = ['SCHOOL_NAME'],
    var_pattern = "FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL_",
    varname_clean = "frpl_eligible",
    enrollment_vars = enrollment_vars,
    base_name_raw = base_name_raw)


In [5]:
## denominator for the race shares: the total race/ethnicity columns
race_enrollment_vars = [name for name in cc_data_merged.columns
                        if "TOTAL_RACE" in name]

## same aggregation for each group; only the numerator column and the
## output label differ
black_rate, white_rate, hisp_rate = [
    aggregate_nces(var_pattern = numerator,
                   varname_clean = label,
                   enrollment_vars = race_enrollment_vars,
                   base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_",
                   id_col = ['SCHOOL_NAME'],
                   cc_data_merged = cc_data_merged)
    for numerator, label in (("BLACK_STUDENTS__PUBLIC_SCHOOL_", "black"),
                             ("WHITE_STUDENTS__PUBLIC_SCHOOL_", "white"),
                             ("HISPANIC_STUDENTS__PUBLIC_SCHOOL_", "hispanic"))
]

In [47]:
## combine the four rate tables into one frame: index each by school and
## year so that concatenating column-wise lines the rows up
dfs = [rate_df.set_index(['SCHOOL_NAME', 'which_year'])
       for rate_df in (frpl_rate, black_rate, white_rate, hisp_rate)]

cc_dem_rates = (pd.concat(dfs, axis=1)
                  .reset_index()
                  .rename(columns = {'SCHOOL_NAME': 'nces_name'}))



## 3. Civil rights data collection


In [25]:
## load the disability (IEP) enrollment extract
## rj note: reads a saved extract for now; switch to the API later
crdc = pd.read_csv("../../raw_input/dc/EducationDataPortal_03.07.2020_disability.csv")

## how many rows does each year contribute?
crdc["year"].value_counts()

2015    660
2013    606
Name: year, dtype: int64

In [26]:
## aggregate by school name only: ncessch is left out of the id columns
## because in this extract it is the district id, not the school id
iep_summary = aggregate_crdc(data = crdc,
                             id_col = ['school_name'],
                             var_col = "disability",
                             value_col = "enrollment_crdc")

In [None]:
## read and clean discipline data

In [27]:
## school-level extract holding the discipline and restraint/seclusion counts
crdc_largerpull = pd.read_csv("../../raw_input/dc/EducationDataPortal_03.08.2020_schools.csv")

## treat every missing cell as 0
crdc_largerpull_fill = crdc_largerpull.fillna(value = 0)


In [28]:
## discipline: any column mentioning suspensions, expulsions or corporal
## punishment
discipline_cols = [name for name in crdc_largerpull_fill.columns
                   if any(key in name for key in ("susp", "expulsions", "corporal"))]

## restraint / seclusion columns
restr_secl_cols = [name for name in crdc_largerpull_fill.columns
                   if any(key in name for key in ("restraint", "seclusion"))]

## row-wise totals across each family of columns
crdc_largerpull_fill['total_discipline'] = crdc_largerpull_fill.loc[:, discipline_cols].sum(axis = 1)
crdc_largerpull_fill['total_restraint_seclude'] = crdc_largerpull_fill.loc[:, restr_secl_cols].sum(axis = 1)

In [29]:
## keep schools with usable enrollment for the discipline rates.
## The comparison is done on the string form of the column: the frame was
## built with fillna(0), which inserts a numeric 0 that never equals the
## string '0', so a plain isin(['0', ...]) lets zero-enrollment rows through
## ('0.0' covers the case where the column is float-typed)
crdc_disc_foragg = crdc_largerpull_fill.loc[
    ~crdc_largerpull_fill.enrollment.astype(str).isin(['0', '0.0',
                                                      'Not applicable']),
    ['school_name', 'year',
     'enrollment',
     'total_discipline', 'ncessch']].copy()

## enrollment is the denominator, which aggregate_crdc expects as "total"
crdc_disc_foragg.rename(columns = {'enrollment':
            'total'}, inplace = True)



In [41]:
## discipline counts per enrolled student; the data is already one row per
## school, so aggregate_crdc is called in "wide" mode (var_col / value_col
## are not used on that path)
disc_summary = aggregate_crdc(data = crdc_disc_foragg,
                             var_col = "total_discipline", 
                            value_col = "total_discipline",
                             format = "wide")

## same subset for restraint / seclusion.
## The enrollment filter compares on the string form of the column: the
## frame was built with fillna(0), which inserts a numeric 0 that never
## equals the string '0', so a plain isin(['0', ...]) lets zero-enrollment
## rows through ('0.0' covers the case where the column is float-typed)
crdc_res_foragg = crdc_largerpull_fill.loc[
    ~crdc_largerpull_fill.enrollment.astype(str).isin(['0', '0.0',
                                                      'Not applicable']),
    ['school_name', 'year', "ncessch",
     'enrollment',
     'total_restraint_seclude']].copy()

## enrollment is the denominator, which aggregate_crdc expects as "total"
crdc_res_foragg.rename(columns = {'enrollment':
            'total'}, inplace = True)

res_summary = aggregate_crdc(data = crdc_res_foragg,
                             var_col = "total_restraint_seclude", 
                            value_col = "total_restraint_seclude",
                             format = "wide")



### 3.3 Merge the different CRDC data

In [None]:
## every summary has its own TOTAL (enrollment) column; give each a
## source-specific name so they stay distinguishable after merging
res_summary = res_summary.rename(columns = {'TOTAL': 'total_students_ressec_data'})
disc_summary = disc_summary.rename(columns = {'TOTAL': 'total_students_disc_data'})
iep_summary = iep_summary.rename(columns = {'TOTAL': 'total_students_iep_data'})

## column lists without the year
res_exclude_year = [name for name in res_summary.columns if not name == "YEAR"]
disc_exclude_year = [name for name in disc_summary.columns if not name == "YEAR"]


In [None]:
## start from the IEP summary and left-join restraint/seclusion by school
## name; the indicator column records where each row was found
iep_res = iep_summary.merge(res_summary,
                            on = "SCHOOL_NAME",
                            how = "left",
                            indicator = "merge_ieprestraints")

## then left-join discipline, leaving out its YEAR and NCESSCH columns
dfs_crdc = iep_res.merge(disc_summary.drop(columns = ['YEAR', 'NCESSCH'],
                                           errors = 'ignore'),
                         on = 'SCHOOL_NAME',
                         how = 'left',
                         indicator = "merge_iepdisc",
                         suffixes = ["_iepres", "_discipline"])


In [None]:
## add race from crdc using urban institute API
from urllib.request import urlopen
from json import loads
url = "https://educationdata.urban.org/api/v1/schools/crdc/enrollment/2013/race/sex/"

## the context manager closes the HTTP connection once the body has been
## read; `response` stays bound afterwards (as a closed response object)
## NOTE(review): this performs a single request -- confirm whether the
## endpoint paginates its results
with urlopen(url) as response:
    data = loads(response.read())


In [None]:
## echo the response object returned by urlopen (notebook display only)
response

# 4. Write the results to merge with the filings data



In [48]:
## csv copies of both cleaned tables
dfs_crdc.to_csv("../../intermediate_objects/cleaned_df/dc_crdc_2013.csv",
                index = False)
cc_dem_rates.to_csv("../../intermediate_objects/cleaned_df/dc_ccd_2013.csv",
                    index = False)

## pickled copies of the same tables
dfs_crdc.to_pickle("../../intermediate_objects/cleaned_df/dc_crdc_2013.pkl")
cc_dem_rates.to_pickle("../../intermediate_objects/cleaned_df/dc_ccd_2013.pkl")