# Clean and link filings



## 0. Imports and functions

In [22]:

import pandas as pd
import pickle
import re
import numpy as np
import plotnine
from plotnine import *


def _three_decimal_float(value):
    """Return ``value`` formatted with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the three-decimal formatter whenever pandas renders a float.
pd.set_option('display.float_format', _three_decimal_float)

# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 1. Load demographic data


In [23]:
# Read the two cleaned 2013 DC school-level tables from the intermediate objects folder.
df_crdc = pd.read_pickle("../../intermediate_objects/cleaned_df/dc_crdc_2013.pkl")
df_cc = pd.read_pickle("../../intermediate_objects/cleaned_df/dc_ccd_2013.pkl")

# Preview the first rows of each table.
df_cc.head()
df_crdc.head()

# Join key: NCESSCH rendered as text with every "." stripped out.
# NOTE(review): when NCESSCH is stored as a float (e.g. 110007200405.0) the
# stripped text ends in an extra "0"; any table joined on this key has to build
# its key the same way -- confirm before changing either side.
nces_as_text = df_crdc["NCESSCH"].astype(str)
df_crdc["ncesid_clean"] = nces_as_text.str.replace("\\.", "", regex = True)

which_var,nces_name,which_year,frpl_eligible_rate,black_rate,white_rate,hispanic_rate
0,ACADEMY OF HOPE ADULT PCS,_2014-15,,,,
1,ACADEMY OF HOPE ADULT PCS,_2015-16,,,,
2,ACHIEVEMENT PREPARATORY PCS ELEMENTARY,_2013-14,0.987,1.0,0.0,0.0
3,ACHIEVEMENT PREPARATORY PCS ELEMENTARY,_2014-15,0.625,0.996,0.004,0.0
4,ACHIEVEMENT PREPARATORY PCS ELEMENTARY,_2015-16,0.538,0.978,0.0,0.022


Unnamed: 0,SCHOOL_NAME,STUDENTS_WITH_DISABILITIES_SERVED_UNDER_IDEA,STUDENTS_WITH_DISABILITIES_SERVED_UNDER_SECTION_504,total_students_iep_data,STUDENTS_WITH_DISABILITIES_SERVED_UNDER_SECTION_504_rate,STUDENTS_WITH_DISABILITIES_SERVED_UNDER_IDEA_rate,YEAR,NCESSCH,total_students_ressec_data,TOTAL_RESTRAINT_SECLUDE,TOTAL_RESTRAINT_SECLUDE_rate,merge_ieprestraints,total_students_disc_data,TOTAL_DISCIPLINE,TOTAL_DISCIPLINE_rate,merge_iepdisc
0,ACHIEVEMENT PREPARATORY ACADEMY PCS,,7.0,355.0,0.02,,2013.0,110007200405.0,382.0,0.0,0.0,both,382.0,205.0,0.537,both
1,ACHIEVEMENT PREPARATORY PCS-ELEMENTARY,,2.0,222.0,0.009,,2013.0,110007200473.0,233.0,0.0,0.0,both,233.0,20.0,0.086,both
2,AITON ES,26.0,4.0,249.0,0.016,0.104,2013.0,110003000122.0,247.0,0.0,0.0,both,247.0,57.0,0.231,both
3,AMIDON BOWEN ES,50.0,2.0,341.0,0.006,0.147,2013.0,110003000121.0,342.0,0.0,0.0,both,342.0,69.0,0.202,both
4,ANACOSTIA SHS,224.0,7.0,766.0,0.009,0.292,2013.0,110003000085.0,751.0,0.0,0.0,both,751.0,338.0,0.45,both


## 2. Load filings data

In [50]:
# Load the cleaned DC filings table.
# NOTE: read_pickle can execute arbitrary code on load; only point it at
# files produced by this project's own pipeline.
filings = pd.read_pickle("../../intermediate_objects/cleaned_df/dc_filings.pkl")

# Share of filings by type of closing order.
filings.type_closingorder.value_counts(normalize = True)

Order   0.635
HOD     0.365
Name: type_closingorder, dtype: float64

In [46]:
## First attempt: link filings to CRDC through the NCES school-level id.

# Filings-side key: ncessch rendered as text with every "." stripped out,
# left as NaN for filings that carry no NCES id.
filings_has_nces = filings["ncessch"].notnull()
filings["ncesid_clean"] = np.where(
    filings_has_nces,
    filings["ncessch"].astype(str).str.replace("\\.", "", regex = True),
    np.nan,
)

# Number of distinct case numbers per school, using only filings with a key.
filings_with_key = filings[filings["ncesid_clean"].notnull()]
filings_agg = (
    filings_with_key
    .groupby(["ncesid_clean"])["case_no"]
    .nunique()
    .reset_index()
)

# Left join keeps every CRDC row; crdc_status records whether a row found a
# match in the aggregated filings ("both") or not ("left_only").
filings_agg_wcrdc = df_crdc.merge(
    filings_agg,
    how = "left",
    on = "ncesid_clean",
    indicator = "crdc_status",
)

# Keep schools whose IDEA count is present and whose IDEA rate is below 0.5.
idea_count_present = filings_agg_wcrdc["STUDENTS_WITH_DISABILITIES_SERVED_UNDER_IDEA"].notnull()
idea_rate_below_half = filings_agg_wcrdc["STUDENTS_WITH_DISABILITIES_SERVED_UNDER_IDEA_rate"] < 0.5
filings_agg_complete = filings_agg_wcrdc[idea_count_present & idea_rate_below_half].copy()

# Label each remaining school by whether the join found any filings for it.
school_has_filings = filings_agg_complete["crdc_status"] == "both"
filings_agg_complete["case_status"] = np.where(school_has_filings, "Any filings", "No filings")

## Write csv for R.
filings_agg_complete.to_csv(
    "../../intermediate_objects/cleaned_df/dc_filings_wcrdc.csv",
    index = False,
)

In [None]:
## TODO -- where things left off:
## - fuzzy matching NCES and CCD
## Next steps:
## - load in the matched data
## - get better ids
## - do a left join on the NCES id; hopefully more schools will then also have CCD data