# Clean and link filings



## 0. Imports and display settings

In [1]:

import pandas as pd
import pickle
import re
import numpy as np
import plotnine
from plotnine import *


def _three_decimals(value):
    """Format a number with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the formatter above whenever pandas displays floating-point values.
pd.set_option('display.float_format', _three_decimals)
# Configure IPython so that every top-level expression in a cell is displayed,
# rather than only the last one (the "all" setting of ast_node_interactivity).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 1. Load demographic data


In [2]:
# Load the CRDC and CCD pickles (file names indicate DC, 2013).
crdc_pickle = "../../intermediate_objects/cleaned_df/dc_crdc_2013.pkl"
ccd_pickle = "../../intermediate_objects/cleaned_df/dc_ccd_2013.pkl"
df_crdc = pd.read_pickle(crdc_pickle)
df_cc = pd.read_pickle(ccd_pickle)

# Join key: NCESSCH rendered as text with every literal "." removed.
# NOTE(review): if NCESSCH is stored as a float, astype(str) yields e.g. "123.0"
# and stripping the dot leaves a trailing "0" -- confirm the column dtype.
nces_as_text = df_crdc['NCESSCH'].astype(str)
df_crdc['ncesid_clean'] = nces_as_text.str.replace("\\.", "", regex = True)

## 2. Load filings data

In [3]:
# Load the pickled filings DataFrame from the cleaned_df intermediates.
filings_pickle = "../../intermediate_objects/cleaned_df/dc_filings.pkl"
filings = pd.read_pickle(filings_pickle)


In [5]:
## First linkage attempt: match filings to CRDC on the NCES school-level id.

# Stringify ncessch and drop literal dots to form the join key; rows with no
# ncessch keep NaN as their key so they can be filtered out with notnull().
has_ncessch = filings['ncessch'].notnull()
ncessch_no_dots = filings['ncessch'].astype(str).str.replace("\\.", "", regex = True)
filings['ncesid_clean'] = np.where(has_ncessch, ncessch_no_dots, np.nan)


## Count distinct case numbers per school, using only filings with a non-missing id.
filings_with_id = filings.loc[filings['ncesid_clean'].notnull()]
filings_agg = (
    filings_with_id
    .groupby(['ncesid_clean'])
    .agg({'case_no': 'nunique'})
    .reset_index()
)

## Left join the per-school filing counts onto CRDC so every CRDC row is kept.
## The indicator column crdc_status is "both" for rows that matched a filings
## aggregate and "left_only" for rows that did not.
filings_agg_wcrdc = df_crdc.merge(
    filings_agg,
    how = 'left',
    on = 'ncesid_clean',
    indicator = 'crdc_status',
)


# Keep rows with a non-missing STUDENTS_WITH_DISABILITIES_SERVED_UNDER_IDEA value
# whose ..._rate column is below 0.5.
# NOTE(review): the 0.5 cutoff is not explained anywhere in this notebook --
# confirm the rationale.
idea_value_present = filings_agg_wcrdc['STUDENTS_WITH_DISABILITIES_SERVED_UNDER_IDEA'].notnull()
idea_rate_below_half = filings_agg_wcrdc['STUDENTS_WITH_DISABILITIES_SERVED_UNDER_IDEA_rate'] < 0.5
filings_agg_complete = filings_agg_wcrdc.loc[idea_value_present & idea_rate_below_half].copy()

# Label each remaining row by whether the merge found a filings aggregate for it.
matched_filings = filings_agg_complete['crdc_status'] == "both"
filings_agg_complete['case_status'] = np.where(matched_filings, "Any filings", "No filings")

## Export the school-level table as CSV (index omitted) for use in R.
agg_csv_path = "../../intermediate_objects/cleaned_df/dc_filings_wcrdc.csv"
filings_agg_complete.to_csv(agg_csv_path, index = False)

In [9]:
## Filing-level (non-aggregated) version: one row per filing with CRDC columns attached.

# Left join keeps every filing; crdc_status is "left_only" for filings that
# found no CRDC row with the same ncesid_clean.
filings_indiv_wcrdc = filings.merge(
    df_crdc,
    how = 'left',
    on = 'ncesid_clean',
    indicator = 'crdc_status',
)

# Write the filing-level table as CSV without the index.
nonagg_csv_path = "../../intermediate_objects/cleaned_df/dc_filings_wcrdc_nonagg.csv"
filings_indiv_wcrdc.to_csv(nonagg_csv_path, index = False)

In [None]:
## Where things left off:
## - fuzzy matching NCES and CCD
## Next steps:
## - load in the matched data
## - get better ids
## - do the left join on NCES; hopefully more schools will then also have CCD data