## 0. Imports and functions

In [2]:
from tabula import read_pdf
import os
import pandas as pd
import pickle
import re
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## profiling
import time

## plotting
from plotnine import *

## dates
from dateutil.relativedelta import relativedelta



## first, clean case type
def process_type(one_row):
    """Rejoin case-type words that were split by stray whitespace.

    The value is converted to a string first (some entries are dates).
    Whitespace following a partial "Exped..." / "Discip" prefix is removed,
    then the same is done for a partial "Aga..." prefix.

    Returns the cleaned string.
    """
    cleaned = str(one_row)

    ## clean for expedited discipline
    ## two passes: a word broken in two places ("Exped ite d") only exposes
    ## its second gap after the first gap has been closed
    expedited_gap = r'(Exped(i)?(t)?(e)?|Discip)\s+'
    for _ in range(2):
        cleaned = re.sub(expedited_gap, r'\1', cleaned)

    ## clean for lea
    return re.sub(r'(Aga(i)?(n)?)\s+', r'\1', cleaned)

def process_schoolname(one_name):
    """Uppercase a school name and repair common split or misread words.

    The substitutions run in a fixed order: first whitespace after the
    partial words SCHOO / MIDD / EDUCAT is removed, then letter-spaced
    spellings of MIDDLE, ELEMENTARY, CAMPUS and EDUCATION are collapsed and
    the misspelling SCHOOI is corrected.

    Returns the cleaned, uppercased string.
    """
    cleaned = str(one_name).upper()

    ## (pattern, replacement) pairs, applied top to bottom
    ordered_fixes = (
        ## clean up schools
        (r'(SCHOO)\s+', r'\1'),
        (r'(MIDD)\s+', r'\1'),
        (r'(EDUCAT)\s+', r'\1'),
        ## concat whitespace
        (r'M(\s)?I(\s)?D(\s)?D(\s)?L(\s)?E', r"MIDDLE"),
        (r'E(\s)?L(\s)?E(\s)?M(\s)?E(\s)?N(\s)?T(\s)?A(\s)?R(\s)?Y', r"ELEMENTARY"),
        (r'SCHOOI', "SCHOOL"),
        (r'C(\s)?A(\s)?M(\s)?P(\s)?U(\s)?S', r"CAMPUS"),
        (r'E(\s)?D(\s)?U(\s)?C(\s)?A(\s)?T(\s)?I(\s)?O(\s)?N', r"EDUCATION"),
    )
    for pattern, replacement in ordered_fixes:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def replace_schooltype(one_string):
    """Expand a trailing school-type abbreviation into its full form.

    A name ending in the word ES or ELEMENTARY gets "ELEMENTARY SCHOOL",
    EC gets "ELEMENTARY CAMPUS", MS or MIDDLE gets "MIDDLE SCHOOL", and
    HS or HIGH gets "HIGH SCHOOL".

    The suffix must be a whole word: without the word boundary, names that
    merely end in those letters were mangled ("JONES" became
    "JONELEMENTARY SCHOOL", "ADAMS" became "ADAMIDDLE SCHOOL").

    Returns the expanded string; names without such a suffix are unchanged.
    """
    es = re.sub(r'\b(?:ES|ELEMENTARY)$', r'ELEMENTARY SCHOOL', one_string)
    ec = re.sub(r'\bEC$', r'ELEMENTARY CAMPUS', es)
    ms = re.sub(r'\b(?:MS|MIDDLE)$', r'MIDDLE SCHOOL', ec)
    hs = re.sub(r'\b(?:HS|HIGH)$', r'HIGH SCHOOL', ms)

    return hs


    

def ngrams(string, n=3):
    """Return the overlapping character n-grams of a lightly cleaned string.

    Before slicing, the characters ',', '-', '.', '/' and any
    whitespace-plus-"BD" sequence are removed. A string shorter than n
    after cleaning yields an empty list.
    """
    ## the class [,-./] is the range ',' through '.' (i.e. ',', '-', '.') plus '/'
    cleaned = re.sub(r'[,-./]|\sBD',r'', string)
    ## n progressively shifted copies; zipping them walks a window of width n
    shifted_copies = (cleaned[offset:] for offset in range(n))
    return [''.join(window) for window in zip(*shifted_copies)]

from sklearn.feature_extraction.text import TfidfVectorizer


def find_fuzzy_namematches(one_name: str, all_names: list, 
                           score_cutoff):
    """Look up the single best fuzzy match for one name.

    Calls `process.extractBests` with `limit = 1` and the given
    `score_cutoff`, and returns the result as a DataFrame with columns
    `matched_name`, `score` and `original_name`. The frame is empty when
    nothing passes the cutoff.
    """
    ## extract matches above cutoff
    best_matches = process.extractBests(one_name, all_names,
                                        limit = 1,
                                        score_cutoff = score_cutoff)

    ## make into a dataframe (will thus only capture ones with matches)
    matches_df = pd.DataFrame(list(best_matches), columns = ['matched_name', 'score'])
    return matches_df.assign(original_name = one_name)

## resource-- package installation issues: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html



def replace_missing_nces(one_val):
    """Turn a non-numeric NCES cell into NaN.

    Strings made only of digits are returned unchanged; any other string
    (e.g. the '†' placeholder that appears in the export) becomes NaN.

    Values that are already numeric are passed through, and anything else
    (such as None) becomes NaN. Previously every non-string raised
    AttributeError because `.isdigit()` only exists on strings.
    """
    if isinstance(one_val, str):
        return one_val if one_val.isdigit() else np.nan

    ## bool is a subclass of int but is not a count
    is_number = isinstance(one_val, (int, float, np.number)) and not isinstance(one_val, bool)
    return one_val if is_number else np.nan

def aggregate_crdc(var_col, value_col, 
                  data,
                  id_col = "school_name",
                  year_chosen = 2013, format = "long"):
    """Build one row per id for a single year, with a rate column per count.

    Parameters
    ----------
    var_col, value_col : column names holding the variable label and its
        value (used only when `format == "long"`).
    data : DataFrame with a `year` column.
    id_col : identifier column used as the pivot index.
    year_chosen : only rows with this `year` are kept.
    format : "long" pivots `data` to wide; any other value uses the rows
        as they are.

    Column names are uppercased with whitespace runs replaced by "_".
    Every column other than the id column, SCHOOL_NAME, TOTAL and YEAR is
    converted to numeric and divided by TOTAL into "<COLUMN>_rate".

    Returns the wide DataFrame.
    """

    def _standardize(name):
        ## raw string: \s must reach the regex engine, not the string parser
        return re.sub(r"\s+", "_", str(name).upper())

    year_rows = data.loc[data.year == year_chosen]

    ## shape from long to wide
    if format == "long":
        df_wide = pd.pivot_table(year_rows[[id_col, var_col, value_col]],
                                 index = id_col,
                                 values = value_col,
                                 columns = var_col).reset_index()
    else:
        df_wide = year_rows.copy()

    ## standardize columns
    df_wide.columns = [_standardize(col) for col in df_wide.columns]

    ## generate rates
    ## the standardized id column is excluded too, so a non-default id_col
    ## is not mistaken for a count; list (not set) keeps column order stable
    non_count_cols = {"SCHOOL_NAME", "TOTAL", "YEAR", _standardize(id_col)}
    count_cols = [col for col in df_wide.columns if col not in non_count_cols]

    if count_cols:
        ## the denominator only needs converting once
        df_wide['TOTAL'] = pd.to_numeric(df_wide['TOTAL'])

    for col in count_cols:
        df_wide[col] = pd.to_numeric(df_wide[col])
        df_wide['{}_rate'.format(col)] = df_wide[col]/df_wide['TOTAL']

    ## return
    return df_wide
    
def aggregate_nces(var_pattern, varname_clean, id_col, 
                       cc_data_merged,
                      enrollment_vars, base_name_raw):
    """Compute a per-id, per-year rate from wide NCES columns.

    Parameters
    ----------
    var_pattern : substring selecting the numerator columns; after the
        "_20..." suffix is stripped it must also equal the numerator's
        base name, because it is used to index the pivoted frame.
    varname_clean : prefix for the output column "<varname_clean>_rate".
    id_col : list of identifier column names.
    cc_data_merged : wide DataFrame with one column per variable and year.
    enrollment_vars : list of denominator column names.
    base_name_raw : base name of the denominator (year suffix stripped).

    Returns a DataFrame with `id_col`, the rate column and `which_year`
    (the part of the column name from "_20" onward, e.g. "_2013-14").
    """
    dem_vars = [col for col in cc_data_merged if var_pattern in col]
    long_df = pd.melt(cc_data_merged[dem_vars + enrollment_vars + id_col],
                      id_vars = id_col)

    ## create year versus dem col
    long_df['clean_value_1'] = [replace_missing_nces(val) for val in long_df.value]
    long_df['clean_value'] = pd.to_numeric(long_df.clean_value_1)

    ## regex = True is required: pandas >= 2.0 otherwise treats the pattern
    ## as a literal string and strips nothing
    long_df['which_var'] = long_df.variable.astype(str).str.replace(r"\_20.*", "", regex = True)

    ## base names are plain text, so escape them before building the alternation
    replace_pattern = "|".join(re.escape(base) for base in long_df.which_var.unique())
    base_name_matcher = re.compile(replace_pattern)
    long_df['which_year'] = [base_name_matcher.sub("", one_var) for one_var in long_df.variable]
    long_toagg = long_df[id_col + ['which_var', 'which_year',
                                   'clean_value']].reset_index()

    ## do the aggregation 
    index_cols = id_col + ["which_year"]
    df_wide = long_toagg.pivot_table(index = index_cols,
                                     values = 'clean_value',
                                     columns = 'which_var').reset_index()

    ## do the division
    rate_varname = varname_clean + '_rate'
    df_wide[rate_varname] = df_wide[var_pattern]/df_wide[base_name_raw]

    ## return cleaned data
    return df_wide[id_col + [rate_varname] + ['which_year']]


def clean_next_row(one_row):
    """Extract an id from a cell that continues the previous row.

    The value is converted to a string, then:
    - if it starts with "-", the leading "-" is removed;
    - otherwise, if it contains "(", the digits inside "(...)" are
      extracted (the string is left as-is when there is no "(digits)");
    - otherwise NaN is returned.

    A 5-character result is left-padded with one "0".
    """
    clean_row = str(one_row)

    ## extract correct pattern
    if re.search(r'^-', clean_row):
        isd = re.sub(r'^-', '', clean_row)
    elif re.search(r'\(', clean_row):
        isd = re.sub(r'.*\(([0-9]+)\).*', r'\1', clean_row)
    else:
        return np.nan

    ## pad 0's
    return "0" + isd if len(isd) == 5 else isd
        

  from pandas.core import datetools


In [3]:
## constants
## root folder for inputs and outputs; the trailing slash matters because
## file paths below are built with plain string concatenation
base_path = "/Users/raj2/Dropbox/dph_hearing_decisions/"

## 1. Load and do prelim cleaning of filings data

In [4]:
## raw filings spreadsheet; the column labels sit inside the data rows
## and are rebuilt from rows 2 and 3 in the next step
texas_filings_init = pd.read_excel(base_path + "data/texas/raw_filings/002_dph_20052019_ocr.xlsx")


## 1.1 Clean years

In [5]:
## combine 2nd and 3rd row into columns
## each label is "<row 3 value>_<row 2 value>"
name_cols_init = ["{}_{}".format(row3_label, row2_label)
                  for row2_label, row3_label in zip(texas_filings_init.iloc[2, ],
                                                    texas_filings_init.iloc[3, ])]

## raw string so \s reaches the regex engine instead of being an invalid string escape
name_cols = [re.sub(r"\s+", "_", col.lower()) for col in name_cols_init]

## create df and rename cols
##
texas_filings_init_df = texas_filings_init.iloc[5:, ].copy()
texas_filings_init_df.columns = name_cols
texas_filings_init_df.head()

## see from pdf that blank rows
## are ones where hearing officer splits onto
## second line
## remove ones with nan for decision id
non_ids = "|".join(["ID", "Decision", "Page",
                   "DISMISSED", "GRANTED", "DENIED", "AM", "PM",
                   "PENDING"])
has_decision_id = texas_filings_init_df.nan_decision_id.notnull()
looks_like_non_id = texas_filings_init_df.nan_decision_id.astype(str).str.contains(non_ids)
texas_filings_real = texas_filings_init_df.loc[has_decision_id & ~looks_like_non_id, ].copy()

"""
Removing the blank rows takes the data from {} rows to {} rows.
""".format(texas_filings_init_df.shape[0],
          texas_filings_real.shape[0])


## see that years are 100 below but dates are correct
## so just extract year
## dates that fail to parse are coerced to NaT and end up as the string "nan"
request_dates = pd.to_datetime(texas_filings_real.reauest_date_o,
                               format = "%Y-%m-%d 00:00:00", errors = "coerce")
year_request = [str(one_request.year + 100) for one_request in request_dates]

texas_filings_real['year_request'] = year_request

Unnamed: 0,nan_decision_id,nan_docket_#,reauest_date_o,due_date_decision,date_last_order,date_hearina,nan_lea_student,nan_hearina_officer,(adv./tot.)_issues,nan_nan
5,167.0,057-S E-1105,1905-11-16 00:00:00,1906-01-30 00:00:00,1906-03-06 00:00:00,,EL PASO ISD (071902),LARRY CRADDOCK,0/0,
6,315.0,132-S E-0206,1906-02-22 00:00:00,1906-05-08 00:00:00,1906-03-06 00:00:00,1906-04-20 00:00:00,EL PASO ISD (071902),LARRY CRADDOCK,0/0,
7,325.0,137-S E-0206,1906-02-27 00:00:00,1906-05-13 00:00:00,1906-02-27 00:00:00,1906-07-17 00:00:00,CLEAR CREEK ISD (084910),DEBORAH,0/0,
8,,,,,,,,MCELVANEY,,
9,691.0,273-S E-0806,1906-08-24 00:00:00,1906-11-07 00:00:00,1907-01-29 00:00:00,1906-12-15 00:00:00,RICHARDSON ISD (057916),STEVEN ALEMAN,0/0,


'\nRemoving the blank rows takes the data from 6093 rows to 4026 rows.\n'

## 1.2 Extract state-level school identifiers

In [6]:
## pull the digits inside "(...)" out of the lea/student column;
## cells that are not strings become nan
isds = [np.nan if not isinstance(one_string, str)
        else re.sub(r'.*\(([0-9]+)\).*', r'\1', one_string)
        for one_string in texas_filings_real.nan_lea_student]

texas_filings_real['isd_init'] = isds

## an extraction counts as failed unless the result is exactly 6 characters long
isd_wrong_length = texas_filings_real.isd_init.astype(str).str.len() != 6
texas_filings_real['failed_extract_isd'] = np.where(isd_wrong_length, 1, 0)

In [7]:
## separate into ones with isd, ones without
extract_succeeded = texas_filings_real.failed_extract_isd == 0
extract_failed = texas_filings_real.failed_extract_isd == 1
texas_wisd = texas_filings_real.loc[extract_succeeded].copy()
texas_noisd = texas_filings_real.loc[extract_failed].copy()


## 1.2.1 Issue one: present but in earlier col

In [8]:
## for those with no isd, try to extract from earlier col
isds_earlier = [np.nan if not isinstance(one_string, str)
                else re.sub(r'.*\(([0-9]+)\).*', r'\1', one_string)
                for one_string in texas_noisd.date_hearina]

## replace the first-attempt columns with the retry results
texas_noisd_new = texas_noisd.drop(columns = ['isd_init', 'failed_extract_isd'])
texas_noisd_new['isd_init'] = isds_earlier
retry_wrong_length = texas_noisd_new.isd_init.astype(str).str.len() != 6
texas_noisd_new['failed_extract_isd'] = np.where(retry_wrong_length, 1, 0)


In [9]:
## stack first-attempt successes on top of the retried rows
round2_parts = [texas_wisd, texas_noisd_new]
texas_round2 = pd.concat(round2_parts, axis = 0)
texas_round2.failed_extract_isd.value_counts()


0    3832
1     194
Name: failed_extract_isd, dtype: int64

### 1.2.2 Issue two-- moved on to next row because name too long

In [10]:
## get decision ids of the ones still missing
still_missing = texas_round2.failed_extract_isd == 1
decision_ids_missisd = texas_round2.nan_decision_id[still_missing].copy()

## in original data, get row indices of those ids
has_missing_id = texas_filings_init_df.nan_decision_id.isin(decision_ids_missisd)
rows_missisd = texas_filings_init_df[has_missing_id].index.tolist()

## the continuation sits on the following row (index label + 1)
nextrow_missisd = [row_num + 1 for row_num in rows_missisd]

is_next_row = texas_filings_init_df.index.isin(nextrow_missisd)
df_nextrow_missisd = texas_filings_init_df.loc[is_next_row, ].copy()


In [11]:
nextrow_isds = [clean_next_row(one_row) for one_row in df_nextrow_missisd.nan_lea_student]

## add to original
## NOTE: the list is assigned by position, so df_nextrow_missisd must have
## one row per still-missing filing, in the same order
texas_round2_noisd = texas_round2[texas_round2.failed_extract_isd == 1].copy().drop(columns = ["isd_init",
                                                                                              'failed_extract_isd'])
texas_round2_noisd['isd_init'] = nextrow_isds

## recompute the flag on the frame that was just updated; it was previously
## recomputed on texas_noisd_new, which left these rows without a flag
## (nan after the concat below)
texas_round2_noisd['failed_extract_isd'] = np.where(texas_round2_noisd.isd_init.astype(str).str.len() != 6, 
                                                    1, 0)

## rowbind
texas_round3 = pd.concat([texas_round2[texas_round2.failed_extract_isd == 0].copy(),
                         texas_round2_noisd], axis = 0)


"""There are {} unique isds in filings data.
""".format(len(texas_round3.isd_init.unique()))



'There are 497 unique isds in filings data.\n'

# 2. Merge with NCES crosswalk

- Next steps-- look at overlap

- Troubleshoot non-overlap

- Merge with crosswalk then with nces district-level demographics 

In [13]:
## show the current working directory; data paths below are built from base_path instead
os.getcwd()

'/Users/raj2/Dropbox/rights_towhat_chapter/code/texas_filings'

In [14]:


## load nces data for texas 
## and before working on further, 
## see if the ids are useful
## before cleaning further
id_crosswalk = pd.read_csv(base_path + "data/texas/intermediate/texas_distid_nces_crosswalk.csv")

## state-level id = DISTRICT code with the dash removed (e.g. "071-902" -> "071902")
id_crosswalk['statelevel_id'] = [re.sub("-", "", one_id) for one_id in id_crosswalk.DISTRICT]
id_crosswalk_relcols = id_crosswalk[['statelevel_id', 'NCES_DISTR']].copy()

## 
texas_ids_unique = id_crosswalk_relcols.statelevel_id.unique()
## NOTE(review): any nan isd_init values end up in this set and count as
## unmatched ids -- confirm whether they should be dropped first
filings_ids_unique = set(texas_round3.isd_init)


isd_intersect = set(texas_ids_unique).intersection(filings_ids_unique)
"""Out of {} filings, {}, or {} proportion, can be matched.
""".format(len(filings_ids_unique),
          len(isd_intersect),
          len(isd_intersect)/len(filings_ids_unique))


nonmatched_isds = filings_ids_unique.difference(texas_ids_unique)
len(nonmatched_isds)


## pull up names of nonmatched ones
texas_round3['matched_filing'] = np.where(texas_round3.isd_init.isin(isd_intersect), 1, 0)

## later, load nces data and can 
## see if can get a close match based on name

## merge nces id's on
texas_filings_wnces = pd.merge(texas_round3, 
                              id_crosswalk,
                              left_on = "isd_init",
                              right_on = "statelevel_id",
                              how = "left")

texas_filings_wnces.head()

texas_round3.matched_filing.value_counts()
texas_nonmatch = texas_round3.loc[texas_round3.matched_filing == 0, ].copy()
print(pd.DataFrame({'school': texas_nonmatch.nan_lea_student.unique()}).to_latex(index= False))


## build the output path from base_path like every other read/write here
texas_filings_wnces.to_csv(base_path + "data/texas/intermediate/texas_filings_wnces.csv",
                          index = False)

'Out of 497 filings, 435, or 0.8752515090543259 proportion, can be matched.\n'

62

Unnamed: 0,(adv./tot.)_issues,date_hearina,date_last_order,due_date_decision,failed_extract_isd,isd_init,nan_decision_id,nan_docket_#,nan_hearina_officer,nan_lea_student,...,DISTRICT_N,DISTRICT,DISTRICT_C,NCES_DISTR,COLOR,Shape_area,Shape_len,Shape__Area,Shape__Length,statelevel_id
0,0/0,,1906-03-06 00:00:00,1906-01-30 00:00:00,0.0,71902,167,057-S E-1105,LARRY CRADDOCK,EL PASO ISD (071902),...,71902.0,071-902,71902.0,4818300.0,5.0,547564575.23,149222.912,764604626.281,176412.992,71902
1,0/0,1906-04-20 00:00:00,1906-03-06 00:00:00,1906-05-08 00:00:00,0.0,71902,315,132-S E-0206,LARRY CRADDOCK,EL PASO ISD (071902),...,71902.0,071-902,71902.0,4818300.0,5.0,547564575.23,149222.912,764604626.281,176412.992,71902
2,0/0,1906-07-17 00:00:00,1906-02-27 00:00:00,1906-05-13 00:00:00,0.0,84910,325,137-S E-0206,DEBORAH,CLEAR CREEK ISD (084910),...,84910.0,084-910,84910.0,4814280.0,1.0,272297165.027,123475.862,362285390.094,142377.921,84910
3,0/0,1906-12-15 00:00:00,1907-01-29 00:00:00,1906-11-07 00:00:00,0.0,57916,691,273-S E-0806,STEVEN ALEMAN,RICHARDSON ISD (057916),...,57916.0,057-916,57916.0,4837020.0,4.0,119322878.084,62431.436,170436025.863,74596.574,57916
4,0/0,,1906-09-22 00:00:00,1906-11-25 00:00:00,0.0,57916,1376,006-S E-0906,STEVEN ALEMAN,RICHARDSON ISD (057916),...,57916.0,057-916,57916.0,4837020.0,4.0,119322878.084,62431.436,170436025.863,74596.574,57916


1    3863
0     163
Name: matched_filing, dtype: int64

\begin{tabular}{l}
\toprule
                            school \\
\midrule
 TEXAS SCH FOR TH E D EAF (227906) \\
         UPLIFT EDUCATION (057803) \\
      STAR CHARTER SCHOOL (227814) \\
             IDEA ACADEMY (108807) \\
         NORTH FOREST ISD (101909) \\
         SOUTHWEST SCHOOL (101838) \\
           LA MARQU E ISD (084904) \\
         VANGUARD ACADEMY (108808) \\
      NYOS CHARTER SCHOOL (227804) \\
          SOUTH TEXAS ISD (031916) \\
            ARROW ACADEMY (021805) \\
      IDEA PUBLIC SCHOOLS (108807) \\
         KIPP SAN ANTONIO (015826) \\
         WAYSID E SCHOOLS (227803) \\
                     STEVEN ALEMAN \\
                     LYNN RUBINETT \\
                     SHERRY WETSCH \\
                  SHARON CLONINGER \\
                        ANN VEVIER \\
                      DAVID BERGER \\
                     LUCIUS BUNTON \\
                        KASEY WHIT \\
                     KATHRYN LEWIS \\
                     TOMMY BROYLES \\
          HAR

# 3. Clean nces demographics

### 3.1 Read in and clean colnames

In [105]:
## 75-col limit in export-- so the pull is spread over several files
cc_data_1 = pd.read_csv(base_path + "data/texas/intermediate/texas_ccd_pull1.csv")
cc_data_2 = pd.read_csv(base_path + "data/texas/intermediate/texas_ccd_pull2.csv")
cc_data_3 = pd.read_csv(base_path + "data/texas/intermediate/texas_ccd_pull3.csv")
cc_data_4 = pd.read_csv(base_path + "data/texas/intermediate/texas_alternatedem.csv")

## merge based on agency id
agency_id_col = 'Agency ID - NCES Assigned [District] Latest available year'

## columns repeated in every pull; dropped from the right-hand side
## of each merge so they appear only once
repeated_cols = ['Agency Name',
                 'State Name [District] Latest available year']

cc_data_merge_init = cc_data_1.merge(cc_data_2.drop(columns = repeated_cols),
                                     on = agency_id_col)

cc_data_merge_2 = cc_data_merge_init.merge(cc_data_3.drop(columns = repeated_cols),
                                           on = agency_id_col)

cc_data_merged = cc_data_merge_2.merge(cc_data_4.drop(columns = repeated_cols),
                                       on = agency_id_col)



Unnamed: 0,Agency Name,State Name [District] Latest available year,Agency ID - NCES Assigned [District] Latest available year,Agency Name [District] 2017-18,Agency Name [District] 2016-17,Agency Name [District] 2015-16,Agency Name [District] 2014-15,Agency Name [District] 2013-14,Agency Name [District] 2012-13,Agency Name [District] 2011-12,...,Total Race/Ethnicity [Public School] 2014-15,Total Race/Ethnicity [Public School] 2013-14,Total Race/Ethnicity [Public School] 2012-13,Total Race/Ethnicity [Public School] 2011-12,Total Race/Ethnicity [Public School] 2010-11,Total Race/Ethnicity [Public School] 2009-10,Total Race/Ethnicity [Public School] 2008-09,Total Race/Ethnicity [Public School] 2007-08,Total Race/Ethnicity [Public School] 2006-07,Total Race/Ethnicity [Public School] 2005-06
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095,A W BROWN LEADERSHIP ACADEMY,A W BROWN-FELLOWSHIP LEADERSHIP ACADEMY,A W BROWN-FELLOWSHIP LEADERSHIP ACADEMY,A W BROWN-FELLOWSHIP LEADERSHIP ACADEMY,A W BROWN-FELLOWSHIP LEADERSHIP ACADEMY,A W BROWN-FELLOWSHIP LEADERSHIP ACADEMY,A W BROWN-FELLOWSHIP LEADERSHIP ACADEMY,...,2152,1991,520,1570,1395,1352,1223,1170,1104,1031
1,A+ ACADEMY,Texas,4800203,A+ ACADEMY,A+ ACADEMY,A+ ACADEMY,A+ ACADEMY,A+ ACADEMY,A+ ACADEMY,A+ ACADEMY,...,975,995,988,951,1076,1033,985,957,994,961
2,A+ UNLIMITED POTENTIAL,TEXAS,4801453,A+ UNLIMITED POTENTIAL,A+ UNLIMITED POTENTIAL,A+ UNLIMITED POTENTIAL,†,†,†,†,...,†,†,†,†,†,†,†,†,†,†
3,ABBOTT ISD,Texas,4807380,ABBOTT ISD,ABBOTT ISD,ABBOTT ISD,ABBOTT ISD,ABBOTT ISD,ABBOTT ISD,ABBOTT ISD,...,264,269,276,296,297,297,306,304,315,302
4,ABERNATHY ISD,Texas,4807410,ABERNATHY ISD,ABERNATHY ISD,ABERNATHY ISD,ABERNATHY ISD,ABERNATHY ISD,ABERNATHY ISD,ABERNATHY ISD,...,779,768,767,767,785,809,769,785,794,795


In [106]:
## replace whitespace runs, brackets and parentheses with "_" and uppercase
separator_chars = re.compile("\\s+|\\[|\\]|\\(|\\)")
cc_cleancols = [separator_chars.sub("_", x).upper() for x in cc_data_merged.columns]
cc_data_merged.columns = cc_cleancols



In [107]:


## make sure that ids overlap
## (reads cc_data_merged; the previous name cc_merged is not defined anywhere in this notebook)
nces_ids = cc_data_merged['AGENCY_ID_-_NCES_ASSIGNED__DISTRICT__LATEST_AVAILABLE_YEAR'].unique().tolist()
intersect_nces_filings = set(nces_ids).intersection(texas_filings_wnces.NCES_DISTR)

"""
There are {} unique nces ids in the filings. {}, or {} proportion, overlap with nces
""".format(len(texas_filings_wnces.NCES_DISTR.unique()),
          len(intersect_nces_filings),
          len(intersect_nces_filings)/len(texas_filings_wnces.NCES_DISTR.unique()))


'\nThere are 436 unique nces ids in the filings. 435, or 0.9977064220183486 proportion, overlap with nces\n'

### 3.2: calculate enrollment percentages

In [109]:
## rename id col
## (in place, so the same cc_data_merged object is updated)
id_col_rename = {'AGENCY_ID_-_NCES_ASSIGNED__DISTRICT__LATEST_AVAILABLE_YEAR': 'nces_id'}
cc_data_merged.rename(columns = id_col_rename, inplace = True)


In [110]:

## denominator for the rates: base name of the enrollment columns (year suffix removed)
base_name_raw = 'TOTAL_STUDENTS_ALL_GRADES__EXCLUDES_AE___DISTRICT_'
## every column whose name mentions total students
enrollment_vars = [col for col in cc_data_merged.columns if 'TOTAL_STUDENTS' in col]



In [111]:
## free-lunch-eligible students as a share of total students, per id and year
frpl_rate = aggregate_nces(
    var_pattern = "FREE_LUNCH_ELIGIBLE__PUBLIC_SCHOOL_",
    varname_clean = "frpl_eligible",
    id_col = ['nces_id'],
    cc_data_merged = cc_data_merged,
    enrollment_vars = enrollment_vars,
    base_name_raw = base_name_raw,
)


In [112]:
## denominators for the race rates: every total race/ethnicity column
race_enrollment_vars = [col for col in cc_data_merged.columns if "TOTAL_RACE" in col]
race_enrollment_vars

## peek at the numerator columns for one group
black_cols = [col for col in cc_data_merged.columns if "BLACK" in col]
cc_data_merged[black_cols].head()

['TOTAL_RACE/ETHNICITY__DISTRICT__2017-18',
 'TOTAL_RACE/ETHNICITY__DISTRICT__2016-17',
 'TOTAL_RACE/ETHNICITY__DISTRICT__2015-16',
 'TOTAL_RACE/ETHNICITY__DISTRICT__2014-15',
 'TOTAL_RACE/ETHNICITY__DISTRICT__2013-14',
 'TOTAL_RACE/ETHNICITY__DISTRICT__2012-13',
 'TOTAL_RACE/ETHNICITY__DISTRICT__2011-12',
 'TOTAL_RACE/ETHNICITY__DISTRICT__2010-11',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2017-18',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2016-17',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2015-16',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2014-15',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2013-14',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2012-13',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2011-12',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2010-11',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2009-10',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2008-09',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2007-08',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2006-07',
 'TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL__2005-06']

Unnamed: 0,BLACK_STUDENTS__DISTRICT__2017-18,BLACK_STUDENTS__DISTRICT__2016-17,BLACK_STUDENTS__DISTRICT__2015-16,BLACK_STUDENTS__DISTRICT__2014-15,BLACK_STUDENTS__DISTRICT__2013-14,BLACK_STUDENTS__DISTRICT__2012-13,BLACK_STUDENTS__DISTRICT__2011-12,BLACK_STUDENTS__DISTRICT__2010-11,BLACK_STUDENTS__PUBLIC_SCHOOL__2017-18,BLACK_STUDENTS__PUBLIC_SCHOOL__2016-17,...,BLACK_STUDENTS__PUBLIC_SCHOOL__2014-15,BLACK_STUDENTS__PUBLIC_SCHOOL__2013-14,BLACK_STUDENTS__PUBLIC_SCHOOL__2012-13,BLACK_STUDENTS__PUBLIC_SCHOOL__2011-12,BLACK_STUDENTS__PUBLIC_SCHOOL__2010-11,BLACK_STUDENTS__PUBLIC_SCHOOL__2009-10,BLACK_STUDENTS__PUBLIC_SCHOOL__2008-09,BLACK_STUDENTS__PUBLIC_SCHOOL__2007-08,BLACK_STUDENTS__PUBLIC_SCHOOL__2006-07,BLACK_STUDENTS__PUBLIC_SCHOOL__2005-06
0,2044,2183,2076,2090,1946,†,1544,1374,2044,2183,...,2090,1946,511,1544,1374,1332,1200,1146,1078,997
1,48,34,30,33,35,37,45,65,48,34,...,33,35,37,45,65,92,99,88,96,112
2,88,50,†,†,†,†,†,†,88,50,...,†,†,†,†,†,†,†,†,†,†
3,1,2,0,0,1,1,1,2,1,2,...,0,1,1,1,2,6,7,8,12,8
4,7,8,5,5,9,8,13,13,7,8,...,5,9,8,13,13,12,15,21,15,16


In [90]:
## (rows, columns) of the merged frame
cc_data_merged.shape

(1328, 145)

In [113]:
## the three race rates differ only in numerator pattern and output name
race_rate_specs = [("BLACK_STUDENTS__PUBLIC_SCHOOL_", "black"),
                   ("WHITE_STUDENTS__PUBLIC_SCHOOL_", "white"),
                   ("HISPANIC_STUDENTS__PUBLIC_SCHOOL_", "hispanic")]

black_rate, white_rate, hisp_rate = [
    aggregate_nces(var_pattern = race_pattern,
                   varname_clean = race_label,
                   enrollment_vars = race_enrollment_vars,
                   base_name_raw = "TOTAL_RACE/ETHNICITY__PUBLIC_SCHOOL_",
                   id_col = ['nces_id'],
                   cc_data_merged = cc_data_merged)
    for race_pattern, race_label in race_rate_specs
]

In [118]:
## list the IEP count columns to confirm the naming pattern
[col for col in cc_data_merged.columns if "INDIVIDUALIZED" in col]

['INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2017-18',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2016-17',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2015-16',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2014-15',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2013-14',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2012-13',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2011-12',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2010-11',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2009-10',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2008-09',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2007-08',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2006-07',
 'INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT__2005-06']

In [119]:
## IEP students as a share of total students, per id and year
iep_rate = aggregate_nces(
    var_pattern = "INDIVIDUALIZED_EDUCATION_PROGRAM_STUDENTS__DISTRICT_",
    varname_clean = "iep",
    id_col = ['nces_id'],
    cc_data_merged = cc_data_merged,
    enrollment_vars = enrollment_vars,
    base_name_raw = base_name_raw,
)

In [122]:
## merge into one df
## index every rate table by (nces_id, which_year) so the column-wise
## concat lines the rows up on that pair
rate_tables = [frpl_rate, black_rate, white_rate, hisp_rate, iep_rate]
dfs = [rate_table.set_index(['nces_id', 'which_year']) for rate_table in rate_tables]

cc_dem_rates = pd.concat(dfs, axis=1).reset_index()



## use demographics at baseline-ish
## maybe exclude 2012



which_var,nces_id,which_year,frpl_eligible_rate,black_rate,white_rate,hispanic_rate,iep_rate
17210,4846680,_2016-17,0.714,,,,0.122
17223,4846710,_2016-17,0.838,,,,0.097
17236,4846740,_2016-17,0.528,,,,0.142
17249,4846770,_2016-17,0.523,,,,0.077
17262,4899130,_2016-17,0.7,,,,0.11


## 4. Aggregate and merge with complaints data

Count of complaints 2014 onwards --- 2013-2014 demographics

In [126]:
## preview the filings after the crosswalk merge
texas_filings_wnces.head()

Unnamed: 0,(adv./tot.)_issues,date_hearina,date_last_order,due_date_decision,failed_extract_isd,isd_init,nan_decision_id,nan_docket_#,nan_hearina_officer,nan_lea_student,...,DISTRICT_N,DISTRICT,DISTRICT_C,NCES_DISTR,COLOR,Shape_area,Shape_len,Shape__Area,Shape__Length,statelevel_id
0,0/0,,1906-03-06 00:00:00,1906-01-30 00:00:00,0.0,71902,167,057-S E-1105,LARRY CRADDOCK,EL PASO ISD (071902),...,71902.0,071-902,71902.0,4818300.0,5.0,547564575.23,149222.912,764604626.281,176412.992,71902
1,0/0,1906-04-20 00:00:00,1906-03-06 00:00:00,1906-05-08 00:00:00,0.0,71902,315,132-S E-0206,LARRY CRADDOCK,EL PASO ISD (071902),...,71902.0,071-902,71902.0,4818300.0,5.0,547564575.23,149222.912,764604626.281,176412.992,71902
2,0/0,1906-07-17 00:00:00,1906-02-27 00:00:00,1906-05-13 00:00:00,0.0,84910,325,137-S E-0206,DEBORAH,CLEAR CREEK ISD (084910),...,84910.0,084-910,84910.0,4814280.0,1.0,272297165.027,123475.862,362285390.094,142377.921,84910
3,0/0,1906-12-15 00:00:00,1907-01-29 00:00:00,1906-11-07 00:00:00,0.0,57916,691,273-S E-0806,STEVEN ALEMAN,RICHARDSON ISD (057916),...,57916.0,057-916,57916.0,4837020.0,4.0,119322878.084,62431.436,170436025.863,74596.574,57916
4,0/0,,1906-09-22 00:00:00,1906-11-25 00:00:00,0.0,57916,1376,006-S E-0906,STEVEN ALEMAN,RICHARDSON ISD (057916),...,57916.0,057-916,57916.0,4837020.0,4.0,119322878.084,62431.436,170436025.863,74596.574,57916


In [131]:
## request years to count (stored as strings, like year_request)
years_agg = ["2014", "2015", "2016", "2017", "2018"]

## number of distinct decision ids per district and request year
in_agg_years = texas_filings_wnces.year_request.isin(years_agg)
texas_filings_agg = (texas_filings_wnces.loc[in_agg_years]
                     .groupby(['NCES_DISTR', 'year_request'])
                     .agg({'nan_decision_id': lambda x: x.nunique()})
                     .reset_index())

texas_filings_agg.columns = ['nces_id', 'year', 'count_filings']


In [133]:
## keep only the 2013-14 demographics
is_201314 = cc_dem_rates.which_year == "_2013-14"
cc_dem_rates_20132014 = cc_dem_rates.loc[is_201314].copy()


## reshape filings to wide
texas_filings_postdem_wide = texas_filings_agg.pivot_table(index = ['nces_id'],
                                                           columns = ['year'],
                                                           values = 'count_filings').reset_index()

## prefix the year columns; anything else (the id) keeps its name
texas_filings_postdem_wide.columns = [col if col not in years_agg
                                      else "total_filings_" + str(col)
                                      for col in texas_filings_postdem_wide.columns]
texas_filings_postdem_wide.head()

Unnamed: 0,nces_id,total_filings_2014,total_filings_2015,total_filings_2016,total_filings_2017,total_filings_2018
0,4800010.0,,,3.0,1.0,2.0
1,4807440.0,,,1.0,,
2,4807590.0,1.0,2.0,1.0,,
3,4807710.0,1.0,,,,
4,4807830.0,4.0,1.0,1.0,1.0,


In [135]:
## fill na with 0 since that indicates no filings that year
texas_filings_postdem_wide = texas_filings_postdem_wide.fillna(value = 0)

In [139]:
## left join with common core demographics
## (demographics on the left, so districts without filings are kept)
cc_dem_rates_wcase = cc_dem_rates_20132014.merge(texas_filings_postdem_wide,
                                                 on = "nces_id",
                                                 how = "left")


## now fill na's for those cols with 0
filings_cols = [col for col in cc_dem_rates_wcase.columns if "filings" in col]
filings_filled = cc_dem_rates_wcase[filings_cols].fillna(0)
cc_dem_rates_wcase[filings_cols] = filings_filled
cc_dem_rates_wcase.head()


Unnamed: 0,nces_id,which_year,frpl_eligible_rate,black_rate,white_rate,hispanic_rate,iep_rate,total_filings_2014,total_filings_2015,total_filings_2016,total_filings_2017,total_filings_2018
0,4800001,_2013-14,0.68,0.044,0.237,0.708,0.118,0.0,0.0,0.0,0.0,0.0
1,4800002,_2013-14,0.416,0.024,0.512,0.444,0.102,0.0,0.0,0.0,0.0,0.0
2,4800003,_2013-14,0.576,0.0,0.174,0.826,0.091,0.0,0.0,0.0,0.0,0.0
3,4800004,_2013-14,0.635,0.026,0.075,0.889,0.169,0.0,0.0,0.0,0.0,0.0
4,4800005,_2013-14,0.308,0.0,0.839,0.147,0.126,0.0,0.0,0.0,0.0,0.0


In [141]:
## write to csv
## (index = False: the row index is not written to the file)
cc_dem_rates_wcase.to_csv(base_path + "data/texas/cleaned/filings_withdem.csv",
                  index = False)