In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
#from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [3]:
PATH = "C:/Users/sronkowski/Documents/GitHub/CPS_Report_Card/"
#PATH = "/home/steve/Code/CPS_Report_Card/"

# Step 1 - Data Cleaning and Preprocessing
To begin the analysis, I will load four years' worth of Report Card data into separate DataFrames.

In [4]:
df_201516 = pd.read_csv(f'{PATH}Chicago_Public_Schools_-_School_Progress_Reports_SY1516.csv', 
                     low_memory=False, na_values = ["NOT ENOUGH DATA", "NOT APPLICABLE", "INCOMPLETE DATA"])
df_201516.shape

(670, 153)

In [5]:
df_201617 = pd.read_csv(f'{PATH}Chicago_Public_Schools_-_School_Progress_Reports_SY1617.csv', 
                     low_memory=False, na_values = ["NOT ENOUGH DATA", "NOT APPLICABLE", "INCOMPLETE DATA"])
df_201617.shape

(661, 161)

In [6]:
df_201718 = pd.read_csv(f'{PATH}Chicago_Public_Schools_-_School_Progress_Reports_SY1718.csv', 
                     low_memory=False, na_values = ["NOT ENOUGH DATA", "NOT APPLICABLE", "INCOMPLETE DATA"])
df_201718.shape

(661, 163)

In [7]:
df_201819 = pd.read_csv(f'{PATH}Chicago_Public_Schools_-_School_Progress_Reports_SY1819.csv', 
                     low_memory=False, na_values = ["NOT ENOUGH DATA", "NOT APPLICABLE", "INCOMPLETE DATA"])
df_201819.shape

(654, 182)

With the raw data now loaded into memory, I will review the data as formatted to remove any columns that are redundant, merely descriptive, or otherwise hold no predictive value.  First, however, I will implement a basic `display_all()` function to get full-row readouts of the DataFrames.

In [8]:
def display_all(df):
    """Render ``df`` with pandas' row/column display limits raised to 1000.

    The limits apply only inside the ``option_context`` block, so the
    notebook-wide display options are restored once the frame is shown.
    """
    expanded_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*expanded_limits):
        display(df)

In [9]:
display_all(df_201516.tail().T)

Unnamed: 0,665,666,667,668,669
School_ID,610173,610115,609996,609815,400135
Short_Name,SHERWOOD,PARK MANOR,HOLDEN,BOUCHET,YCCS - LATINO YOUTH
Long_Name,Jesse Sherwood Elementary School,Park Manor Elementary School,Charles N Holden Elementary School,Edward A Bouchet Math & Science Academy ES,YCCS-Latino Youth Alternative HS
School_Type,Neighborhood,Neighborhood,Neighborhood,Neighborhood,Charter
Primary_Category,ES,ES,ES,ES,HS
Address,245 W 57TH ST,7037 S RHODES AVE,1104 W 31ST ST,7355 S JEFFERY BLVD,2001 S CALIFORNIA AVE
City,Chicago,Chicago,Chicago,Chicago,Chicago
State,Illinois,Illinois,Illinois,Illinois,Illinois
Zip,60621,60637,60608,60649,60608
Phone,7.73535e+09,7.73535e+09,7.73536e+09,7.73535e+09,7.73648e+09


My overall goal is to retain as much of the data as possible, since the Random Forest models will be able to home in on the key variables even if purely extraneous data is retained in the input datasets.  With that said, we can eliminate information regarding the street address, website, phone number, etc. for a school.

In [10]:
cols_to_drop = ['Address','Short_Name','Fax', 'City','State','Phone','CPS_School_Profile','Website', 'Empty_Progress_Report_Message', 'State_School_Report_Card_URL', 'Location', 'Zip']
df_201516.drop(cols_to_drop, axis = 1, inplace = True)

In [11]:
display_all(df_201516.tail().T)

Unnamed: 0,665,666,667,668,669
School_ID,610173,610115,609996,609815,400135
Long_Name,Jesse Sherwood Elementary School,Park Manor Elementary School,Charles N Holden Elementary School,Edward A Bouchet Math & Science Academy ES,YCCS-Latino Youth Alternative HS
School_Type,Neighborhood,Neighborhood,Neighborhood,Neighborhood,Charter
Primary_Category,ES,ES,ES,ES,HS
Progress_Report_Year,2015,2015,2015,2015,2015
Blue_Ribbon_Award_Year,,,,,
Excelerate_Award_Gold_Year,,,2015,,
Spot_Light_Award_Year,,,,,
Improvement_Award_Year,,2012,,,
Excellence_Award_Year,,,,,


There is a large quantity of redundant columns - many of the data points give both a raw number and a percentage value.  I will drop the percentage values and retain the integer columns.  Additionally, there are "description" columns that provide insight into what the data represents but provide no data on the school itself - these columns will also be dropped.

In [12]:
# Three passes over the 2015-16 frame, each removing the columns whose name
# contains one marker.  The column index is re-read for every pass, so a pass
# only ever names columns that are still present.

# 1) abbreviated label columns
lbl_col_list = [name for name in df_201516.columns if 'Lbl' in name]
df_201516.drop(columns = lbl_col_list, inplace = True)

# 2) spelled-out label columns
label_col_list = [name for name in df_201516.columns if 'Label' in name]
df_201516.drop(columns = label_col_list, inplace = True)

# 3) free-text description columns
description_col_list = [name for name in df_201516.columns if 'Description' in name]
df_201516.drop(columns = description_col_list, inplace = True)

There are also columns with information only relevant to high schools.  This data can also be removed since it will not be present for the schools within the scope of our analysis.

In [13]:
# Remove the high-school-only measures from the 2015-16 frame, one marker at
# a time; the column index is re-read before each pass.

# ACT results
act_col_list = [name for name in df_201516.columns if 'ACT' in name]
df_201516.drop(columns = act_col_list, inplace = True)

# freshman measures
freshman_col_list = [name for name in df_201516.columns if 'Freshmen' in name]
df_201516.drop(columns = freshman_col_list, inplace = True)

# college measures
college_col_list = [name for name in df_201516.columns if 'College' in name]
df_201516.drop(columns = college_col_list, inplace = True)

In [14]:
display_all(df_201516.tail().T)

Unnamed: 0,665,666,667,668,669
School_ID,610173,610115,609996,609815,400135
Long_Name,Jesse Sherwood Elementary School,Park Manor Elementary School,Charles N Holden Elementary School,Edward A Bouchet Math & Science Academy ES,YCCS-Latino Youth Alternative HS
School_Type,Neighborhood,Neighborhood,Neighborhood,Neighborhood,Charter
Primary_Category,ES,ES,ES,ES,HS
Progress_Report_Year,2015,2015,2015,2015,2015
Blue_Ribbon_Award_Year,,,,,
Excelerate_Award_Gold_Year,,,2015,,
Spot_Light_Award_Year,,,,,
Improvement_Award_Year,,2012,,,
Excellence_Award_Year,,,,,


There are a few other final columns that are either redundant or irrelevant that I will remove.

In [15]:
final_remove_list = ['Parent_Survey_Results_Year', 'Supportive_School_Award_Desc', 'Progress_Toward_Graduation_Year_2','Progress_Toward_Graduation_Year_1','Other_Metrics_Year_2','Other_Metrics_Year_1']
df_201516.drop(final_remove_list, axis = 1, inplace = True)

With the columns now sufficiently cleaned based on description, we can remove *rows*.  Since this analysis is focused on elementary schools, I will drop all entries that pertain to high schools and middle schools.  

In [16]:
df_201516.drop(df_201516[df_201516.Primary_Category == 'HS'].index, axis = 0, inplace = True)
df_201516.drop(df_201516[df_201516.Primary_Category == 'MS'].index, axis = 0, inplace = True)
df_201516.shape

(472, 83)

I will now repeat this cleaning process over the other files before preparing a final join of the DataFrames.

In [17]:
#drop school description cols
cols_to_drop = ['Address','Short_Name','Fax', 'City','State','Phone','CPS_School_Profile','Website', 'Empty_Progress_Report_Message', 'State_School_Report_Card_URL', 'Location','Zip']
df_201617.drop(cols_to_drop, axis = 1, inplace = True)

In [18]:
#drop label and description columns
# The column list is refreshed before every filter.  Filtering a snapshot
# taken before the first drop would make a later drop raise KeyError for any
# column whose name matches more than one marker (it would already be gone).
column_names = df_201617.columns.tolist()
lbl_col_list = [x for x in column_names if 'Lbl' in x]
df_201617.drop(lbl_col_list, axis = 1, inplace = True)

column_names = df_201617.columns.tolist()
label_col_list = [x for x in column_names if 'Label' in x]
df_201617.drop(label_col_list, axis = 1, inplace = True)

column_names = df_201617.columns.tolist()
description_col_list = [x for x in column_names if 'Description' in x]
df_201617.drop(description_col_list, axis = 1, inplace = True)

In [19]:
#drop hs-only columns
# The column list is refreshed before every filter so that a column matching
# two markers is not passed to drop() a second time (which raises KeyError).
column_names = df_201617.columns.tolist()
act_col_list = [x for x in column_names if 'ACT' in x]
df_201617.drop(act_col_list, axis = 1, inplace = True)

column_names = df_201617.columns.tolist()
freshman_col_list = [x for x in column_names if 'Freshmen' in x]
df_201617.drop(freshman_col_list, axis = 1, inplace = True)

column_names = df_201617.columns.tolist()
college_col_list = [x for x in column_names if 'College' in x]
df_201617.drop(college_col_list, axis = 1, inplace = True)

In [20]:
final_remove_list = ['Parent_Survey_Results_Year', 'Supportive_School_Award_Desc', 'Progress_Toward_Graduation_Year_2','Progress_Toward_Graduation_Year_1','Other_Metrics_Year_2','Other_Metrics_Year_1']
df_201617.drop(final_remove_list, axis = 1, inplace = True)

In [21]:
df_201617.drop(df_201617[df_201617.Primary_Category == 'HS'].index, axis = 0, inplace = True)
df_201617.drop(df_201617[df_201617.Primary_Category == 'MS'].index, axis = 0, inplace = True)
df_201617.shape

(470, 91)

Since this preprocessing arrived at a different total number of columns, I will examine the columns that were added into the dataset between 2015-16 and 2016-17.

In [22]:
list_2015_cols = df_201516.columns.tolist()
list_2016_cols = df_201617.columns.tolist()
diff_list = np.setdiff1d(list_2016_cols,list_2015_cols)
print(diff_list)

['Graduation_4_Year_CPS_Pct_Year_1' 'Graduation_4_Year_CPS_Pct_Year_2'
 'Graduation_4_Year_School_Pct_Year_1'
 'Graduation_4_Year_School_Pct_Year_2' 'Graduation_5_Year_CPS_Pct_Year_1'
 'Graduation_5_Year_CPS_Pct_Year_2' 'Graduation_5_Year_School_Pct_Year_1'
 'Graduation_5_Year_School_Pct_Year_2']


Since these new columns don't show values for elementary schools, I will drop them.

In [23]:
df_201617.drop(diff_list, axis = 1, inplace = True)

I will now prepare the 2017-18 data.

In [24]:
#drop school description cols
cols_to_drop = ['Short_Name','Fax', 'Phone','CPS_School_Profile','Website', 'Empty_Progress_Report_Message', 'State_School_Report_Card_URL', 'Location']
df_201718.drop(cols_to_drop, axis = 1, inplace = True)

#drop label and description columns
# The column list is refreshed before every filter: filtering a snapshot taken
# before an earlier drop would make a later drop raise KeyError for any column
# whose name matches more than one marker.
column_names = df_201718.columns.tolist()
lbl_col_list = [x for x in column_names if 'Lbl' in x]
df_201718.drop(lbl_col_list, axis = 1, inplace = True)

column_names = df_201718.columns.tolist()
label_col_list = [x for x in column_names if 'Label' in x]
df_201718.drop(label_col_list, axis = 1, inplace = True)

column_names = df_201718.columns.tolist()
description_col_list = [x for x in column_names if 'Description' in x]
df_201718.drop(description_col_list, axis = 1, inplace = True)

#drop hs-only columns
# NOTE: the name act_col_list is kept from the earlier years, but for this
# file the filter matches 'SAT' columns.
column_names = df_201718.columns.tolist()
act_col_list = [x for x in column_names if 'SAT' in x]
df_201718.drop(act_col_list, axis = 1, inplace = True)

column_names = df_201718.columns.tolist()
freshman_col_list = [x for x in column_names if 'Freshmen' in x]
df_201718.drop(freshman_col_list, axis = 1, inplace = True)

column_names = df_201718.columns.tolist()
college_col_list = [x for x in column_names if 'College' in x]
df_201718.drop(college_col_list, axis = 1, inplace = True)

final_remove_list = ['Parent_Survey_Results_Year', 'Supportive_School_Award_Desc', 'Progress_Toward_Graduation_Year_2','Progress_Toward_Graduation_Year_1','Other_Metrics_Year_2','Other_Metrics_Year_1']
df_201718.drop(final_remove_list, axis = 1, inplace = True)

# Drop the rows whose Primary_Category is 'HS' or 'MS'.
df_201718.drop(df_201718[df_201718.Primary_Category == 'HS'].index, axis = 0, inplace = True)
df_201718.drop(df_201718[df_201718.Primary_Category == 'MS'].index, axis = 0, inplace = True)
df_201718.shape

(472, 91)

Again, there are additional columns from the previous year's report, so I will again review these added columns and remove or retain as appropriate.

In [25]:
list_2016_cols = df_201617.columns.tolist()
list_2017_cols = df_201718.columns.tolist()
diff_list = np.setdiff1d(list_2017_cols,list_2016_cols).tolist()
print(diff_list)

['Graduation_4_Year_CPS_Pct_Year_1', 'Graduation_4_Year_CPS_Pct_Year_2', 'Graduation_4_Year_School_Pct_Year_1', 'Graduation_4_Year_School_Pct_Year_2', 'Graduation_5_Year_CPS_Pct_Year_1', 'Graduation_5_Year_CPS_Pct_Year_2', 'Graduation_5_Year_School_Pct_Year_1', 'Graduation_5_Year_School_Pct_Year_2', 'School_Survey_Parent_Response_Rate_Avg_Pct', 'School_Survey_Parent_Response_Rate_Pct']


Only one of these values appears worth retaining, so I will preserve the Parent Response Rate column and remove the others.

In [26]:
diff_list.remove('School_Survey_Parent_Response_Rate_Pct')
df_201718.drop(diff_list, axis = 1, inplace = True)
df_201718.shape

(472, 82)

Repeating this process, I move into the 2018-19 data.

In [27]:
#drop school description cols
cols_to_drop = ['Short_Name','Fax', 'Phone','City', 'Address','State', 'Zip', 'CPS_School_Profile','Website', 'Empty_Progress_Report_Message', 'State_School_Report_Card_URL']
df_201819.drop(cols_to_drop, axis = 1, inplace = True)

#drop label and description columns
# The column list is refreshed before every filter: filtering a snapshot taken
# before an earlier drop would make a later drop raise KeyError for any column
# whose name matches more than one marker.
column_names = df_201819.columns.tolist()
lbl_col_list = [x for x in column_names if 'Lbl' in x]
df_201819.drop(lbl_col_list, axis = 1, inplace = True)

column_names = df_201819.columns.tolist()
label_col_list = [x for x in column_names if 'Label' in x]
df_201819.drop(label_col_list, axis = 1, inplace = True)

column_names = df_201819.columns.tolist()
description_col_list = [x for x in column_names if 'Description' in x]
df_201819.drop(description_col_list, axis = 1, inplace = True)

#drop hs-only columns
# NOTE: the name act_col_list is kept from the earlier years, but for this
# file the filter matches 'SAT' columns.
column_names = df_201819.columns.tolist()
act_col_list = [x for x in column_names if 'SAT' in x]
df_201819.drop(act_col_list, axis = 1, inplace = True)

column_names = df_201819.columns.tolist()
freshman_col_list = [x for x in column_names if 'Freshmen' in x]
df_201819.drop(freshman_col_list, axis = 1, inplace = True)

column_names = df_201819.columns.tolist()
college_col_list = [x for x in column_names if 'College' in x]
df_201819.drop(college_col_list, axis = 1, inplace = True)

final_remove_list = ['Parent_Survey_Results_Year', 'Supportive_School_Award_Desc', 'Progress_Toward_Graduation_Year_2','Progress_Toward_Graduation_Year_1','Other_Metrics_Year_2','Other_Metrics_Year_1']
df_201819.drop(final_remove_list, axis = 1, inplace = True)

# Drop the rows whose Primary_Category is 'HS' or 'MS'.
df_201819.drop(df_201819[df_201819.Primary_Category == 'HS'].index, axis = 0, inplace = True)
df_201819.drop(df_201819[df_201819.Primary_Category == 'MS'].index, axis = 0, inplace = True)
df_201819.shape

(470, 94)

In [28]:
list_2017_cols = df_201718.columns.tolist()
list_2018_cols = df_201819.columns.tolist()
diff_list = np.setdiff1d(list_2018_cols,list_2017_cols).tolist()
print(diff_list)

['Attainment_All_Grades_School_Pct', 'Graduation_4_Year_CPS_Pct_Year_1', 'Graduation_4_Year_CPS_Pct_Year_2', 'Graduation_4_Year_School_Pct_Year_1', 'Graduation_4_Year_School_Pct_Year_2', 'Graduation_5_Year_CPS_Pct_Year_1', 'Graduation_5_Year_CPS_Pct_Year_2', 'Graduation_5_Year_School_Pct_Year_1', 'Graduation_5_Year_School_Pct_Year_2', 'School_Latitude', 'School_Longitude', 'School_Survey_Parent_Response_Rate_Avg_Pct']


In [29]:
df_201819.drop(diff_list, axis = 1, inplace = True)
df_201819.shape

(470, 82)

Finally, we can combine the datasets into a single DataFrame for analysis.

In [30]:
list_2017_cols = df_201718.columns.tolist()
list_2018_cols = df_201819.columns.tolist()
diff_list = np.setdiff1d(list_2018_cols,list_2017_cols).tolist()
print(diff_list)

[]


In [31]:
list_2016_cols = df_201617.columns.tolist()
list_2017_cols = df_201718.columns.tolist()
diff_list = np.setdiff1d(list_2017_cols,list_2016_cols).tolist()
print(diff_list)

['School_Survey_Parent_Response_Rate_Pct']


In [32]:
# Stack the four school years, newest first.  pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
df_raw = pd.concat([df_201819, df_201718, df_201617, df_201516])
# reset_index() moves the per-year row labels into an 'index' column (that
# column is dropped in a later cell) and gives the stacked frame a fresh
# 0..n-1 index.
df_raw.reset_index(inplace = True)
display_all(df_raw.tail().T)

Unnamed: 0,1879,1880,1881,1882,1883
index,664,665,666,667,668
School_ID,610339,610173,610115,609996,609815
Long_Name,Amos Alonzo Stagg Elementary School,Jesse Sherwood Elementary School,Park Manor Elementary School,Charles N Holden Elementary School,Edward A Bouchet Math & Science Academy ES
School_Type,Neighborhood,Neighborhood,Neighborhood,Neighborhood,Neighborhood
Primary_Category,ES,ES,ES,ES,ES
Progress_Report_Year,2015,2015,2015,2015,2015
Blue_Ribbon_Award_Year,,,,,
Excelerate_Award_Gold_Year,,,,2015,
Spot_Light_Award_Year,,,,,
Improvement_Award_Year,,,2012,,


In [33]:
df_raw.shape

(1884, 85)

Since we have already utilized the data present in Primary Category, we can drop that column.  I will also remove a few average percentage columns that slipped through the cracks so far but hold only a single value per calendar year.

In [34]:
df_raw.drop(['index','Average_Length_Suspension_Avg_Pct','Teacher_Attendance_Avg_Pct','Suspensions_Per_100_Students_Avg_Pct','Student_Attendance_Avg_Pct','One_Year_Dropout_Rate_Avg_Pct'], axis = 1, inplace = True)

## Dataset Exporting for SQL Table Creation

To leverage this dataset for ad hoc analysis, I will export it into a set of csv files that would easily allow for creating a SQL database.  To start, I'll take out the school location data and place it into a separate table for export.

In [44]:
#export data
lat_long_df = df_raw[['School_ID', 'School_Latitude', 'School_Longitude']].copy()

#drop NaN values
lat_long_df.dropna(subset = ['School_Latitude','School_Longitude'], inplace = True)

#dedupe list
lat_long_df.drop_duplicates(inplace = True)

#push list to csv
lat_long_df.to_csv(f'{PATH}SQL/school_locations.csv', index = False)

With this data exported, I will use this table to fill in missing location values in `df_raw`.

In [46]:
df_raw.loc[10,'School_Latitude']

nan

In [68]:
lat_long_df.loc[lat_long_df.School_ID == 610115, 'School_Latitude']

1091    41.766477
Name: School_Latitude, dtype: float64

In [71]:
lat_long_df.set_index('School_ID')['School_Latitude']

School_ID
610104    41.993790
610070    41.967847
610251    41.882950
610268    41.740184
610089    41.950078
            ...    
609821    41.714402
610305    41.873587
609874    41.973715
610584    41.883606
400072    41.752458
Name: School_Latitude, Length: 472, dtype: float64

In [73]:
df_raw.School_ID.nunique()

476

In [65]:
# Fill missing coordinates in df_raw by looking each school up by School_ID.
# (The previous line was a pasted template that referenced undefined df1/df2.)
# drop_duplicates(subset='School_ID') guarantees a unique lookup index, which
# Series.map requires when it is given a Series.
location_lookup = lat_long_df.drop_duplicates(subset = 'School_ID').set_index('School_ID')
for coord_col in ['School_Latitude', 'School_Longitude']:
    # fillna keeps any coordinate df_raw already has and only patches the gaps.
    df_raw[coord_col] = df_raw[coord_col].fillna(df_raw['School_ID'].map(location_lookup[coord_col]))

In [None]:
# Fill only the missing latitudes, looking each school up by School_ID.
# drop_duplicates(subset='School_ID') keeps the lookup index unique, which
# Series.map requires when it is given a Series.
latitude_by_school = lat_long_df.drop_duplicates(subset = 'School_ID').set_index('School_ID')['School_Latitude']
df_raw['School_Latitude'] = df_raw['School_Latitude'].fillna(df_raw['School_ID'].map(latitude_by_school))

In [66]:
df_raw.tail()

Unnamed: 0,School_ID,Long_Name,School_Type,Primary_Category,Progress_Report_Year,Blue_Ribbon_Award_Year,Excelerate_Award_Gold_Year,Spot_Light_Award_Year,Improvement_Award_Year,Excellence_Award_Year,...,Student_Attendance_Year_2_Pct,Teacher_Attendance_Year_1_Pct,Teacher_Attendance_Year_2_Pct,One_Year_Dropout_Rate_Year_1_Pct,One_Year_Dropout_Rate_Year_2_Pct,Mobility_Rate_Pct,Chronic_Truancy_Pct,Supportive_School_Award,School_Latitude,School_Longitude
1879,610339,Amos Alonzo Stagg Elementary School,Neighborhood,ES,2015,,,,,,...,95.9,96.6,95.5,,,25.5,,NOT RATED,unknown,-87.649217
1880,610173,Jesse Sherwood Elementary School,Neighborhood,ES,2015,,,,,,...,95.9,97.6,96.3,,,32.3,,NOT RATED,unknown,-87.632279
1881,610115,Park Manor Elementary School,Neighborhood,ES,2015,,,,2012.0,,...,96.1,95.5,96.1,,,41.2,,NOT RATED,unknown,-87.611546
1882,609996,Charles N Holden Elementary School,Neighborhood,ES,2015,,2015.0,,,,...,94.8,95.5,94.3,,,14.2,,NOT RATED,unknown,-87.653792
1883,609815,Edward A Bouchet Math & Science Academy ES,Neighborhood,ES,2015,,,,,,...,93.9,96.2,95.8,,,32.8,,NOT RATED,unknown,-87.576075


In [None]:
df_raw.to_csv(f'{PATH}report_card_2015-2019.csv')

# Joined Dataset Preprocessing

With this data now purged, I will now address the various categorical columns and prepare them for processing by the ``RandomForestClassifier()`` - using the `DataFrameSummary()` package will make this process much easier.  But first, I will eliminate the whitespace that appears at the end of some of the column names to make future column references easier to manage.

In [None]:
df_raw.columns = [col.strip() for col in df_raw.columns]

In [None]:
dfs = DataFrameSummary(df_raw)

In [None]:
dfs.columns_types

## Boolean Column Preprocessing

Starting with the smallest subset of values, I will examine the boolean columns first.

In [None]:
dfs.bools

In [None]:
df_raw.Spot_Light_Award_Year.value_counts()

Since there are so few schools who won this award - and one can only presume that having received the award in 2012 vs 2013 would make little difference - I will convert this column into an indicator variable.

In [None]:
df_raw['Won_Spot_Light_Award'] = 0
df_raw.loc[df_raw.Spot_Light_Award_Year.notnull(), 'Won_Spot_Light_Award'] = 1
df_raw.drop('Spot_Light_Award_Year', axis = 1, inplace = True)
df_raw.Won_Spot_Light_Award.value_counts()

After confirming that this column has been transformed, I dropped the original column.  I will now examine the other boolean columns as well as other "Award Year" columns.  Assuming these columns are also easily transformed into indicator variables, I will create a new column indicating if the award had been given and then drop the original column.

In [None]:
df_raw.Improvement_Award_Year.value_counts()

In [None]:
df_raw['Won_Improvement_Award'] = 0
df_raw.loc[df_raw.Improvement_Award_Year.notnull(), 'Won_Improvement_Award'] = 1
df_raw.drop('Improvement_Award_Year', axis = 1, inplace = True)
df_raw.Won_Improvement_Award.value_counts()

In [None]:
df_raw.Excellence_Award_Year.value_counts()

In [None]:
df_raw['Won_Excellence_Award'] = 0
df_raw.loc[df_raw.Excellence_Award_Year.notnull(), 'Won_Excellence_Award'] = 1
df_raw.drop('Excellence_Award_Year', axis = 1, inplace = True)
df_raw.Won_Excellence_Award.value_counts()

In [None]:
df_raw.Blue_Ribbon_Award_Year.value_counts()

In [None]:
df_raw['Won_Blue_Ribbon_Award'] = 0
df_raw.loc[df_raw.Blue_Ribbon_Award_Year.notnull(), 'Won_Blue_Ribbon_Award'] = 1
df_raw.drop('Blue_Ribbon_Award_Year', axis = 1, inplace = True)
df_raw.Won_Blue_Ribbon_Award.value_counts()

In [None]:
df_raw.Excelerate_Award_Gold_Year.value_counts()

In [None]:
df_raw['Won_Excelerate_Award_Gold'] = 0
df_raw.loc[df_raw.Excelerate_Award_Gold_Year.notnull(), 'Won_Excelerate_Award_Gold'] = 1
df_raw.drop('Excelerate_Award_Gold_Year', axis = 1, inplace = True)
df_raw.Won_Excelerate_Award_Gold.value_counts()

## Categorical Column Preprocessing

With these boolean columns now dispositioned for analysis, I turn my attention to the categorical columns.

In [None]:
dfs.categoricals

In this list of categoricals, we see a few entries that clearly appear to be numerical.  I will examine these entries and make the necessary changes  so the values are properly represented in numerical columns.

In [None]:
df_raw.Average_Length_Suspension_Year_1_Pct.value_counts()

In [None]:
# Strip the ' days' suffix from both suspension-length columns.
# Only string cells are rewritten; NaN and non-string cells pass through
# unchanged.  This replaces a row-by-row loop that relied on a bare
# `except: pass` to skip non-string cells and would also have hidden any
# unrelated error.
for suspension_col in ['Average_Length_Suspension_Year_1_Pct', 'Average_Length_Suspension_Year_2_Pct']:
    df_raw[suspension_col] = df_raw[suspension_col].map(
        lambda cell: cell.replace(' days', '') if isinstance(cell, str) else cell
    )
        

In [None]:
sum(df_raw.Average_Length_Suspension_Year_1_Pct.isnull()), sum(df_raw.Average_Length_Suspension_Year_2_Pct.isnull())

For these two columns, we are not missing an overly significant number of values.  I will impute the mean for all missing values, but also mark out which schools did not provide this data.

In [None]:
df_raw["Missing_Suspension_Data"] = 0

df_raw.loc[df_raw.Average_Length_Suspension_Year_1_Pct.isnull(),'Missing_Suspension_Data'] = 1
df_raw.loc[df_raw.Average_Length_Suspension_Year_2_Pct.isnull(),'Missing_Suspension_Data'] = 1

In [None]:
df_raw.Missing_Suspension_Data.value_counts()

In [None]:
df_raw.Average_Length_Suspension_Year_1_Pct = df_raw.Average_Length_Suspension_Year_1_Pct.astype('float')
df_raw.Average_Length_Suspension_Year_2_Pct = df_raw.Average_Length_Suspension_Year_2_Pct.astype('float')

In [None]:
year_1_mean = np.around(df_raw.Average_Length_Suspension_Year_1_Pct.mean(),2)
year_2_mean = np.around(df_raw.Average_Length_Suspension_Year_2_Pct.mean(),2)
year_1_mean, year_2_mean

In [None]:
# Assign the filled Series back to the frame.  Calling fillna(inplace=True) on
# a column obtained by attribute access is chained assignment: it is
# deprecated in pandas 2.x and leaves df_raw unchanged under copy-on-write.
df_raw['Average_Length_Suspension_Year_1_Pct'] = df_raw['Average_Length_Suspension_Year_1_Pct'].fillna(year_1_mean)
df_raw['Average_Length_Suspension_Year_2_Pct'] = df_raw['Average_Length_Suspension_Year_2_Pct'].fillna(year_2_mean)

I will now address the next column that appears to be misclassified, the 'School_Survey_Parent_Response_Rate_Pct' column.

In [None]:
df_raw.School_Survey_Parent_Response_Rate_Pct.value_counts()

In this response rate column, we see a hodgepodge of values.  I will strip the '%' sign from all entries, and assign the ' < 30%' and '> 75%' to 29 and 76, respectively - the Random Forest will be able to detect those differences very easily in assessing this column for predictive value.

In [None]:
df_raw.loc[df_raw.School_Survey_Parent_Response_Rate_Pct == '< 30%','School_Survey_Parent_Response_Rate_Pct'] = 29
df_raw.loc[df_raw.School_Survey_Parent_Response_Rate_Pct == '> 75%','School_Survey_Parent_Response_Rate_Pct'] = 76
df_raw.loc[df_raw.School_Survey_Parent_Response_Rate_Pct == '.','School_Survey_Parent_Response_Rate_Pct'] = 0
df_raw.loc[df_raw.School_Survey_Parent_Response_Rate_Pct.isnull(),'School_Survey_Parent_Response_Rate_Pct'] = 0
df_raw.School_Survey_Parent_Response_Rate_Pct = df_raw.School_Survey_Parent_Response_Rate_Pct.apply(lambda x: int(str(x).replace('%','')))


In [None]:
df_raw.School_Survey_Parent_Response_Rate_Pct.value_counts()

There are a significant number of missing values here - recall from above that this data was either not collected or not reported in the first two school years covered by my analysis - so I will impute the mean for missing values and then tag those rows with missing data.

In [None]:
mean_rate = round(df_raw.loc[df_raw.School_Survey_Parent_Response_Rate_Pct > 0, 'School_Survey_Parent_Response_Rate_Pct'].mean(), 1)
df_raw['Has_Parent_Response_Rate'] = 1
df_raw.loc[df_raw.School_Survey_Parent_Response_Rate_Pct == 0, 'Has_Parent_Response_Rate'] = 0
df_raw.loc[df_raw.School_Survey_Parent_Response_Rate_Pct == 0, 'School_Survey_Parent_Response_Rate_Pct'] = mean_rate

df_raw.School_Survey_Parent_Response_Rate_Pct.value_counts()

In [None]:
df_raw.Has_Parent_Response_Rate.value_counts()

In [None]:
dfs = DataFrameSummary(df_raw)
dfs.categoricals

In [None]:
df_raw.Supportive_School_Award.value_counts()

In [None]:
# Title-case the award text with the vectorized .str accessor, which leaves
# missing values as NaN; apply(lambda x: x.title()) raises AttributeError as
# soon as it meets a NaN (a float).
df_raw.Supportive_School_Award = df_raw.Supportive_School_Award.str.title()
# Treat the placeholder 'Coming Soon' as 'Not Rated'.
df_raw.loc[df_raw.Supportive_School_Award == 'Coming Soon','Supportive_School_Award'] = 'Not Rated'

In [None]:
df_raw.Supportive_School_Award.value_counts()

In [None]:
def train_cats(df, col_list):
    """Cast every column named in ``col_list`` to pandas ``category`` dtype.

    The conversion is written back into ``df`` itself, and that same frame
    is returned so the call can be used in an assignment.
    """
    for target in col_list:
        as_category = df[target].astype("category")
        df[target] = as_category
    return df

In [None]:
cat_cols = ['School_Type', 'Student_Growth_Rating',
       'Student_Attainment_Rating', 'Culture_Climate_Rating',
       'Healthy_School_Certification', 'Creative_School_Certification',
       'School_Survey_Involved_Families',
       'School_Survey_Supportive_Environment',
       'School_Survey_Ambitious_Instruction',
       'School_Survey_Effective_Leaders',
       'School_Survey_Collaborative_Teachers', 'School_Survey_Safety',
       'School_Survey_School_Community',
       'School_Survey_Parent_Teacher_Partnership',
       'School_Survey_Quality_Of_Facilities', 'Supportive_School_Award']

In [None]:
df_raw = train_cats(df_raw, cat_cols)

Many of the categorical columns contain categories that possess an obvious order, but by default, pandas will assign the numerical coding for the category based on an alphabetical ordering.  To override this default, I will reorder the categories in each of these columns in their logical order, i.e. from "Very Weak" to "Very Strong."

In [None]:
#fix capitalization
df_raw['School_Survey_Quality_Of_Facilities'].value_counts()

In [None]:
#fix capitalization
df_raw['School_Survey_Parent_Teacher_Partnership'].value_counts()

In [None]:
#fix capitalization
df_raw['School_Survey_School_Community'].value_counts()

In [None]:
df_raw['Creative_School_Certification'].value_counts()

In [None]:
# Rating levels from lowest to highest; the list order defines the integer
# codes of the ordered categorical.
avg_text_vals = ["NO DATA AVAILABLE","FAR BELOW AVERAGE", "BELOW AVERAGE","AVERAGE","ABOVE AVERAGE","FAR ABOVE AVERAGE"]
# set_categories() returns a new Series; its `inplace` argument was deprecated
# in pandas 1.3 and removed in 2.0, so the result is assigned back.
df_raw['Student_Growth_Rating'] = df_raw['Student_Growth_Rating'].cat.set_categories(avg_text_vals, ordered=True)
df_raw['Student_Attainment_Rating'] = df_raw['Student_Attainment_Rating'].cat.set_categories(avg_text_vals, ordered=True)

In [None]:
df_raw['Culture_Climate_Rating'].value_counts()

In [None]:
# 'Involved Family' is not a column of this dataset; the survey column is
# named 'School_Survey_Involved_Families' (see cat_cols).
df_raw['School_Survey_Involved_Families'].value_counts()

In [None]:
# Organization levels from lowest to highest; the list order defines the
# integer codes of the ordered categorical.
organized_text_vals = ["NOT YET ORGANIZED","PARTIALLY ORGANIZED", "MODERATELY ORGANIZED","ORGANIZED","WELL-ORGANIZED"]
# set_categories() returns a new Series; its `inplace` argument was deprecated
# in pandas 1.3 and removed in 2.0, so the result is assigned back.
df_raw['Culture_Climate_Rating'] = df_raw['Culture_Climate_Rating'].cat.set_categories(organized_text_vals, ordered=True)

In [None]:
# Survey ratings from weakest to strongest; the list order defines the
# integer codes of the ordered categorical.
weak_to_strong_cats = ["VERY WEAK", "WEAK", "NEUTRAL", "STRONG", "VERY STRONG"]

def set_cats_weak_to_strong(df, col_list):
    """Re-declare each listed categorical column as ordered weak-to-strong.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns; it is modified in place.
    col_list : iterable of str
        Names of columns that already have ``category`` dtype.

    Values that are not in ``weak_to_strong_cats`` become missing (code -1).
    Returns ``None``.
    """
    for col in col_list:
        # set_categories() returns a new Series; its `inplace` argument was
        # deprecated in pandas 1.3 and removed in 2.0, so assign the result back.
        df[col] = df[col].cat.set_categories(weak_to_strong_cats, ordered=True)

In [None]:
col_list = [
    'School_Survey_Involved_Families','School_Survey_Supportive_Environment',
    'School_Survey_Ambitious_Instruction','School_Survey_Effective_Leaders',
    'School_Survey_Collaborative_Teachers','School_Survey_Safety'
           ]

In [None]:
df_raw['Healthy_School_Certification'].value_counts()

In [None]:
# 'Involved Family' is not a column of this dataset; the survey column is
# named 'School_Survey_Involved_Families' (see cat_cols).
df_raw["School_Survey_Involved_Families"].cat.codes

In [None]:
# Replace the categorical survey rating with its integer category codes.
# The previous line referenced undefined names (`df_Raw`, `df`) and a
# placeholder attribute (`cc`).
# NOTE(review): presumably the intended column is
# 'School_Survey_Involved_Families' — confirm.  This cell is not re-runnable:
# after it executes the column no longer has a .cat accessor.
df_raw['School_Survey_Involved_Families'] = df_raw['School_Survey_Involved_Families'].cat.codes

Since the goal of this project is to predict the overall performance level of the school from the report card data to determine which features of the report card best predict the score, I will drop any rows which lack a value for this column.

In [None]:
df_raw['CPS Performance Policy Level'].isnull().index

In [None]:
# Drop every row that has no value for the target column.  (A stray trailing
# '.' after the call made this cell a SyntaxError.)
# NOTE(review): assumes the target column is literally named
# 'CPS Performance Policy Level' in df_raw — confirm against df_raw.columns.
df_raw.dropna(subset=['CPS Performance Policy Level'], axis = 0, inplace = True)

In [None]:
df_raw.shape

With the categorical columns now processed and rows with null values for the predictor variable dropped, I will turn my attention to other columns with missing values.  Pandas will automatically assign an integer value of $-1$ to missing categorical values, so those columns can be considered dispositioned, but I will process missing values for continuous or integer columns by setting the numeric to "0" and also establishing an "_is_missing" column to ensure any predictive value of that number being absent from the dataset is captured by the Random Forest.

In [None]:
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))

In [None]:
df_raw["Average Length of Suspension 2012"].iloc[0] 

In [None]:
len(df_raw[df_raw["Average Length of Suspension 2012"].isnull()].index.tolist())



In [None]:
col = "School Community"
df_raw[df_raw[col].isnull()].index.tolist()

col + "_test"

In [None]:
def clean_missing_numerical_col(df, col_list):
    """Flag and zero-fill missing values in the given numeric columns.

    For every column ``c`` in ``col_list`` a companion column
    ``c + "_is_missing"`` is added (1 where ``c`` was null, 0 otherwise),
    and the nulls in ``c`` are then replaced with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    col_list : iterable of str
        Names of the columns to process.

    Returns ``None``.
    """
    for col in col_list:
        new_col_name = col + "_is_missing"

        # Build the indicator from the null mask in one step, before the
        # nulls are overwritten below.  This is label-aligned, so it stays
        # correct when the index is not 0..n-1 (e.g. after rows were dropped);
        # the old per-row `.iloc[label] = 1` treated labels as positions.
        df[new_col_name] = df[col].isnull().astype(int)

        # Index by the column *name held in* `col`; `df.col` would look up a
        # column literally called "col".
        df[col] = df[col].fillna(0)

In [None]:
test_col_list = ["Average Length of Suspension 2012"]

clean_missing_numerical_col(df_raw, test_col_list)

In [None]:
setIconCats(df, icon_cols)

#confirm output
df["Teachers Icon "].cat.categories

With this forced ordering now in place, I will transform all categoricals into their integer values.

In [None]:
# Summarize the working frame; `df` is not defined in this notebook, the
# combined dataset lives in df_raw.
display_all(df_raw.describe(include='all').T)

In [None]:
dfs = DataFrameSummary(df_raw)
display_all(dfs.missing_frac)