In [None]:
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import os
import pandas as pd
from scipy import stats


With this script, we explore the performance of schools in terms of test registration and actual test taking. A good way to compare the schools is to express their performance in percentiles. 
First, we will read in and polish the data and then explore their distribution. 

In [6]:
# function file to data frame
def file_to_df(file):
    """Read a delimited text file into a pandas DataFrame.

    The delimiter is picked from the file extension, matched
    case-insensitively: ``.csv`` is read comma-separated and ``.tsv``
    tab-separated. The first row of the file is used as the header.

    Parameters
    ----------
    file : str or path-like
        Path to the ``.csv`` or ``.tsv`` file.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.tsv``.
    """
    separators = {'.csv': ',', '.tsv': '\t'}
    _, file_extension = os.path.splitext(file)
    sep = separators.get(file_extension.lower())
    if sep is None:
        # Fail loudly: merely printing a hint would leave no frame to return.
        raise ValueError(
            'Please provide csv or tsv file format. Got extension: %r' % file_extension
        )
    return pd.read_csv(file, sep=sep, header=0)


In [7]:
# Short snake_case headers; they are assigned positionally, so their order
# has to match the column order of the file.
colnames = [
    'DBN',
    'school_name',
    'year',
    'grade',
    'october_school_enrollment',
    'registered_for_test',
    'took_test',
]

# Load the raw table and swap in the headers defined above.
df = file_to_df('D5_SHSAT_Registrations_and_Testers.csv')
df.columns = colnames

# Order the rows by number of registrations (ascending) and preview the top.
df = df.sort_values(by='registered_for_test')
print(df.head())


        DBN                                        school_name  year  grade  \
112  84M350                      Democracy Prep Charter School  2013      9   
76   05M670  Thurgood Marshall Academy for Learning and Soc...  2015      9   
126  84M481               Democracy Prep Harlem Charter School  2014      9   
113  84M350                      Democracy Prep Charter School  2014      9   
52   05M369      Urban Assembly School for the Performing Arts  2016      9   

     october_school_enrollment  registered_for_test  took_test  
112                        124                    0          0  
76                          98                    0          0  
126                        119                    0          0  
113                        122                    0          0  
52                         118                    0          0  


In fact, for a fair comparison, we have to normalize all measures (such as test registrations) by the total number of pupils enrolled in the school.


In [8]:
# calculate the ratios (both expressed in percent)

# Share of enrolled pupils who took the test; nan_to_num replaces NaN results with 0.
df['enroll_to_take_ratio'] = np.nan_to_num(df['took_test']/df['october_school_enrollment']*100.00)

# Share of registered pupils who took the test. A 0/0 division (no registrations,
# no test takers) yields NaN, and those NaN values are kept as-is.
# Note: reassigning `df[col] = df[col].dropna()` would NOT remove them, because
# column assignment re-aligns on the index and restores the dropped rows as NaN.
df['register_to_take_ratio'] = df['took_test']/df['registered_for_test']*100.00

# Order rows chronologically and, within each year, by test participation.
df = df.sort_values(by=['year','enroll_to_take_ratio'],ascending=True)


For a standardized view of the distribution of the data, we use percentiles.


In [9]:
def get_percentiles(column, bins_percentile=(0, 20, 40, 60, 80, 100)):
    """Rank a column as percentiles and assign each value to a percentile bin.

    Parameters
    ----------
    column : pandas.Series
        Values to rank. NaN entries receive a NaN percentile and a NaN bin.
    bins_percentile : sequence of numbers, optional
        Monotonically increasing bin edges on the 0-100 percentile scale.
        ``N`` edges delimit ``N - 1`` bins; each bin includes its right edge.

    Returns
    -------
    data_percentile : pandas.Series
        ``100 * column.rank(pct=True, method='min')``; ties share the lowest
        rank of their group.
    bins : numpy.ndarray
        Per-value bin label ``step * k``, where ``k`` is the 1-based index of
        the bin the percentile falls into and ``step = 100 / (N - 1)``. For
        the default edges the labels are 20, 40, 60, 80 and 100.

    Raises
    ------
    ValueError
        If fewer than two bin edges are given.
    """
    n_intervals = len(bins_percentile) - 1
    if n_intervals < 1:
        raise ValueError('bins_percentile needs at least two edges.')

    data_percentile = 100 * column.rank(pct=True, method='min')

    # One step per interval. The parentheses matter: 100/len(...)-1 would
    # evaluate to (100/N) - 1 instead of 100/(N-1).
    steps = 100 / n_intervals
    bin_index = np.digitize(data_percentile, bins_percentile, right=True)

    # np.digitize places NaN past the last edge; keep such entries as NaN
    # rather than labelling them with a bin that does not exist.
    missing = np.isnan(np.asarray(data_percentile, dtype=float))
    bins = np.where(missing, np.nan, steps * bin_index)
    return data_percentile, bins

In [10]:
# get_percentiles returns (percentiles, bins); call it once per ratio column
# and store both parts, instead of computing each result twice.
for ratio_col in ('enroll_to_take_ratio', 'register_to_take_ratio'):
    ratio_percentiles, ratio_bins = get_percentiles(df[ratio_col])
    df[ratio_col + '_percentiles'] = ratio_percentiles
    df[ratio_col + '_bins'] = ratio_bins

In [11]:
# 3D scatter: year (x) vs. percentile bin (y) vs. percent of enrolled pupils
# who took the test (z).
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x = df['year']
y = df['enroll_to_take_ratio_bins']
z = df['enroll_to_take_ratio']
ax.scatter(x, y, z)

# Percentile axis spans the full 0-100 range; one x tick per year in the data.
ax.set_ylim(0, 100)
ax.set_xticks(np.arange(min(x), max(x)+1, 1.0))
ax.set_yticks(np.arange(10, 100, 20.0))

# Label every axis so the figure can be read on its own.
ax.set_xlabel(r'Year', fontsize=16)
ax.set_ylabel(r'Percentile', fontsize=16)
ax.set_zlabel(r'Pupils taking test (% of enrolled)', fontsize=16)
ax.set_title(r'Percent pupils taking test', fontsize=16)
plt.show()


This figure shows that the worst-performing schools, in terms of pupils taking the test, have no pupils at all taking it, whereas the best schools have a test participation of 40%. 


In [12]:
# 3D scatter: year (x) vs. percentile bin (y) vs. percent of registered pupils
# who took the test (z).
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x = df['year']
y = df['register_to_take_ratio_bins']
z = df['register_to_take_ratio']
ax.scatter(x, y, z)

# Percentile axis spans the full 0-100 range; one x tick per year in the data.
ax.set_ylim(0, 100)
ax.set_xticks(np.arange(min(x), max(x)+1, 1.0))
ax.set_yticks(np.arange(0, 100, 20.0))

# Axis labels, matching the layout of the enrollment-based figure.
ax.set_xlabel(r'Year', fontsize=16)
ax.set_ylabel(r'Percentile', fontsize=16)
ax.set_zlabel(r'Pupils taking test (% of registered)', fontsize=16)
ax.set_title(r'Percent registered pupils taking test', fontsize=16)
plt.show()


Another, more specific aim would be to increase the share of registered pupils who actually take the test. In the lowest percentile bins, fewer than 50% of registered students actually take the test. 

In [13]:
# Percentile rank of the raw registration and test-taker counts. Only the
# percentile part of get_percentiles' (percentiles, bins) result is stored.
for count_col, percentile_col in (
    ('registered_for_test', 'register_percentile'),
    ('took_test', 'took_test_percentile'),
):
    df[percentile_col] = get_percentiles(df[count_col])[0]

# Write the augmented table, including the derived columns, to a CSV file.
file_name = 'augmented_D5_SHSAT_Registrations_and_Testers.csv'
df.to_csv(path_or_buf=file_name)
