# PyCity Schools Analysis

* Reading scores were consistently higher than math scores among both lower- and higher-performing schools. Only smaller and charter schools almost closed this gap.

* Larger schools on average spent more per student than smaller schools. This could indicate that they have a student population that has higher needs, or it may indicate some are not leveraging their scale to produce efficiencies.

* Average scores are not a good predictor of passing rates. Charter schools had an average reading score of 84 but achieved a 96.5% passing rate, while district schools had an average of 81 with only an 81% passing rate. Median score might provide a more useful measure.
---

In [3]:
import pandas as pd


In [4]:
# Load the school-level and student-level CSV files into DataFrames.
schools, students = (
    pd.read_csv(csv_path)
    for csv_path in ('./raw_data/schools_complete.csv', './raw_data/students_complete.csv')
)

## District Summary

In [23]:
# District-level school data: number of schools and combined budget.
num_schools = schools['School ID'].count()
total_budget = schools['budget'].sum()

# Merge to student level (left join keeps every school). Both inputs carry a
# 'name' column, so pandas suffixes them: name_x is the school, name_y the student.
school_students = pd.merge(schools, students, left_on='name', right_on='school', how='left')
school_students.rename(columns={'name_x': 'School Name', 'name_y': 'Student Name', 'grade': 'Grade'}, inplace=True)

# Add passing columns, assuming >= 70 is passing and one must pass both reading and math to pass overall
school_students['Passing Reading'] = school_students['reading_score'] >= 70
school_students['Passing Math'] = school_students['math_score'] >= 70
school_students['Passing Overall'] = school_students['Passing Math'] & school_students['Passing Reading']

# District-level student counts (count() ignores rows without a Student ID)
total_students = school_students['Student ID'].count()
pass_math = school_students.loc[school_students['Passing Math'], 'Student ID'].count()
pass_reading = school_students.loc[school_students['Passing Reading'], 'Student ID'].count()
pass_total = school_students.loc[school_students['Passing Overall'], 'Student ID'].count()

# Build the one-row district summary directly rather than grouping every row
# under a dummy key. The '% Passing ...' columns hold fractions in [0, 1],
# not values scaled to 100.
ss_total = pd.DataFrame(
    {
        'Total Students': total_students,
        'Average Math Score': school_students['math_score'].mean(),
        'Average Reading Score': school_students['reading_score'].mean(),
        '% Passing Math': pass_math / total_students,
        '% Passing Reading': pass_reading / total_students,
        '% Passing Overall': pass_total / total_students,
        'Total Budget': total_budget,
        'Total Schools': num_schools,
    },
    index=[0],
)

# Show the summary columns in presentation order
ss_total[['Total Schools', 'Total Students', 'Total Budget', 'Average Math Score', 'Average Reading Score', '% Passing Math', '% Passing Reading', '% Passing Overall']]

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
0,15,39170,24649428,78.985371,81.87784,0.749809,0.858055,0.651723


## School Summary

In [24]:
# Student-level data aggregated per school: head count, mean scores, and the
# number of students passing each subject / overall.
school_students_gb = school_students.groupby(by='School ID')
funcs = {'Student ID': 'count', 'math_score': 'mean', 'reading_score': 'mean',
         'Passing Math': 'sum', 'Passing Reading': 'sum', 'Passing Overall': 'sum'}
school_totals = school_students_gb.agg(funcs)

# Attach the school-level attributes. Key the join on the actual 'School ID'
# values rather than on row position, so it stays correct even if the rows of
# `schools` are filtered or re-ordered.
school_result = pd.merge(schools.set_index('School ID')[['name', 'type', 'budget']], school_totals,
                         left_index=True, right_index=True)

# Derived per-school metrics. 'Student ID' holds the student count after the
# aggregation above; the '% Passing ...' columns are fractions in [0, 1].
school_result['Per Student Budget'] = school_result['budget'] / school_result['Student ID']
school_result['% Passing Math'] = school_result['Passing Math'] / school_result['Student ID']
school_result['% Passing Reading'] = school_result['Passing Reading'] / school_result['Student ID']
school_result['% Passing Overall'] = school_result['Passing Overall'] / school_result['Student ID']

# Make school name the index, rename columns, and show result
school_result = school_result.rename(columns={'name': 'Name', 'type': 'Type', 'budget': 'Budget',
                                              'math_score': 'Math Score',
                                              'reading_score': 'Reading Score'})
school_result = school_result.set_index(keys='Name')
school_result

Unnamed: 0_level_0,Type,Budget,Reading Score,Passing Math,Student ID,Passing Overall,Math Score,Passing Reading,Per Student Budget,% Passing Math,% Passing Reading,% Passing Overall
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Huang High School,District,1910635,81.182722,1916.0,2917,1561.0,76.629414,2372.0,655.0,0.656839,0.813164,0.535139
Figueroa High School,District,1884411,81.15802,1946.0,2949,1569.0,76.711767,2381.0,639.0,0.659885,0.807392,0.532045
Shelton High School,Charter,1056600,83.725724,1653.0,1761,1583.0,83.359455,1688.0,600.0,0.938671,0.958546,0.898921
Hernandez High School,District,3022020,80.934412,3094.0,4635,2481.0,77.289752,3748.0,652.0,0.66753,0.80863,0.535275
Griffin High School,Charter,917500,83.816757,1371.0,1468,1330.0,83.351499,1426.0,625.0,0.933924,0.97139,0.905995
Wilson High School,Charter,1319574,83.989488,2143.0,2283,2068.0,83.274201,2204.0,578.0,0.938677,0.965396,0.905826
Cabrera High School,Charter,1081356,83.97578,1749.0,1858,1697.0,83.061895,1803.0,582.0,0.941335,0.970398,0.913348
Bailey High School,District,3124928,81.033963,3318.0,4976,2719.0,77.048432,4077.0,628.0,0.666801,0.819333,0.546423
Holden High School,Charter,248087,83.814988,395.0,427,381.0,83.803279,411.0,581.0,0.925059,0.962529,0.892272
Pena High School,Charter,585858,84.044699,910.0,962,871.0,83.839917,923.0,609.0,0.945946,0.959459,0.905405


## Top Performing Schools (By Passing Rate)

In [41]:
# Rank schools from highest to lowest overall passing rate and keep the first five.
top_performing = (
    school_result
    .sort_values('% Passing Overall', ascending=False)
    .head(5)
)
top_performing

Unnamed: 0_level_0,Type,Budget,Reading Score,Passing Math,Student ID,Passing Overall,Math Score,Passing Reading,Per Student Budget,% Passing Math,% Passing Reading,% Passing Overall,School Size,Budget per Student Range
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Cabrera High School,Charter,1081356,83.97578,1749.0,1858,1697.0,83.061895,1803.0,582.0,0.941335,0.970398,0.913348,Medium (1000 - 2000),<$585
Thomas High School,Charter,1043130,83.84893,1525.0,1635,1487.0,83.418349,1591.0,638.0,0.932722,0.973089,0.90948,Medium (1000 - 2000),$615-645
Griffin High School,Charter,917500,83.816757,1371.0,1468,1330.0,83.351499,1426.0,625.0,0.933924,0.97139,0.905995,Medium (1000 - 2000),$615-645
Wilson High School,Charter,1319574,83.989488,2143.0,2283,2068.0,83.274201,2204.0,578.0,0.938677,0.965396,0.905826,Large (2000 - 5000),<$585
Pena High School,Charter,585858,84.044699,910.0,962,871.0,83.839917,923.0,609.0,0.945946,0.959459,0.905405,Small (< 1000),$585-615


## Bottom Performing Schools (By Passing Rate)

In [26]:
# Rank schools from lowest to highest overall passing rate and keep the first five.
bottom_performing = (
    school_result
    .sort_values('% Passing Overall', ascending=True)
    .head(5)
)
bottom_performing

Unnamed: 0_level_0,Type,Budget,Reading Score,Passing Math,Student ID,Passing Overall,Math Score,Passing Reading,Per Student Budget,% Passing Math,% Passing Reading,% Passing Overall
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Rodriguez High School,District,2547363,80.744686,2654.0,3999,2119.0,76.842711,3208.0,637.0,0.663666,0.802201,0.529882
Figueroa High School,District,1884411,81.15802,1946.0,2949,1569.0,76.711767,2381.0,639.0,0.659885,0.807392,0.532045
Huang High School,District,1910635,81.182722,1916.0,2917,1561.0,76.629414,2372.0,655.0,0.656839,0.813164,0.535139
Hernandez High School,District,3022020,80.934412,3094.0,4635,2481.0,77.289752,3748.0,652.0,0.66753,0.80863,0.535275
Johnson High School,District,3094650,80.966394,3145.0,4761,2549.0,77.072464,3867.0,650.0,0.660576,0.812224,0.535392


## Math Passing Rate by Grade

In [30]:
# Share of students passing math for each school (rows) and grade (columns);
# averaging the boolean 'Passing Math' flag yields the passing fraction.
school_students.pivot_table(values='Passing Math', index='School Name', columns='Grade', aggfunc='mean')

Grade,10th,11th,12th,9th
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,0.663438,0.684253,0.642996,0.671468
Cabrera High School,0.939914,0.923237,0.950262,0.952652
Figueroa High School,0.665793,0.653032,0.68599,0.641355
Ford High School,0.693503,0.687405,0.654917,0.689076
Griffin High School,0.940887,0.941828,0.928082,0.924205
Hernandez High School,0.667482,0.668199,0.667377,0.667149
Holden High School,0.929825,0.912621,0.951807,0.913386
Huang High School,0.634941,0.647712,0.661538,0.68128
Johnson High School,0.651182,0.669449,0.650641,0.667857
Pena High School,0.944,0.960938,0.950276,0.930909


## Reading Passing Rate by Grade

In [31]:
# Share of students passing reading for each school (rows) and grade (columns);
# averaging the boolean 'Passing Reading' flag yields the passing fraction.
school_students.pivot_table(values='Passing Reading', index='School Name', columns='Grade', aggfunc='mean')

Grade,10th,11th,12th,9th
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,0.835351,0.805755,0.81323,0.821674
Cabrera High School,0.974249,0.970954,0.968586,0.967803
Figueroa High School,0.812582,0.781382,0.819646,0.815421
Ford High School,0.79096,0.799697,0.782931,0.795918
Griffin High School,0.972906,0.975069,0.979452,0.96088
Hernandez High School,0.806846,0.817096,0.797441,0.811143
Holden High School,0.964912,0.961165,0.987952,0.944882
Huang High School,0.821382,0.805825,0.811966,0.812796
Johnson High School,0.814996,0.796327,0.819444,0.818571
Pena High School,0.956,0.949219,0.944751,0.981818


## Scores by School Spending

In [39]:
# Bucket each school by its per-student budget.
# pd.cut uses right-closed intervals by default, so e.g. exactly 585 lands in
# '<$585'; values above 675 fall outside every bin and become NaN.
bins = [0, 585, 615, 645, 675]
labels = ['<$585', '$585-615', '$615-645', '$645-675']
spending_series = pd.cut(school_result['Per Student Budget'], bins, labels=labels)
school_result['Budget per Student Range'] = spending_series

# Average the school-level metrics within each spending bucket (every school
# counts equally, regardless of its size).
budget_groups = school_result.groupby(by='Budget per Student Range')
# Select columns with a list: indexing a GroupBy with a bare tuple of names is
# deprecated in pandas 1.x and raises in pandas 2.x.
budget_groups[['Math Score', 'Reading Score', '% Passing Math', '% Passing Reading', '% Passing Overall']].mean()

Unnamed: 0_level_0,Math Score,Reading Score,% Passing Math,% Passing Reading,% Passing Overall
Budget per Student Range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,83.455399,83.933814,0.934601,0.966109,0.903695
$585-615,83.599686,83.885211,0.942309,0.959003,0.902163
$615-645,79.079225,81.891436,0.756682,0.861066,0.661121
$645-675,76.99721,81.027843,0.661648,0.81134,0.535269


## Scores by School Size

In [33]:
# Bucket each school by enrolment. After the per-school aggregation the
# 'Student ID' column holds the student count.
# pd.cut uses right-closed intervals by default; counts above 5000 fall
# outside every bin and become NaN.
bins = [0, 1000, 2000, 5000]
names = ['Small (< 1000)', 'Medium (1000 - 2000)', 'Large (2000 - 5000)']

size_series = pd.cut(school_result['Student ID'], bins, labels=names)
school_result['School Size'] = size_series

# Average the school-level metrics within each size bucket.
size_groups = school_result.groupby(by='School Size')
# Select columns with a list: indexing a GroupBy with a bare tuple of names is
# deprecated in pandas 1.x and raises in pandas 2.x.
size_groups[['Math Score', 'Reading Score', '% Passing Math', '% Passing Reading', '% Passing Overall']].mean()

Unnamed: 0_level_0,Math Score,Reading Score,% Passing Math,% Passing Reading,% Passing Overall
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (< 1000),83.821598,83.929843,0.935502,0.960994,0.898839
Medium (1000 - 2000),83.374684,83.864438,0.935997,0.967907,0.906215
Large (2000 - 5000),77.746417,81.344493,0.699634,0.827666,0.58286


## Scores by School Type

In [40]:
# Average the school-level metrics by school type (Charter/District).
# Uses its own variable name so the School Size grouping is not overwritten.
type_groups = school_result.groupby(by='Type')
# Select columns with a list: indexing a GroupBy with a bare tuple of names is
# deprecated in pandas 1.x and raises in pandas 2.x.
type_groups[['Math Score', 'Reading Score', '% Passing Math', '% Passing Reading', '% Passing Overall']].mean()

Unnamed: 0_level_0,Math Score,Reading Score,% Passing Math,% Passing Reading,% Passing Overall
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,0.936208,0.965865,0.904322
District,76.956733,80.966636,0.665485,0.807991,0.536722
