# PyCity Schools Analysis

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from functools import reduce

#set up paths for raw data
school_path = "Resources/schools_complete.csv"
student_path = "Resources/students_complete.csv"

#import raw data into data fromes
school_raw_df=pd.read_csv(school_path)
student_raw_df=pd.read_csv(student_path)

#merge the student and school dataframes by school name
merged_df=pd.merge(student_raw_df, school_raw_df, how='left', on=["school_name", "school_name"])

#add columns that output 1 if the student passed math/reading (score >= 70) and a 0 if they did not pass math/reading
merged_df['passed_math'] = np.where(merged_df["math_score"]>=70.0, 1, 0)
merged_df['passed_reading'] = np.where(merged_df["reading_score"]>=70.0, 1, 0)

merged_df.head()

#add format varibales that will be commonly used throughout exercise
percent_format="{0:,.2f}%".format
dollar_format="${0:,.2f}".format
number_format="{0:,}".format

## District Summary

In [2]:
#Calculate the total number of schools
total_schools_DS=len(school_raw_df["school_name"].unique())

#calculate total number of students
total_students_DS=student_raw_df["student_name"].count()

#calculate total budget
total_budget_DS=school_raw_df["budget"].sum()

#calculate the average math score
average_mathscore_DS=merged_df["math_score"].mean()

#calculate the average reading score
average_readingscore_DS=merged_df["reading_score"].mean()

#Calculate the percentage of students with a passing math score (70 or greater)
pass_math_DS=merged_df[merged_df["math_score"]>= 70.0].count()
percent_pass_math_DS=pass_math_DS["math_score"]/total_students_DS*100

#Calculate the percentage of students with a passing reading score (70 or greater)
pass_reading_DS=merged_df[merged_df["reading_score"]>= 70.0].count()
percent_pass_reading_DS=pass_reading_DS["reading_score"]/total_students_DS*100

#Calculate the overall passing rate (overall average score), i.e. (avg. math score + avg. reading score)/2
overall_passing_rate_DS=(average_mathscore_DS+average_readingscore_DS)/2

#Create a dataframe to hold the above results and apply formatting
district_summary_df=pd.DataFrame([[total_schools_DS,
                                   total_students_DS,
                                   total_budget_DS,
                                   average_mathscore_DS,
                                   average_readingscore_DS,
                                   percent_pass_math_DS,
                                   percent_pass_reading_DS,
                                   overall_passing_rate_DS]],
                                  index=[0],
                                  columns = ["Total Schools",
                                         "Total Students",
                                         "Total Budget",
                                         "Average Math Score",
                                         "Average Reading Score",
                                         "% Passing Math",
                                         "% Passing Reading",
                                         "% Overall Passing Rate"]
                                 )

district_summary_df["Total Budget"]=district_summary_df["Total Budget"].map(dollar_format)
district_summary_df["Total Students"]=district_summary_df["Total Students"].map(number_format)
district_summary_df.iloc[:,3:8]=district_summary_df.iloc[:,3:8].applymap(percent_format)
district_summary_df.head()

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,"$24,649,428.00",78.99%,81.88%,74.98%,85.81%,80.43%


## School Summary

In [3]:
#create a dictionary of functions that need to be performed on the merged_df to get school summary values:
agg_dict = {'type':np.unique,
            'student_name':np.size,
            'budget':np.unique,
            'math_score':np.mean,
            'reading_score':np.mean,
            'passed_math': np.sum,
            'passed_reading': np.sum,
           }

columns = {'math_score': 'Average Math Score',
          'reading_score': 'Average Reading Score',
          'type': 'School Type',
          'student_name': 'Total Students',
          'passed_math': 'Total Stu Math Pass',
          'passed_reading': 'Total Stu Reading Pass',
          'budget': 'School Budget'
          } 
SS_raw = merged_df.groupby(['school_name']).agg(agg_dict).rename(columns = columns)

#calculate budget per student by school name
SS_raw['Budget per Student']=SS_raw['School Budget']/SS_raw['Total Students']

#calculate percent of students passing math (score of 70 or greater) by school name
SS_raw['% Passing Math']=SS_raw['Total Stu Math Pass']/SS_raw['Total Students']*100

#calculate percent of students passing reading (score of 70 or greater) by school name
SS_raw['% Passing Reading']=SS_raw['Total Stu Reading Pass']/SS_raw['Total Students']*100

#calculate overall passing rate by school
SS_raw['Overall Passing Rate']=(SS_raw['% Passing Math'] + SS_raw['% Passing Reading'])/2

#remove the columns for number of students passing math and reading
SS_final=SS_raw.drop(columns=['Total Stu Math Pass', 'Total Stu Reading Pass'])

#reorder the columns
columnsSS=SS_final.columns.tolist()
order=[0,1,2,5,3,4,6,7,8]
columnsSS=[ columnsSS[i] for i in order]
SS_final=SS_final[columnsSS]

#Formatting
SS_final["School Budget"]=SS_final["School Budget"].map(dollar_format)
SS_final["Budget per Student"]=SS_final["Budget per Student"].map(dollar_format)
SS_final.iloc[:,4:9]=SS_final.iloc[:,4:9].applymap(percent_format)

SS_final.head()

Unnamed: 0_level_0,School Type,Total Students,School Budget,Budget per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.05%,81.03%,66.68%,81.93%,74.31%
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.06%,83.98%,94.13%,97.04%,95.59%
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.71%,81.16%,65.99%,80.74%,73.36%
Ford High School,District,2739,"$1,763,916.00",$644.00,77.10%,80.75%,68.31%,79.30%,73.80%
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.35%,83.82%,93.39%,97.14%,95.27%


## Top Performing Schools (By Passing Rate)

Analysis: There is no correlation between high performing schools and school budget. However, the top performing schools were all either small (<1000) or medium schools (1000-3000 students)

In [4]:
SS_final_top=SS_final.sort_values(by='Overall Passing Rate', ascending=False)
SS_final_top.head()

Unnamed: 0_level_0,School Type,Total Students,School Budget,Budget per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.06%,83.98%,94.13%,97.04%,95.59%
Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.42%,83.85%,93.27%,97.31%,95.29%
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.35%,83.82%,93.39%,97.14%,95.27%
Pena High School,Charter,962,"$585,858.00",$609.00,83.84%,84.04%,94.59%,95.95%,95.27%
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.27%,83.99%,93.87%,96.54%,95.20%


## Bottom Performing Schools (By Passing Rate)

Analysis: Bottom performing schools all had high budgets per student and were all big in size (either on the upper end of the medium size range (1000-3000) or in the large size range (3000-5000).

In [5]:
SS_final_bottom=SS_final.sort_values(by='Overall Passing Rate', ascending=True)
SS_final_bottom.head()

Unnamed: 0_level_0,School Type,Total Students,School Budget,Budget per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.84%,80.74%,66.37%,80.22%,73.29%
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.71%,81.16%,65.99%,80.74%,73.36%
Huang High School,District,2917,"$1,910,635.00",$655.00,76.63%,81.18%,65.68%,81.32%,73.50%
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.07%,80.97%,66.06%,81.22%,73.64%
Ford High School,District,2739,"$1,763,916.00",$644.00,77.10%,80.75%,68.31%,79.30%,73.80%


## Math Scores by Grade

Analysis: math scores do not seem to improve from grade level to grade level at a given school. They are relatively consistent going from 9th to 10th to 11th to 12th grade.

In [6]:
# group merged data set by school name and grade
school_grade_group=merged_df.groupby(['school_name', 'grade'])

# get the average math score by school name and grade level
G_MS=school_grade_group['math_score'].mean()

# convert to dataframe
G_MS.to_frame()

#unstack the dataframe to turn the column values for grade level int0 column headers
G_MS_final=G_MS.unstack()

# rearrange the column headers to make "9th" grade come before "10th 11th 12th" grade
# put the column headers in a list 
cols=G_MS_final.columns.tolist()

# move the last column header in list to the first position in the list
cols=cols[-1:]+cols[:-1]

# reorder the dataframe as per column order set above
G_MS_final=G_MS_final[cols]

G_MS_final.head()


grade,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164


## Reading Score by Grade 

Analysis: reading scores do not seem to improve from grade level to grade level at a given school. They are relatively consistent going from 9th to 10th to 11th to 12th grade.

In [7]:
#pull the average reading scores by school and the 9th grade level and populate into dataframe
read9=merged_df[merged_df['grade']=='9th'].groupby('school_name').mean()
r9=read9["reading_score"]
r9=r9.to_frame('9th')

#pull the average reading scores by school and the 10th grade level and populate into dataframe
read10=merged_df[merged_df['grade']=='10th'].groupby('school_name').mean()
r10=read10["reading_score"]
r10=r10.to_frame('10th')

#pull the average reading scores by school and the 11th grade level and populate into dataframe
read11=merged_df[merged_df['grade']=='11th'].groupby('school_name').mean()
r11=read11["reading_score"]
r11=r11.to_frame('11th')

#pull the average reading scores by school and the 12th grade level and populate into dataframe
read12=merged_df[merged_df['grade']=='12th'].groupby('school_name').mean()
r12=read12["reading_score"]
r12=r12.to_frame('12th')

#Put your 9th, 10, 11th, 12th grade dataframes of average reading score by school name into a list of dataframes
dfs=[r9,r10,r11, r12]

#merge the dataframes on school name to create final table
G_RS_final = reduce(lambda left,right: pd.merge(left,right,on='school_name'), dfs)

G_RS_final.head()

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.303155,80.907183,80.945643,80.912451
Cabrera High School,83.676136,84.253219,83.788382,84.287958
Figueroa High School,81.198598,81.408912,80.640339,81.384863
Ford High School,80.632653,81.262712,80.403642,80.662338
Griffin High School,83.369193,83.706897,84.288089,84.013699


## Scores by School Spending

Analysis: Interestingly, the lower the budget per student of a school, the higher the overall passing rate.

In [8]:
#find min and max of Budget per Student to understand how to bin
print(SS_final["Budget per Student"].min())
print(SS_final["Budget per Student"].max())

$578.00
$655.00


In [9]:
# Sample bins.
spending_bins = [0, 585, 615, 645, 675]
group_names = ["<$585", "$585-615", "$615-645", "$645-675"]

#cut data and create new columns based on bins
SS_raw["Spending Ranges (Per Student)"]=pd.cut(SS_raw["Budget per Student"], spending_bins, labels=group_names)

#create a dictionary of functions that need to be performed on the SS_Raw to get school spending summary values:
agg_dict_2 = {'Average Math Score':np.mean,
            'Average Reading Score':np.mean,
            'Total Stu Math Pass':np.sum,
            'Total Stu Reading Pass':np.sum,
             'Total Students':np.sum
             }

#group by school spending category and apply functions in agg_dict 2
school_spending_raw = SS_raw.groupby(['Spending Ranges (Per Student)']).agg(agg_dict_2)

#calculate %Passing Math, %Passing Reading, Overall % Passing
school_spending_raw["%Passing Math"]=school_spending_raw["Total Stu Math Pass"]/school_spending_raw["Total Students"]*100
school_spending_raw["%Passing Reading"]=school_spending_raw["Total Stu Reading Pass"]/school_spending_raw["Total Students"]*100
school_spending_raw["Overall Passing"]=(school_spending_raw["%Passing Reading"]+school_spending_raw["%Passing Math"])/2

#drop unneeded columns from final dataframe
school_spending_final=school_spending_raw.drop(columns=['Total Stu Math Pass', 'Total Stu Reading Pass', 'Total Students'])

#formatting of final table
school_spending_final.iloc[:,0:5]=school_spending_final.iloc[:,0:5].applymap(percent_format)

school_spending_final.head()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,%Passing Math,%Passing Reading,Overall Passing
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,83.46%,83.93%,93.70%,96.69%,95.19%
$585-615,83.60%,83.89%,94.12%,95.89%,95.01%
$615-645,79.08%,81.89%,71.40%,83.61%,77.51%
$645-675,77.00%,81.03%,66.23%,81.11%,73.67%


## Scores by School Size

Analysis: The smaller the size of the school the better the overall passing rate.

In [10]:
#find min and max of Budget per Student to understand how to bin
print(SS_final["Total Students"].min())
print(SS_final["Total Students"].max())

427
4976


In [11]:
# Sample bins.
size_bins = [0, 1000, 3000, 5000]
group_names2 = ["Small (<1000)", "Medium (1000-3000)", "Large (3000-5000)"]

#cut data and create new columns based on bins
SS_raw["School Size"]=pd.cut(SS_raw["Total Students"], size_bins, labels=group_names2)

#group by school size category and apply functions in agg_dict 2
school_size_raw = SS_raw.groupby(['School Size']).agg(agg_dict_2)

#calculate %Passing Math, %Passing Reading, Overall % Passing
school_size_raw["%Passing Math"]=school_size_raw["Total Stu Math Pass"]/school_size_raw["Total Students"]*100
school_size_raw["%Passing Reading"]=school_size_raw["Total Stu Reading Pass"]/school_size_raw["Total Students"]*100
school_size_raw["Overall Passing"]=(school_size_raw["%Passing Reading"]+school_size_raw["%Passing Math"])/2

#drop unneeded columns from final dataframe
school_size_final=school_size_raw.drop(columns=['Total Stu Math Pass', 'Total Stu Reading Pass', 'Total Students'])

#formatting of final table
school_size_final.iloc[:,0:5]=school_size_final.iloc[:,0:5].applymap(percent_format)

school_size_final.head()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,%Passing Math,%Passing Reading,Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.82%,83.93%,93.95%,96.04%,95.00%
Medium (1000-3000),81.18%,82.93%,81.68%,89.52%,85.60%
Large (3000-5000),77.06%,80.92%,66.47%,81.11%,73.79%


## Scores by School Type

Analsyis: Charter schools are better performing than district schools

In [12]:
#group by school type category and apply functions in agg_dict 2
school_type_raw = SS_raw.groupby(['School Type']).agg(agg_dict_2)

#calculate %Passing Math, %Passing Reading, Overall % Passing
school_type_raw["%Passing Math"]=school_type_raw["Total Stu Math Pass"]/school_type_raw["Total Students"]*100
school_type_raw["%Passing Reading"]=school_type_raw["Total Stu Reading Pass"]/school_type_raw["Total Students"]*100
school_type_raw["Overall Passing"]=(school_type_raw["%Passing Reading"]+school_type_raw["%Passing Math"])/2

#drop unneeded columns from final dataframe
school_type_final=school_type_raw.drop(columns=['Total Stu Math Pass', 'Total Stu Reading Pass', 'Total Students'])

#formatting of final table
school_type_final.iloc[:,0:5]=school_type_final.iloc[:,0:5].applymap(percent_format)

school_type_final.head()


Unnamed: 0_level_0,Average Math Score,Average Reading Score,%Passing Math,%Passing Reading,Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.47%,83.90%,93.70%,96.65%,95.17%
District,76.96%,80.97%,66.52%,80.91%,73.71%
