In [1]:
# dependencies
import pandas as pd
import numpy as np
import csv

In [2]:
# Relative locations of the two raw input files.
csv_schools_path = "Resources/schools_complete.csv"
csv_students_path = "Resources/students_complete.csv"

# Read both files into raw dataframes; later cells derive cleaned copies from these.
schools_complete_df, students_complete_df = (
    pd.read_csv(path) for path in (csv_schools_path, csv_students_path)
)

# Quick sanity check on the school file: (rows, columns).
schools_complete_df.shape

(15, 5)

In [3]:
#Clean up school dataframe:
# Derive the cleaned frame in a single chain from the raw frame rather than
# overwriting schools_complete_df. Overwriting the raw frame made this cell
# fail with a KeyError when re-run, because 'School ID' had already been dropped.
schools_clean_df = (
    schools_complete_df
    .drop(columns='School ID')         # identifier column is not used in the analysis
    .sort_values(by=['school_name'])   # alphabetical school order
    .set_index('school_name')          # school_name becomes the index (and merge key)
)

#Rename column header for consistency with the other snake_case column names
students_clean_df = students_complete_df.rename(columns={'Student ID': 'student_id'})

In [4]:
#Combine school attributes with the student records.
# 'school_name' is the index of schools_clean_df and a column of students_clean_df;
# the default (inner) join is used.
pycityschools_df = schools_clean_df.merge(students_clean_df, on="school_name")


In [5]:
#District-wide headline figures.

#Number of distinct schools in the merged data:
school_count = len(pycityschools_df['school_name'].unique())

#Number of distinct students
    #(counted by student_id, since student names can repeat)
student_count = len(pycityschools_df['student_id'].unique())

#Total budget, summed over the one-row-per-school frame so that each
#school's budget is counted exactly once:
total_budget = schools_clean_df['budget'].sum(axis=0)

#Calculate average scores for math and reading (rounded to 2 decimals)
ave_math_score = round(pycityschools_df['math_score'].mean(), 2)
ave_reading_score = round(pycityschools_df['reading_score'].mean(), 2)

In [7]:
#Pass/fail determination at district level (a score of 70 or more is a pass).

# Boolean row masks, one per subject, reused below.
math_pass_mask = pycityschools_df['math_score'] >= 70
reading_pass_mask = pycityschools_df['reading_score'] >= 70

#Count of students passing math, and as a percentage of all students:
math_pass_list = pycityschools_df.loc[math_pass_mask, 'math_score'].count()
percentage_pass_math = round(math_pass_list / student_count * 100, 2)

#Count of students passing reading, and as a percentage of all students:
reading_pass_list = pycityschools_df.loc[reading_pass_mask, 'reading_score'].count()
percentage_pass_reading = round(reading_pass_list / student_count * 100, 2)

#Count of overall passing - must pass both reading & math:
overall_pass_list = pycityschools_df.loc[math_pass_mask & reading_pass_mask, 'student_id'].count()
percentage_pass_overall = round(overall_pass_list / student_count * 100, 2)


In [8]:
#Key metrics table - district level
# A single record (one dict) becomes a one-row dataframe; the dict keys are
# the column headers, in the order written here.
district_data = [
    {
        'Total Schools': school_count,
        'Total Students': student_count,
        'Total Budget': total_budget,
        'Ave Math Score': ave_math_score,
        'Ave Reading Score': ave_reading_score,
        '% Passing Math': percentage_pass_math,
        '% Passing Reading': percentage_pass_reading,
        '% Overall Passing': percentage_pass_overall,
    }
]
data_district_summary_df = pd.DataFrame(data=district_data)
data_district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Ave Math Score,Ave Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.99,81.88,74.98,85.81,65.17


In [9]:
# Group the merged student rows by school. Despite the _df suffix, the result
# is a pandas GroupBy object, which the school summary cell aggregates per column.
school_data_df = (
    pycityschools_df
    .set_index('school_name')
    .groupby(by=['school_name'])
)


In [11]:
#School Summary:
# Every metric below is a Series indexed by school_name. They are assembled
# into the summary table in ONE step with explicit column labels.
# The previous chain of pd.merge calls joined several Series that were all
# named 'student_id'; that produced colliding 'student_id_x'/'student_id_y'
# suffix columns (a MergeError on recent pandas) and relied on positional
# column renaming afterwards.
# The intermediate *_df names are kept so that later cells can still use them.


def count_passing_by_school(passed):
    """Count students per school among the rows where `passed` is True.

    `passed` is a boolean Series aligned with pycityschools_df.
    A school with no passing rows is reported as 0 instead of being absent
    (which would otherwise turn into NaN in the percentage columns).
    """
    return (
        pycityschools_df[passed]
        .groupby('school_name')['student_id']
        .count()
        .reindex(school_student.index, fill_value=0)
    )


#Type of school (taken from the first row of each school's group):
school_type=school_data_df['type'].first()
school_type_df=pd.DataFrame(school_type)

#Students per school:
school_student=school_data_df['student_id'].count()
school_student_df=pd.DataFrame(school_student)

#School budget (taken from the first row of each school's group):
school_budget=school_data_df['budget'].first()
school_budget_df=pd.DataFrame(school_budget)

#Budget per student per school:
student_budget=school_budget/school_student
student_budget_df=pd.DataFrame(student_budget)

#Average math score per school:
school_ave_math=round(school_data_df['math_score'].mean(),2)
school_ave_math_df=pd.DataFrame(school_ave_math)

#Average reading score per school:
school_ave_reading=round(school_data_df['reading_score'].mean(),2)
school_ave_reading_df=pd.DataFrame(school_ave_reading)

#% of students passing math per school (a score of 70 or more is a pass):
school_math_perc=round(count_passing_by_school(pycityschools_df['math_score']>=70)/school_student*100,2)
school_math_perc_df=pd.DataFrame(school_math_perc)

#% of students passing reading per school:
school_reading_perc=round(count_passing_by_school(pycityschools_df['reading_score']>=70)/school_student*100,2)
school_reading_perc_df=pd.DataFrame(school_reading_perc)

#% of students passing overall per school - must pass both reading & math:
overall_pass_count=count_passing_by_school(
    (pycityschools_df['math_score']>=70) & (pycityschools_df['reading_score']>=70)
)
overall_pass_perc=round((overall_pass_count/school_student)*100,2)
overall_pass_perc_df=pd.DataFrame(overall_pass_perc)

#Assemble the summary: dict keys are the display column headers, in order.
school_summary_df=pd.DataFrame({
    'School Type': school_type,
    'Total Students': school_student,
    'Total School Budget': school_budget,
    'Per Student Budget': student_budget,
    'Ave Math Score': school_ave_math,
    'Ave Reading Score': school_ave_reading,
    '% Passing Math': school_math_perc,
    '% Passing Reading': school_reading_perc,
    '% Overall Passing': overall_pass_perc,
}).rename_axis('School Name')

school_summary_df

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Ave Math Score,Ave Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,3124928,628.0,77.05,81.03,66.68,81.93,54.64
Cabrera High School,Charter,1858,1081356,582.0,83.06,83.98,94.13,97.04,91.33
Figueroa High School,District,2949,1884411,639.0,76.71,81.16,65.99,80.74,53.2
Ford High School,District,2739,1763916,644.0,77.1,80.75,68.31,79.3,54.29
Griffin High School,Charter,1468,917500,625.0,83.35,83.82,93.39,97.14,90.6
Hernandez High School,District,4635,3022020,652.0,77.29,80.93,66.75,80.86,53.53
Holden High School,Charter,427,248087,581.0,83.8,83.81,92.51,96.25,89.23
Huang High School,District,2917,1910635,655.0,76.63,81.18,65.68,81.32,53.51
Johnson High School,District,4761,3094650,650.0,77.07,80.97,66.06,81.22,53.54
Pena High School,Charter,962,585858,609.0,83.84,84.04,94.59,95.95,90.54


In [13]:
#Top 5 performing schools: highest '% Overall Passing' first, keep the first five rows.
(
    school_summary_df
    .sort_values(by='% Overall Passing', ascending=False)
    .iloc[:5]
)


Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Ave Math Score,Ave Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,1081356,582.0,83.06,83.98,94.13,97.04,91.33
Thomas High School,Charter,1635,1043130,638.0,83.42,83.85,93.27,97.31,90.95
Griffin High School,Charter,1468,917500,625.0,83.35,83.82,93.39,97.14,90.6
Wilson High School,Charter,2283,1319574,578.0,83.27,83.99,93.87,96.54,90.58
Pena High School,Charter,962,585858,609.0,83.84,84.04,94.59,95.95,90.54


In [14]:
#Bottom 5 performing schools: lowest '% Overall Passing' first, keep the first five rows.
(
    school_summary_df
    .sort_values(by='% Overall Passing', ascending=True)
    .iloc[:5]
)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Ave Math Score,Ave Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,2547363,637.0,76.84,80.74,66.37,80.22,52.99
Figueroa High School,District,2949,1884411,639.0,76.71,81.16,65.99,80.74,53.2
Huang High School,District,2917,1910635,655.0,76.63,81.18,65.68,81.32,53.51
Hernandez High School,District,4635,3022020,652.0,77.29,80.93,66.75,80.86,53.53
Johnson High School,District,4761,3094650,650.0,77.07,80.97,66.06,81.22,53.54


In [26]:
# Display the merged school + student dataframe for inspection.
# NOTE(review): this renders the whole frame (pandas truncates the display) and
# the rendered output includes the student_name column; consider showing
# pycityschools_df.head() instead and clearing the output before sharing.
pycityschools_df

Unnamed: 0,school_name,type,size,budget,student_id,student_name,gender,grade,reading_score,math_score
0,Bailey High School,District,4976,3124928,17871,Blake Martin,M,9th,75,59
1,Bailey High School,District,4976,3124928,17872,Kathryn Kane,F,12th,84,58
2,Bailey High School,District,4976,3124928,17873,Richard Haas,M,11th,79,86
3,Bailey High School,District,4976,3124928,17874,Frank Marsh,M,9th,71,89
4,Bailey High School,District,4976,3124928,17875,Charles Goodman Jr.,M,9th,90,61
...,...,...,...,...,...,...,...,...,...,...
39165,Wright High School,Charter,1800,1049400,26031,Lauren Duncan,F,12th,90,82
39166,Wright High School,Charter,1800,1049400,26032,Heather Stuart,F,11th,94,95
39167,Wright High School,Charter,1800,1049400,26033,Elizabeth Robbins,F,12th,69,99
39168,Wright High School,Charter,1800,1049400,26034,John Mann,M,11th,73,91


In [33]:
#Re-index the merged dataframe by grade and group by grade level.
# Despite the _df suffix, the result is a pandas GroupBy object; the cells
# below take per-grade means from it.
scores_by_grade_df = (
    pycityschools_df
    .set_index('grade')
    .groupby(by=['grade'])
)

In [38]:
#Average math score for each grade level.
math_scores_grade = scores_by_grade_df['math_score'].mean()

# Same values as a one-column dataframe (column 'math_score') for display.
math_scores_grade_df = math_scores_grade.to_frame()
math_scores_grade_df

Unnamed: 0_level_0,math_score
grade,Unnamed: 1_level_1
10th,78.941483
11th,79.083548
12th,78.993164
9th,78.935659


In [40]:
#Average reading score for each grade level.
reading_scores_grade = scores_by_grade_df['reading_score'].mean()

# Same values as a one-column dataframe (column 'reading_score') for display.
reading_scores_grade_df = reading_scores_grade.to_frame()
reading_scores_grade_df

Unnamed: 0_level_0,reading_score
grade,Unnamed: 1_level_1
10th,81.87441
11th,81.885714
12th,81.819851
9th,81.914358
