In [1]:
# dependencies
import pandas as pd
import numpy as np
import csv

In [2]:
# Locations of the two raw input files, relative to the notebook.
csv_schools_path = "Resources/schools_complete.csv"
csv_students_path = "Resources/students_complete.csv"

# Read each CSV into its own DataFrame (schools first, then students).
schools_complete_df, students_complete_df = (
    pd.read_csv(csv_path) for csv_path in (csv_schools_path, csv_students_path)
)

# Quick sanity check on the size of the schools table.
schools_complete_df.shape

(15, 5)

In [3]:
# Clean up the school DataFrame.
# errors='ignore' turns the drop into a no-op when 'School ID' has already been
# removed, so re-running this cell no longer raises KeyError.
schools_complete_df = schools_complete_df.drop(columns='School ID', errors='ignore')

# Sort alphabetically by school and use the school name as the index.
schools_clean_df = (
    schools_complete_df
    .sort_values(by=['school_name'])
    .set_index('school_name')
)

# Rename the student id column so it matches the snake_case style of the other columns.
students_clean_df = students_complete_df.rename(columns={'Student ID': 'student_id'})

In [4]:
# Attach each school's attributes to every one of its student rows,
# matching the two tables on the school name (default join type).
pycityschools_df = pd.merge(
    left=schools_clean_df,
    right=students_clean_df,
    on="school_name",
)
pycityschools_df.head()

Unnamed: 0,school_name,type,size,budget,student_id,student_name,gender,grade,reading_score,math_score
0,Bailey High School,District,4976,3124928,17871,Blake Martin,M,9th,75,59
1,Bailey High School,District,4976,3124928,17872,Kathryn Kane,F,12th,84,58
2,Bailey High School,District,4976,3124928,17873,Richard Haas,M,11th,79,86
3,Bailey High School,District,4976,3124928,17874,Frank Marsh,M,9th,71,89
4,Bailey High School,District,4976,3124928,17875,Charles Goodman Jr.,M,9th,90,61


In [5]:
# Number of distinct schools.
# dropna=False keeps a missing value in the count, exactly as len(unique()) would.
school_count = pycityschools_df["school_name"].nunique(dropna=False)

# Number of distinct students; counted by student_id rather than by name
# so that students who share a name are not collapsed together.
student_count = pycityschools_df["student_id"].nunique(dropna=False)

# Total budget, summed over the per-school table (one row per school)
# rather than the merged table, where the budget repeats on every student row.
total_budget = schools_clean_df["budget"].sum()

# Calculate average scores for math and reading, rounded to two decimals.
ave_math_score = round(pycityschools_df["math_score"].mean(), 2)
ave_reading_score = round(pycityschools_df["reading_score"].mean(), 2)


In [6]:
# Row-wise pass flags: a score of 70 or more counts as passing.
passed_math = pycityschools_df['math_score'] >= 70
passed_reading = pycityschools_df['reading_score'] >= 70

# Students passing math (despite the `_list` suffix, this is a scalar count).
math_pass_list = pycityschools_df.loc[passed_math, 'math_score'].count()
percentage_pass_math = round(math_pass_list / student_count * 100, 2)

# Students passing reading.
reading_pass_list = pycityschools_df.loc[passed_reading, 'reading_score'].count()
percentage_pass_reading = round(reading_pass_list / student_count * 100, 2)

# Overall passing - a student must pass both reading and math.
overall_pass_list = pycityschools_df.loc[passed_math & passed_reading, 'student_id'].count()
percentage_pass_overall = round(overall_pass_list / student_count * 100, 2)
overall_pass_list

25528

In [7]:
# Key metrics table - district level.
# The metrics are gathered into one record; wrapping that record in a list
# yields a single-row DataFrame with one column per metric.
district_record = {
    'Total Schools': school_count,
    'Total Students': student_count,
    'Total Budget': total_budget,
    'Ave Math Score': ave_math_score,
    'Ave Reading Score': ave_reading_score,
    '% Passing Math': percentage_pass_math,
    '% Passing Reading': percentage_pass_reading,
    '% Overall Passing': percentage_pass_overall,
}
district_data = [district_record]
data_district_summary_df = pd.DataFrame(district_data)
data_district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Ave Math Score,Ave Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.99,81.88,74.98,85.81,65.17


In [20]:
# Index the merged table by school, then group on that index level.
# Note: despite the `_df` suffix, school_data_df is a GroupBy object, not a DataFrame.
students_indexed_by_school = pycityschools_df.set_index('school_name')
school_data_df = students_indexed_by_school.groupby(['school_name'])

# On a GroupBy, head() shows the first five rows of every school.
school_data_df.head()

Unnamed: 0_level_0,type,size,budget,student_id,student_name,gender,grade,reading_score,math_score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,3124928,17871,Blake Martin,M,9th,75,59
Bailey High School,District,4976,3124928,17872,Kathryn Kane,F,12th,84,58
Bailey High School,District,4976,3124928,17873,Richard Haas,M,11th,79,86
Bailey High School,District,4976,3124928,17874,Frank Marsh,M,9th,71,89
Bailey High School,District,4976,3124928,17875,Charles Goodman Jr.,M,9th,90,61
...,...,...,...,...,...,...,...,...,...
Wright High School,Charter,1800,1049400,24236,Aaron Johnson,M,10th,89,72
Wright High School,Charter,1800,1049400,24237,Kimberly Hamilton,F,10th,84,93
Wright High School,Charter,1800,1049400,24238,Ashley Johns,F,10th,88,88
Wright High School,Charter,1800,1049400,24239,Stephanie Donovan,F,10th,75,84


In [26]:
# School level dataset: one value per school, derived from the per-school groups.

# Type and budget repeat on every student row of a school, so the first value is taken.
school_type = school_data_df['type'].first()
school_budget = school_data_df['budget'].first()

# Enrollment per school and the resulting budget per student.
school_student = school_data_df['student_id'].count()
student_budget = school_budget / school_student

# Average math and reading scores per school, rounded to two decimals.
school_ave_math, school_ave_reading = (
    round(school_data_df[score_column].mean(), 2)
    for score_column in ('math_score', 'reading_score')
)
school_ave_reading

school_name
Bailey High School       81.03
Cabrera High School      83.98
Figueroa High School     81.16
Ford High School         80.75
Griffin High School      83.82
Hernandez High School    80.93
Holden High School       83.81
Huang High School        81.18
Johnson High School      80.97
Pena High School         84.04
Rodriguez High School    80.74
Shelton High School      83.73
Thomas High School       83.85
Wilson High School       83.99
Wright High School       83.96
Name: reading_score, dtype: float64

In [46]:
# School level pass rates (a score of 70 or more counts as passing).
def _passing_count_by_school(passed_mask):
    """Count passing students per school, reporting 0 for schools with none.

    passed_mask is a boolean Series aligned with the rows of pycityschools_df.
    Grouping only the filtered rows leaves out any school without a single
    passing student, and that gap would become NaN once divided by
    school_student. Reindexing against school_student's index restores every
    school and fills the missing ones with 0.
    """
    passing_counts = (
        pycityschools_df[passed_mask]
        .groupby('school_name')['student_id']
        .count()
    )
    return passing_counts.reindex(school_student.index, fill_value=0)


passed_math = pycityschools_df['math_score'] >= 70
passed_reading = pycityschools_df['reading_score'] >= 70

# Percentage of each school's students passing math / reading.
school_math_perc = round(_passing_count_by_school(passed_math) / school_student * 100, 2)
school_reading_perc = round(_passing_count_by_school(passed_reading) / school_student * 100, 2)

# Overall passing - a student must pass both math and reading.
overall_pass_count = _passing_count_by_school(passed_math & passed_reading)
overall_pass_perc = round(overall_pass_count / school_student * 100, 2)
overall_pass_perc



school_name
Bailey High School       54.64
Cabrera High School      91.33
Figueroa High School     53.20
Ford High School         54.29
Griffin High School      90.60
Hernandez High School    53.53
Holden High School       89.23
Huang High School        53.51
Johnson High School      53.54
Pena High School         90.54
Rodriguez High School    52.99
Shelton High School      89.89
Thomas High School       90.95
Wilson High School       90.58
Wright High School       90.33
Name: student_id, dtype: float64

In [None]:
# School Summary: one row per school, one column per metric.
# Every metric below is a Series indexed by school_name, so they are passed as a
# dict of columns and pandas aligns them on that shared index. (Wrapping the
# dict in a list would instead produce a single row whose cells each hold an
# entire Series.)
school_summary_data = {
    'School Type': school_type,
    'Total Students': school_student,
    'Total School Budget': school_budget,
    'Per Student Budget': student_budget,
    'Ave Math Score': school_ave_math,
    'Ave Reading Score': school_ave_reading,
    '% Passing Math': school_math_perc,
    '% Passing Reading': school_reading_perc,
    # Previously referenced the undefined name overall_pass_school (NameError).
    '% Overall Passing': overall_pass_perc,
}
school_summary_df = pd.DataFrame(school_summary_data)
school_summary_df

