In [2]:
# dependencies

import pandas as pd
import numpy as np
import csv

In [3]:
# Locations of the two raw CSV inputs, relative to the working directory.
csv_schools_path = "Resources/schools_complete.csv"
csv_students_path = "Resources/students_complete.csv"

# Load each file into its own DataFrame using pandas' default parsing options.
schools_complete_df = pd.read_csv(filepath_or_buffer=csv_schools_path)
students_complete_df = pd.read_csv(filepath_or_buffer=csv_students_path)

# Show (rows, columns) of the schools table as a quick load check.
schools_complete_df.shape

(15, 5)

In [4]:
# Preview the first rows of the raw schools table to check its columns.
schools_complete_df.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [5]:
# Rename the 'School ID' header to snake_case so it matches the style of the
# other column names; the raw frame is left untouched and a new frame is bound.
schools_clean_df = schools_complete_df.rename({"School ID": "school_id"}, axis="columns")

In [6]:
# Print column names, non-null counts and dtypes of the raw students table.
students_complete_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39170 entries, 0 to 39169
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Student ID     39170 non-null  int64 
 1   student_name   39170 non-null  object
 2   gender         39170 non-null  object
 3   grade          39170 non-null  object
 4   school_name    39170 non-null  object
 5   reading_score  39170 non-null  int64 
 6   math_score     39170 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 2.1+ MB


In [7]:
# Preview the first rows of the raw students table.
students_complete_df.head()


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [8]:
# Rename the 'Student ID' header to snake_case so it matches the style of the
# other column names; the raw frame is left untouched and a new frame is bound.
students_clean_df = students_complete_df.rename({"Student ID": "student_id"}, axis="columns")

In [9]:
# Combine the two tables: every student row gains the attributes of its
# school, matched on the shared 'school_name' column. 'inner' is pandas'
# default join type and is spelled out here only for readability.
pycityschools_df = pd.merge(
    left=students_clean_df,
    right=schools_clean_df,
    how="inner",
    on="school_name",
)
pycityschools_df.head()

Unnamed: 0,student_id,student_name,gender,grade,school_name,reading_score,math_score,school_id,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [10]:
# Number of distinct schools appearing in the merged table.
school_count = pycityschools_df["school_name"].nunique(dropna=False)

# Number of distinct students. Counted by student_id rather than by name so
# that different students who share a name are not collapsed into one.
student_count = pycityschools_df["student_id"].nunique(dropna=False)

# Total budget is summed from the schools table, not the merged table, where
# each school's budget is repeated on every one of its students' rows.
total_budget = schools_clean_df["budget"].sum()

# District-wide average math and reading scores, rounded to two decimals.
ave_math_score = round(pycityschools_df["math_score"].mean(), 2)
ave_reading_score = round(pycityschools_df["reading_score"].mean(), 2)

In [52]:
# Flag each student's math result: 'P' for a score of 60 or more, else 'F'.
# NOTE(review): the displayed district summary reports 100% passing reading
# with this cutoff - confirm that 60 is the intended passing score.
pycityschools_df["math_pass_fail"] = np.where(pycityschools_df["math_score"] >= 60, "P", "F")

# Same rule applied to the reading score.
pycityschools_df["reading_pass_fail"] = np.where(pycityschools_df["reading_score"] >= 60, "P", "F")

# Preview the merged table with the two new flag columns.
pycityschools_df.head(5)



Unnamed: 0,student_id,student_name,gender,grade,school_name,reading_score,math_score,school_id,type,size,budget,math_pass_fail,reading_pass_fail
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,P,P
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,P,P
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,P,P
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,F,P
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,P,P


In [67]:
# Overall pass/fail determination - a student passes overall only when they
# pass BOTH math and reading.

# Merged table re-indexed by math result. Retained unchanged because other
# cells may still reference it by name.
overall_pass_df = pycityschools_df.set_index("math_pass_fail")

# Count the rows where both flags are 'P'.
# The previous lookup, overall_pass_df.loc["P", "reading_pass_fail"], selected
# every student who passed math and returned their reading flag whatever its
# value, so its length was the math-pass count rather than the both-pass
# count (and it raised KeyError when no student passed math).
# Despite the '_list' suffix this is an integer count; the name is kept
# because later cells use it.
overall_pass_list = int(
    (
        (pycityschools_df["math_pass_fail"] == "P")
        & (pycityschools_df["reading_pass_fail"] == "P")
    ).sum()
)

In [70]:
# NOTE: despite the '_list' suffix, the three names below hold integer counts;
# the names are kept because later cells use them.

# Count of students passing math (summing a boolean mask counts its True rows).
math_pass_list = int((pycityschools_df["math_pass_fail"] == "P").sum())

# Count of students passing reading.
reading_pass_list = int((pycityschools_df["reading_pass_fail"] == "P").sum())

# Count of overall passing - must pass both reading & math.
# The previous lookup on the math-indexed frame,
# overall_pass_df.loc["P", "reading_pass_fail"], returned one entry per math
# passer regardless of the reading flag, so it simply repeated the math count.
# Both conditions are now required explicitly.
overall_pass_list = int(
    (
        (pycityschools_df["math_pass_fail"] == "P")
        & (pycityschools_df["reading_pass_fail"] == "P")
    ).sum()
)


In [71]:
# Share of all students passing math, reading and both, each expressed as a
# percentage of student_count and rounded to two decimals. The generator
# yields results in the same order as the counts, so each lands in its
# matching name.
percentage_pass_math, percentage_pass_reading, percentage_pass_overall = (
    round(passed / student_count * 100, 2)
    for passed in (math_pass_list, reading_pass_list, overall_pass_list)
)


In [79]:
# Key metrics table: a single record holding the district-wide figures
# computed above, turned into a one-row DataFrame for display.
district_data = [
    {
        "Total Schools": school_count,
        "Total Students": student_count,
        "Total Budget": total_budget,
        "Ave Math Score": ave_math_score,
        "Ave Reading Score": ave_reading_score,
        "% Passing Math": percentage_pass_math,
        "% Passing Reading": percentage_pass_reading,
        "% Overall Passing": percentage_pass_overall,
    }
]
data_summary_df = pd.DataFrame(district_data)
data_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Ave Math Score,Ave Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.99,81.88,92.45,100.0,92.45
