In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# File to Load (Remember to Change These)
school_data_to_load = Path("Resources/schools_complete.csv")
student_data_to_load = Path("Resources/students_complete.csv")

# Read School and Student Data File and store into Pandas DataFrames
school_data_df = pd.read_csv(school_data_to_load)
student_data_df = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
school_data_complete_df = pd.merge(student_data_df, school_data_df, how="left", on=["school_name", "school_name"])
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,year,school_name,reading_score,maths_score,School ID,type,size,budget
0,0,Paul Bradley,M,9,Huang High School,96,94,0,Government,2917,1910635
1,1,Victor Smith,M,12,Huang High School,90,43,0,Government,2917,1910635
2,2,Kevin Rodriguez,M,12,Huang High School,41,76,0,Government,2917,1910635
3,3,Richard Scott,M,12,Huang High School,89,86,0,Government,2917,1910635
4,4,Bonnie Ray,F,9,Huang High School,87,69,0,Government,2917,1910635


In [3]:
total_students = len(school_data_complete_df["student_name"])
unique_school = school_data_complete_df["school_name"].nunique()
school_type = school_data_complete_df["type"].unique()
total_budget = int(school_data_complete_df["budget"].unique().sum())


print(f"\nTotal number of students are: {total_students}\n")
print(f"Total number of unique school: {unique_school}\n")
print(f"The total budget is: $ {total_budget}\n")
print(f"There are {len(school_type)} types of school. They are:")
for school in school_type:
    print(school)





Total number of students are: 39170

Total number of unique school: 15

The total budget is: $ 24649428

There are 2 types of school. They are:
Government
Independent


In [4]:
#find average math score
total_math_score = school_data_complete_df["maths_score"].sum()
average_math_score = total_math_score / total_students
format_average_math_score = "{:.2f}".format(average_math_score)

print(f"The average math score is: {format_average_math_score}")

The average math score is: 70.34


In [5]:
#finding average reading score
total_reading_score = school_data_complete_df["reading_score"].sum()
average_reading_score = total_reading_score / total_students
format_average_reading_score = "{:.2f}".format(average_reading_score)

print(f"The average reading score is: {format_average_reading_score}")

The average reading score is: 69.98


In [6]:
# % students passing math (passing grade >= 50)
math_passing_grade = school_data_complete_df.loc[school_data_complete_df["maths_score"] >= 50] 
number_students_passing_math = len(math_passing_grade)

average_passing_math = number_students_passing_math / total_students * 100
format_passing_math = "{:.2f}".format(average_passing_math)
print(format_passing_math + "%")

86.08%


In [7]:
# % students passing reading (passing grade >= 50)
reading_passing_grade = school_data_complete_df.loc[school_data_complete_df["reading_score"] >= 50] 
number_students_passing_reading = len(reading_passing_grade)

average_passing_reading = number_students_passing_reading / total_students * 100
format_passing_reading = "{:.2f}".format(average_passing_reading)
print(format_passing_reading + "%")

84.43%


In [8]:
#% overall passing both subjects
#create a condition to find how mamy students pass both subjets
#in this case, math and reading >= 50 is considered True
#if either one is < 50, it returns False
#create a temperary column to store the boolean result into the DataFrame
school_data_complete_df["Overall Passing Rate"] = (school_data_complete_df["reading_score"]>=50)&(
    school_data_complete_df["maths_score"]>= 50)

#find the total number of students who passed both subjects
#boolean value == True
pass_both_subjects = len(school_data_complete_df[school_data_complete_df["Overall Passing Rate"] == True])
percent_passing_both_subjects = pass_both_subjects / total_students * 100
format_percent_passing_both_subjects = "{:.2f}".format(percent_passing_both_subjects)


print(f"Total number of students that pass both subjects: {pass_both_subjects}")
print(f"The overall percentage of students passing both subjects: {format_percent_passing_both_subjects}%")


Total number of students that pass both subjects: 28519
The overall percentage of students passing both subjects: 72.81%


In [9]:
#create a summary table

LGA_summary_df = pd.DataFrame({
    "Total Students": [total_students],
    "Total Budget": [total_budget],
    "Average Math Score": [format_average_math_score],
    "Average Reading Score": [format_average_reading_score],
    "% Passing Math": [format_passing_math],
    "% Passing Reading": [format_passing_reading],
    "% Overall Passing Rate": [format_percent_passing_both_subjects]
})

LGA_summary_df

Unnamed: 0,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,39170,24649428,70.34,69.98,86.08,84.43,72.81


In [10]:
#how="inner" returns only the intersection of the two DataFrames
school_summary = pd.merge(student_data_df, school_data_df, how="inner", on=["school_name", "school_name"])

#drop unnecessary columns from DataFrame 
# .drop(["column name"], axis=1 specifies which column to be removed
school_summary_new_metric = school_summary.drop(["Student ID", "gender", "year", "School ID"], axis=1)
school_summary_new_metric

Unnamed: 0,student_name,school_name,reading_score,maths_score,type,size,budget
0,Paul Bradley,Huang High School,96,94,Government,2917,1910635
1,Victor Smith,Huang High School,90,43,Government,2917,1910635
2,Kevin Rodriguez,Huang High School,41,76,Government,2917,1910635
3,Richard Scott,Huang High School,89,86,Government,2917,1910635
4,Bonnie Ray,Huang High School,87,69,Government,2917,1910635
...,...,...,...,...,...,...,...
39165,Donna Howard,Thomas High School,51,48,Independent,1635,1043130
39166,Dawn Bell,Thomas High School,81,89,Independent,1635,1043130
39167,Rebecca Tanner,Thomas High School,99,99,Independent,1635,1043130
39168,Desiree Kidd,Thomas High School,72,77,Independent,1635,1043130


In [11]:
#finding school name and school types
unqiue_school_type_name = school_summary_new_metric.groupby("school_name")["type"].unique()

#convert this into a dataframe with column and row; 
# index is the school name and values are the type of school
schools = pd.DataFrame(
    {"school_name": unqiue_school_type_name.index,
     "type" : unqiue_school_type_name.values
})

schools = schools.rename(columns={"type" : "School Type", "schools_name": "School Name"})
schools

Unnamed: 0,school_name,School Type
0,Bailey High School,[Government]
1,Cabrera High School,[Independent]
2,Figueroa High School,[Government]
3,Ford High School,[Government]
4,Griffin High School,[Independent]
5,Hernandez High School,[Government]
6,Holden High School,[Independent]
7,Huang High School,[Government]
8,Johnson High School,[Government]
9,Pena High School,[Independent]


In [12]:
#finding total number of students for each school
school_name_count = school_summary_new_metric.groupby("school_name")["student_name"].count().astype(int)
# print(school_name_count)

#convert this into a dataframe with column and row; 
# index is the school name and values are the total students for that school
school_name_count = pd.DataFrame({
    "school_name" : school_name_count.index,
    "student_name" : school_name_count.values
})

total_school_students = school_name_count.rename(columns={"student_name" : "Total Students"})
total_school_students

Unnamed: 0,school_name,Total Students
0,Bailey High School,4976
1,Cabrera High School,1858
2,Figueroa High School,2949
3,Ford High School,2739
4,Griffin High School,1468
5,Hernandez High School,4635
6,Holden High School,427
7,Huang High School,2917
8,Johnson High School,4761
9,Pena High School,962


In [13]:
#finding budget for each school
school_budget = school_summary_new_metric.groupby("school_name")["budget"].unique()

#convert this into a dataframe with column and row; 
# index is the school name and values are the school budget
school_budget = pd.DataFrame({
    "school_name" : school_budget.index,
    "budget" : school_budget.values 
})

school_budget["budget"] = school_budget["budget"].str.get(0)
school_budget = school_budget.rename(columns={"budget" : "Total School Budget"})
school_budget

Unnamed: 0,school_name,Total School Budget
0,Bailey High School,3124928
1,Cabrera High School,1081356
2,Figueroa High School,1884411
3,Ford High School,1763916
4,Griffin High School,917500
5,Hernandez High School,3022020
6,Holden High School,248087
7,Huang High School,1910635
8,Johnson High School,3094650
9,Pena High School,585858


In [14]:
#finding per student budget
#merge the dataframe of school budget and school name and total students
# [school_budget] [schools] [total_school_students] 
#can only merge 2 dataframe at a time

merge_schools_total_school_students = pd.merge(schools, total_school_students, 
                                               how="inner", on=["school_name" , "school_name"])
merge_schools_total_school_students_school_budget = pd.merge(merge_schools_total_school_students,
                                                             school_budget, how="inner", 
                                                             on=["school_name", "school_name"])

merge_school_budget = merge_schools_total_school_students_school_budget 


merge_school_budget = merge_school_budget.rename(columns={"Total School Budget_x" : "Total School Budget"})
merge_school_budget = merge_school_budget.rename(columns={"Total School Budget_y" : "Total Students"})
merge_school_budget["Per Student Budget"] = merge_school_budget["Total School Budget"]/merge_school_budget["Total Students"]
merge_school_budget





Unnamed: 0,school_name,School Type,Total Students,Total School Budget,Per Student Budget
0,Bailey High School,[Government],4976,3124928,628.0
1,Cabrera High School,[Independent],1858,1081356,582.0
2,Figueroa High School,[Government],2949,1884411,639.0
3,Ford High School,[Government],2739,1763916,644.0
4,Griffin High School,[Independent],1468,917500,625.0
5,Hernandez High School,[Government],4635,3022020,652.0
6,Holden High School,[Independent],427,248087,581.0
7,Huang High School,[Government],2917,1910635,655.0
8,Johnson High School,[Government],4761,3094650,650.0
9,Pena High School,[Independent],962,585858,609.0


In [15]:
#average math score for each schools
schools_ave_math_score = school_summary_new_metric.groupby("school_name")["maths_score"].mean().round(2)

#convert this into a dataframe with column and row; 
# index is the school name and values are the average no. of students passing math
schools_ave_math_score = pd.DataFrame({
    "school_name" : schools_ave_math_score.index,
    "Average Math Score" : schools_ave_math_score.values
})

schools_ave_math_score
merge_school_budget_math_score = pd.merge(merge_school_budget, schools_ave_math_score, how="inner", on=["school_name", "school_name"])
merge_school_budget_math_score


Unnamed: 0,school_name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score
0,Bailey High School,[Government],4976,3124928,628.0,72.35
1,Cabrera High School,[Independent],1858,1081356,582.0,71.66
2,Figueroa High School,[Government],2949,1884411,639.0,68.7
3,Ford High School,[Government],2739,1763916,644.0,69.09
4,Griffin High School,[Independent],1468,917500,625.0,71.79
5,Hernandez High School,[Government],4635,3022020,652.0,68.87
6,Holden High School,[Independent],427,248087,581.0,72.58
7,Huang High School,[Government],2917,1910635,655.0,68.94
8,Johnson High School,[Government],4761,3094650,650.0,68.84
9,Pena High School,[Independent],962,585858,609.0,72.09


In [16]:
#average reading score for each schools
schools_ave_reading_score = school_summary_new_metric.groupby("school_name")["reading_score"].mean().round(2)

#convert this into a dataframe with column and row; 
# index is the school name and values are the average no. of students passing reading
schools_ave_reading_score = pd.DataFrame({
    "school_name" : schools_ave_reading_score.index,
    "Average Reading Score" : schools_ave_reading_score.values
})

schools_ave_reading_score
merge_school_budget_math_reading_score = pd.merge(merge_school_budget_math_score, schools_ave_reading_score, 
                                                  how="inner", on=["school_name", "school_name"])

merged = merge_school_budget_math_reading_score
merged = merged.rename(columns={"Average Math Score_x" : "Average Math Score"})
merged = merged.rename(columns={"Average Reading Score_y" : "Average Reading Score"})
merged


Unnamed: 0,school_name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score
0,Bailey High School,[Government],4976,3124928,628.0,72.35,71.01
1,Cabrera High School,[Independent],1858,1081356,582.0,71.66,71.36
2,Figueroa High School,[Government],2949,1884411,639.0,68.7,69.08
3,Ford High School,[Government],2739,1763916,644.0,69.09,69.57
4,Griffin High School,[Independent],1468,917500,625.0,71.79,71.25
5,Hernandez High School,[Government],4635,3022020,652.0,68.87,69.19
6,Holden High School,[Independent],427,248087,581.0,72.58,71.66
7,Huang High School,[Government],2917,1910635,655.0,68.94,68.91
8,Johnson High School,[Government],4761,3094650,650.0,68.84,69.04
9,Pena High School,[Independent],962,585858,609.0,72.09,71.61


In [17]:
#finding % passing math for each school
#use .loc to filter the dataframe
math_passing_schools = school_summary_new_metric.loc[school_summary_new_metric["maths_score"] >= 50]
math_passing_schools = math_passing_schools.groupby("school_name")["student_name"].count()

#convert this into a dataframe with column and row; 
# index is the school name and values are the number of students passing math
math_passing_schools = pd.DataFrame({
    "school_name" : math_passing_schools.index,
    "Total Students Passing Math" : math_passing_schools.values
})

# #merge this dataframe with merged dataframe to get Total Students column 
# #this allows to calculate % passing math for each school

merge_math_passing_schools = pd.merge(merged, math_passing_schools, how="inner", on=["school_name", "school_name"])
merge_math_passing_schools["% Passing Maths"] = (merge_math_passing_schools["Total Students Passing Math"]/
                              merge_math_passing_schools["Total Students"])*100

#remove ["Total Students Passing Math"] column from DataFrame
merged_math_scores = merge_math_passing_schools.drop(["Total Students Passing Math"],axis=1)
merged_math_scores



Unnamed: 0,school_name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Maths
0,Bailey High School,[Government],4976,3124928,628.0,72.35,71.01,91.639871
1,Cabrera High School,[Independent],1858,1081356,582.0,71.66,71.36,90.850377
2,Figueroa High School,[Government],2949,1884411,639.0,68.7,69.08,81.654798
3,Ford High School,[Government],2739,1763916,644.0,69.09,69.57,82.438846
4,Griffin High School,[Independent],1468,917500,625.0,71.79,71.25,91.212534
5,Hernandez High School,[Government],4635,3022020,652.0,68.87,69.19,80.949299
6,Holden High School,[Independent],427,248087,581.0,72.58,71.66,89.929742
7,Huang High School,[Government],2917,1910635,655.0,68.94,68.91,81.693521
8,Johnson High School,[Government],4761,3094650,650.0,68.84,69.04,82.062592
9,Pena High School,[Independent],962,585858,609.0,72.09,71.61,91.683992


In [18]:
#finding % passing reading for each school
#use .loc to filter the dataframe
reading_passing_schools_summary = school_summary_new_metric.loc[school_summary_new_metric["reading_score"] >= 50]
reading_passing_schools_summary = reading_passing_schools_summary.groupby("school_name")["student_name"].count()
reading_passing_schools_summary = pd.DataFrame({
    "school_name" : reading_passing_schools_summary.index,
    "Total Students Passing Reading" : reading_passing_schools_summary.values
})

#merge the dataframe 
# #create a column titled ["% Passing Reading"] in the dataframe
merge_reading_passing_schools = pd.merge(merged_math_scores, reading_passing_schools_summary, how="inner", on=["school_name", "school_name"])
merge_reading_passing_schools["% Passing Reading"] = (merge_reading_passing_schools["Total Students Passing Reading"]/
                              merge_reading_passing_schools["Total Students"])*100

merged_math_reading_summary = merge_reading_passing_schools.drop(["Total Students Passing Reading"], axis=1)
merged_math_reading_summary



Unnamed: 0,school_name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Maths,% Passing Reading
0,Bailey High School,[Government],4976,3124928,628.0,72.35,71.01,91.639871,87.379421
1,Cabrera High School,[Independent],1858,1081356,582.0,71.66,71.36,90.850377,89.074273
2,Figueroa High School,[Government],2949,1884411,639.0,68.7,69.08,81.654798,82.807731
3,Ford High School,[Government],2739,1763916,644.0,69.09,69.57,82.438846,82.219788
4,Griffin High School,[Independent],1468,917500,625.0,71.79,71.25,91.212534,88.487738
5,Hernandez High School,[Government],4635,3022020,652.0,68.87,69.19,80.949299,81.877023
6,Holden High School,[Independent],427,248087,581.0,72.58,71.66,89.929742,88.52459
7,Huang High School,[Government],2917,1910635,655.0,68.94,68.91,81.693521,81.453548
8,Johnson High School,[Government],4761,3094650,650.0,68.84,69.04,82.062592,81.978576
9,Pena High School,[Independent],962,585858,609.0,72.09,71.61,91.683992,86.590437


In [19]:
#finding overall passing rate for both subjects of students for each school
#use & to create a boolean condition for both math and reading
school_summary_new_metric["Overall Passing"] = (school_summary_new_metric["maths_score"]>= 50)&(school_summary_new_metric["reading_score"] >= 50)
overall_passing_both_subjects = school_summary_new_metric.loc[school_summary_new_metric["Overall Passing"]==True]
overall_pass_summary = overall_passing_both_subjects.groupby("school_name")["Overall Passing"].count()

#convert this into a dataframe with column and row; 
# index is the school name and values are the number of students passing math
overall_pass_summary = pd.DataFrame({
    "school_name" : overall_pass_summary.index,
    "Students Overall Passing" : overall_pass_summary.values
})

#merge this overall to existing dataframe
merged_summary_overall = pd.merge(merged_math_reading_summary, overall_pass_summary, how="inner", on=["school_name", "school_name"])

merged_summary_overall["% Overall Passing"] = (merged_summary_overall["Students Overall Passing"]/
                              merged_summary_overall["Total Students"])*100

merged_summary_overall = merged_summary_overall.drop(["Students Overall Passing"], axis=1)
merged_summary_overall


Unnamed: 0,school_name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
0,Bailey High School,[Government],4976,3124928,628.0,72.35,71.01,91.639871,87.379421,80.084405
1,Cabrera High School,[Independent],1858,1081356,582.0,71.66,71.36,90.850377,89.074273,80.785791
2,Figueroa High School,[Government],2949,1884411,639.0,68.7,69.08,81.654798,82.807731,67.650051
3,Ford High School,[Government],2739,1763916,644.0,69.09,69.57,82.438846,82.219788,67.46988
4,Griffin High School,[Independent],1468,917500,625.0,71.79,71.25,91.212534,88.487738,81.33515
5,Hernandez High School,[Government],4635,3022020,652.0,68.87,69.19,80.949299,81.877023,66.364617
6,Holden High School,[Independent],427,248087,581.0,72.58,71.66,89.929742,88.52459,78.922717
7,Huang High School,[Government],2917,1910635,655.0,68.94,68.91,81.693521,81.453548,66.712376
8,Johnson High School,[Government],4761,3094650,650.0,68.84,69.04,82.062592,81.978576,67.191766
9,Pena High School,[Independent],962,585858,609.0,72.09,71.61,91.683992,86.590437,79.209979


In [20]:
#clean up the DataFrame
final_school_summary = merged_summary_overall.set_index('school_name')

#formatting columns to be more readable
#removing unnecessary characters
#removing index characters
final_school_summary["School Type"] = final_school_summary["School Type"].astype(str).str.strip("[]").str.replace("'", "")
final_school_summary["Total Students"] = final_school_summary["Total Students"].astype(int)
final_school_summary["Total School Budget"] = final_school_summary["Total School Budget"].map("${:,.2f}".format)
final_school_summary["Per Student Budget"] = final_school_summary["Per Student Budget"].astype(float)
final_school_summary["Average Math Score"] = final_school_summary["Average Math Score"].round(2)
final_school_summary["Average Reading Score"] = final_school_summary["Average Reading Score"].round(2)
final_school_summary["% Passing Maths"] = final_school_summary["% Passing Maths"].round(2)
final_school_summary["% Passing Reading"] = final_school_summary["% Passing Reading"].round(2)
final_school_summary["% Overall Passing"] = final_school_summary["% Overall Passing"].round(2)
final_school_summary.index.name = None

# final_school_summary["School Type"] = final_school_summary["School Type"].astype(str).str.strip("[]").str.replace("'", "")
# final_school_summary["Total Students"] = final_school_summary["Total Students"].map("{:,}".format)
# final_school_summary["Total School Budget"] = final_school_summary["Total School Budget"].map("${:,.2f}".format)
# final_school_summary["Per Student Budget"] = final_school_summary["Per Student Budget"].map("{:,.2f}".format)
# final_school_summary["Average Math Score"] = final_school_summary["Average Math Score"].map("{:.3f}".format)
# final_school_summary["Average Reading Score"] = final_school_summary["Average Reading Score"].map("{:.3f}".format)
# final_school_summary["% Passing Maths"] = final_school_summary["% Passing Maths"].map("{:.3f}".format)
# final_school_summary["% Passing Reading"] = final_school_summary["% Passing Reading"].map("{:.3f}".format)
# final_school_summary["% Overall Passing"] = final_school_summary["% Overall Passing"].map("{:.3f}".format)

final_school_summary

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
Bailey High School,Government,4976,"$3,124,928.00",628.0,72.35,71.01,91.64,87.38,80.08
Cabrera High School,Independent,1858,"$1,081,356.00",582.0,71.66,71.36,90.85,89.07,80.79
Figueroa High School,Government,2949,"$1,884,411.00",639.0,68.7,69.08,81.65,82.81,67.65
Ford High School,Government,2739,"$1,763,916.00",644.0,69.09,69.57,82.44,82.22,67.47
Griffin High School,Independent,1468,"$917,500.00",625.0,71.79,71.25,91.21,88.49,81.34
Hernandez High School,Government,4635,"$3,022,020.00",652.0,68.87,69.19,80.95,81.88,66.36
Holden High School,Independent,427,"$248,087.00",581.0,72.58,71.66,89.93,88.52,78.92
Huang High School,Government,2917,"$1,910,635.00",655.0,68.94,68.91,81.69,81.45,66.71
Johnson High School,Government,4761,"$3,094,650.00",650.0,68.84,69.04,82.06,81.98,67.19
Pena High School,Independent,962,"$585,858.00",609.0,72.09,71.61,91.68,86.59,79.21


In [21]:
#TOP PERFORMING SCHOOLS BY % OVERALL PASSING
#use sort_values to sort the dataframe in descending order
#from highest to lowest order
#display the top 5 rows of the dataframe .head()

top_5_overall_passing = final_school_summary.sort_values("% Overall Passing", ascending=False)
top_schools = top_5_overall_passing 
top_schools.head()


Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
Griffin High School,Independent,1468,"$917,500.00",625.0,71.79,71.25,91.21,88.49,81.34
Cabrera High School,Independent,1858,"$1,081,356.00",582.0,71.66,71.36,90.85,89.07,80.79
Bailey High School,Government,4976,"$3,124,928.00",628.0,72.35,71.01,91.64,87.38,80.08
Wright High School,Independent,1800,"$1,049,400.00",583.0,72.05,70.97,91.78,86.67,79.72
Rodriguez High School,Government,3999,"$2,547,363.00",637.0,72.05,70.94,90.8,87.4,79.42


In [22]:
#LOWEST PERFORMING SCHOOLS BY % OVERALL PASSING
#use sort_values to sort the dataframe in ascending order
#from lowest to highest order

lowest_5_overall_passing = final_school_summary.sort_values("% Overall Passing", ascending=True)
bottom_schools = lowest_5_overall_passing
bottom_schools.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
Hernandez High School,Government,4635,"$3,022,020.00",652.0,68.87,69.19,80.95,81.88,66.36
Huang High School,Government,2917,"$1,910,635.00",655.0,68.94,68.91,81.69,81.45,66.71
Johnson High School,Government,4761,"$3,094,650.00",650.0,68.84,69.04,82.06,81.98,67.19
Wilson High School,Independent,2283,"$1,319,574.00",578.0,69.17,68.88,82.79,81.3,67.46
Ford High School,Government,2739,"$1,763,916.00",644.0,69.09,69.57,82.44,82.22,67.47


In [23]:
# MATHS SCORES BY YEAR
#listing the average math score for students of each year level (9, 10, 11, 12) at each school
#group the dataframe of year level by school name and the average maths score
#filter the data, by grabbing each year level by using .loc

year_9 = school_data_complete_df.loc[school_data_complete_df["year"] == 9]
year_10 = school_data_complete_df.loc[school_data_complete_df["year"] == 10]
year_11 = school_data_complete_df.loc[school_data_complete_df["year"] == 11]
year_12 = school_data_complete_df.loc[school_data_complete_df["year"] == 12]

year_9_maths = year_9.groupby(["school_name"])["maths_score"].mean().round(2)
year_10_maths = year_10.groupby(["school_name"])["maths_score"].mean().round(2)
year_11_maths = year_11.groupby(["school_name"])["maths_score"].mean().round(2)
year_12_maths = year_12.groupby(["school_name"])["maths_score"].mean().round(2)

#convert this into a dataframe;
math_scores_by_year = pd.DataFrame({ "Year 9": year_9_maths,
                                    "Year 10": year_10_maths,
                                    "Year 11": year_11_maths,
                                    "Year 12": year_12_maths
                                    })

math_scores_by_year

Unnamed: 0_level_0,Year 9,Year 10,Year 11,Year 12
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,72.49,71.9,72.37,72.68
Cabrera High School,72.32,72.44,71.01,70.6
Figueroa High School,68.48,68.33,68.81,69.33
Ford High School,69.02,69.39,69.25,68.62
Griffin High School,72.79,71.09,71.69,71.47
Hernandez High School,68.59,68.87,69.15,68.99
Holden High School,70.54,75.11,71.64,73.41
Huang High School,69.08,68.53,69.43,68.64
Johnson High School,69.47,67.99,68.64,69.29
Pena High School,72.0,72.4,72.52,71.19


In [24]:
#READING SCORES BY YEAR
#listing the average reading score for students of each year level (9, 10, 11, 12) at each school
#group the dataframe of year level by school name, year level and the average reading score 
year_9 = school_data_complete_df.loc[school_data_complete_df["year"] == 9]
year_10 = school_data_complete_df.loc[school_data_complete_df["year"] == 10]
year_11 = school_data_complete_df.loc[school_data_complete_df["year"] == 11]
year_12 = school_data_complete_df.loc[school_data_complete_df["year"] == 12]

year_9_reading = year_9.groupby(["school_name"])["reading_score"].mean().round(2)
year_10_reading = year_10.groupby(["school_name"])["reading_score"].mean().round(2)
year_11_reading = year_11.groupby(["school_name"])["reading_score"].mean().round(2)
year_12_reading = year_12.groupby(["school_name"])["reading_score"].mean().round(2)

#convert this into dataframe;
reading_scores_by_year = pd.DataFrame({ "Year 9": year_9_reading,
                                    "Year 10": year_10_reading,
                                    "Year 11": year_11_reading,
                                    "Year 12": year_12_reading
                                    })

reading_scores_by_year

Unnamed: 0_level_0,Year 9,Year 10,Year 11,Year 12
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,70.9,70.85,70.32,72.2
Cabrera High School,71.17,71.33,71.2,71.86
Figueroa High School,70.26,67.68,69.15,69.08
Ford High School,69.62,68.99,70.74,68.85
Griffin High School,72.03,70.75,72.39,69.43
Hernandez High School,68.48,70.62,68.42,69.24
Holden High School,71.6,71.1,73.31,70.48
Huang High School,68.67,69.52,68.74,68.67
Johnson High School,68.72,69.3,69.97,67.99
Pena High School,70.95,72.32,71.7,71.51


In [25]:
#SCORES BY SCHOOL SPENDING 
spending_bins = [0, 585, 630, 645, 680]
labels = ["<$585", "$585-630", "$630-645", "$645-680"]

final_school_summary["Spending Ranges (Per Student)"] = pd.cut(final_school_summary["Per Student Budget"],
                                                                      bins=spending_bins, labels=labels)


school_spending_index = final_school_summary.set_index("Spending Ranges (Per Student)")

spending_math_scores = school_spending_index.groupby(["Spending Ranges (Per Student)"])["Average Math Score"].mean().round(2)
spending_reading_scores = school_spending_index.groupby(["Spending Ranges (Per Student)"])["Average Reading Score"].mean().round(2)
spending_passing_math = school_spending_index.groupby(["Spending Ranges (Per Student)"])["% Passing Maths"].mean().round(2)
spending_passing_reading = school_spending_index.groupby(["Spending Ranges (Per Student)"])["% Passing Reading"].mean().round(2)
overall_passing_spending = school_spending_index.groupby(["Spending Ranges (Per Student)"])["% Overall Passing"].mean().round(2)

spending_summary = pd.DataFrame({
    "Average Math Score" : spending_math_scores,
    "Average Reading Score" : spending_reading_scores,
    "% Passing Maths" : spending_passing_math,
    "% Passing Reading" : spending_passing_reading,
    "% Overall Passing" : overall_passing_spending
})


spending_summary

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,71.36,70.72,88.84,86.39,76.72
$585-630,72.06,71.03,91.52,87.29,79.88
$630-645,69.86,69.84,84.68,83.76,71.0
$645-680,68.88,69.05,81.57,81.77,66.75


In [26]:
#SCORES BY SCHOOL SIZE

size_bins = [0, 1000, 2000, 5000]
labels = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

final_school_summary["School Size"] = pd.cut(final_school_summary["Total Students"],
                                                                      bins=size_bins, labels=labels)

school_size_index = final_school_summary.set_index("School Size")

schoolsize_math_scores = school_size_index.groupby(["School Size"])["Average Math Score"].mean().round(2)
schoolsize_reading_scores = school_size_index.groupby(["School Size"])["Average Reading Score"].mean().round(2)
schoolsize_passing_math = school_size_index.groupby(["School Size"])["% Passing Maths"].mean().round(2)
schoolsize_passing_reading = school_size_index.groupby(["School Size"])["% Passing Reading"].mean().round(2)
schoolsize_overall_passing = school_size_index.groupby(["School Size"])["% Overall Passing"].mean().round(2)

school_size_summary = pd.DataFrame({"Average Math Score" : schoolsize_math_scores,
                                    "Average Reading Score" : schoolsize_reading_scores,
                                    "% Passing Maths" : schoolsize_passing_math,
                                    "% Passing Reading" : schoolsize_passing_reading,
                                    "% Overall Passing" : schoolsize_overall_passing
                                    })

school_size_summary


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),72.34,71.63,90.8,87.56,79.06
Medium (1000-2000),71.42,70.72,89.85,86.71,78.04
Large (2000-5000),69.75,69.58,84.25,83.3,70.29


In [27]:
school_type_index = final_school_summary.set_index("School Type")

school_type_math_scores = school_type_index.groupby(["School Type"])["Average Math Score"].mean().round(2)
school_type_reading_scores = school_type_index.groupby(["School Type"])["Average Reading Score"].mean().round(2)
school_type_passing_math = school_type_index.groupby(["School Type"])["% Passing Maths"].mean().round(2)
school_type_passing_reading = school_type_index.groupby(["School Type"])["% Passing Reading"].mean().round(2)
school_type_overall_passing = school_type_index.groupby(["School Type"])["% Overall Passing"].mean().round(2)

school_type = pd.DataFrame({"Average Math Score" : school_type_math_scores,
                            "Average Reading Score" : school_type_reading_scores,
                            "% Passing Maths" : school_type_passing_math,
                            "% Passing Reading" : school_type_passing_reading,
                            "% Overall Passing" : school_type_overall_passing
})

school_type

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Government,69.83,69.68,84.46,83.59,70.7
Independent,71.37,70.72,89.2,86.25,76.97
