In [2]:
# Import Dependencies
import os
import csv
import pandas as pd

# Load the csv files into pandas dataframe
filepath = os.path.join("Resources", "schools_complete.csv")
school_name_df = pd.read_csv(filepath)
school_name_df.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [3]:

filepath2 = os.path.join("Resources", "students_complete.csv")
student_name_df = pd.read_csv(filepath2)
student_name_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [4]:
# Total Number of schools
total_num_schools = school_name_df["school_name"].nunique()
total_num_schools

15

In [5]:
# Total number of students
total_num_students = int(student_name_df["Student ID"].nunique())
total_num_students

39170

In [6]:
# Total budget with sum function
total_budget = school_name_df["budget"].sum()
total_budget

24649428

In [7]:
# Calculate Average Math scores
average_math_score = round(student_name_df["math_score"].mean(),1)
average_math_score

79.0

In [9]:
# Average Reading Scores
average_reading_score = round(student_name_df["reading_score"].mean(),0)
average_reading_score

82.0

In [12]:
# Total number of students passing math (scores above 70)
student_pass_math = student_name_df[student_name_df["math_score"] > 70].count()["student_name"]
# Percent of Students Passing Maths
perc_stu_pass_math = round((student_pass_math/total_num_students)*100,0)
perc_stu_pass_math

72.0

In [13]:
# Total students that pass reading
student_pass_reading = student_name_df[student_name_df["reading_score"] >70 ].count()["student_name"]

# Percentage of students that pass reading
perc_pass_reading = round(((student_pass_reading/total_num_students)*100),1)
perc_pass_reading

83.0

In [14]:
# Overall Pass Rate
overall_pass_rate = (perc_stu_pass_math + perc_pass_reading)/2
overall_pass_rate

77.5

In [15]:
#Overview Table
dist_Sum_df = pd.DataFrame({"Total Schools":[total_num_schools],
                      "Total Students":[total_num_students],
                      "Total Budget":[total_budget],
                      "Average Math Score":[average_math_score],
                      'Average Reading Score':[average_reading_score],
                      "% Passing Math":[perc_stu_pass_math],
                      "% Passing Reading":[perc_pass_reading],
                      "Overall Passing rate":[overall_pass_rate]})
dist_Sum_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing rate
0,15,39170,24649428,79.0,82.0,72.0,83.0,77.5


In [None]:
School Summary

In [16]:
# Name of Schools and Type
type_of_school = school_name_df.set_index(["school_name"])["type"]
type_of_school

school_name
Huang High School        District
Figueroa High School     District
Shelton High School       Charter
Hernandez High School    District
Griffin High School       Charter
Wilson High School        Charter
Cabrera High School       Charter
Bailey High School       District
Holden High School        Charter
Pena High School          Charter
Wright High School        Charter
Rodriguez High School    District
Johnson High School      District
Ford High School         District
Thomas High School        Charter
Name: type, dtype: object

In [17]:
# Dataframe merge
merge_df = pd.merge(student_name_df,school_name_df,on="school_name")
merge_df.head()

# Total # of Students per School
total_student_per_school = merge_df["school_name"].value_counts()

In [18]:
# Total school Budget
bg_per_school = merge_df.groupby(["school_name"]).mean()["budget"]
bg_per_school

school_name
Bailey High School       3124928.0
Cabrera High School      1081356.0
Figueroa High School     1884411.0
Ford High School         1763916.0
Griffin High School       917500.0
Hernandez High School    3022020.0
Holden High School        248087.0
Huang High School        1910635.0
Johnson High School      3094650.0
Pena High School          585858.0
Rodriguez High School    2547363.0
Shelton High School      1056600.0
Thomas High School       1043130.0
Wilson High School       1319574.0
Wright High School       1049400.0
Name: budget, dtype: float64

In [20]:
# Budget Per Student
bg_per_stu = bg_per_school/total_student_per_school
bg_per_stu

Bailey High School       628.0
Cabrera High School      582.0
Figueroa High School     639.0
Ford High School         644.0
Griffin High School      625.0
Hernandez High School    652.0
Holden High School       581.0
Huang High School        655.0
Johnson High School      650.0
Pena High School         609.0
Rodriguez High School    637.0
Shelton High School      600.0
Thomas High School       638.0
Wilson High School       578.0
Wright High School       583.0
dtype: float64

In [21]:
# Calculate Avg Math Score
# Total Math Score per School
ams = round(merge_df.groupby(["school_name"]).mean()["math_score"],2)
ams

school_name
Bailey High School       77.05
Cabrera High School      83.06
Figueroa High School     76.71
Ford High School         77.10
Griffin High School      83.35
Hernandez High School    77.29
Holden High School       83.80
Huang High School        76.63
Johnson High School      77.07
Pena High School         83.84
Rodriguez High School    76.84
Shelton High School      83.36
Thomas High School       83.42
Wilson High School       83.27
Wright High School       83.68
Name: math_score, dtype: float64

In [22]:
# Calculate Average reading scores per school 
# Total reading scores per school
ars = round(merge_df.groupby(["school_name"]).mean()["reading_score"],2)
ars

school_name
Bailey High School       81.03
Cabrera High School      83.98
Figueroa High School     81.16
Ford High School         80.75
Griffin High School      83.82
Hernandez High School    80.93
Holden High School       83.81
Huang High School        81.18
Johnson High School      80.97
Pena High School         84.04
Rodriguez High School    80.74
Shelton High School      83.73
Thomas High School       83.85
Wilson High School       83.99
Wright High School       83.96
Name: reading_score, dtype: float64

In [25]:
# Number of students passing math by schools
student_pass_math = merge_df[(merge_df["math_score"] > 70)]
spmps = student_pass_math.groupby(["school_name"]).count()["student_name"]
perc_pass_math_schools = round((spmps/total_student_per_school)*100,2)
perc_pass_math_schools

Bailey High School       64.63
Cabrera High School      89.56
Figueroa High School     63.75
Ford High School         65.75
Griffin High School      89.71
Hernandez High School    64.75
Holden High School       90.63
Huang High School        63.32
Johnson High School      63.85
Pena High School         91.68
Rodriguez High School    64.07
Shelton High School      89.89
Thomas High School       90.21
Wilson High School       90.93
Wright High School       90.28
dtype: float64

In [26]:
# Find the % of students that pass reading according to school

# Find Total student who pass reading from combined dataset
student_pass_reading = merge_df[merge_df["reading_score"] > 70]

#find the students passing reading schoolwise by doing groupby
sprps = student_pass_reading.groupby(["school_name"]).count()["student_name"]

#percentage of students passing reading per school
perc_pass_reading_byschool = round((sprps/total_student_per_school)*100,2)
perc_pass_reading_byschool

Bailey High School       79.30
Cabrera High School      93.86
Figueroa High School     78.43
Ford High School         77.51
Griffin High School      93.39
Hernandez High School    78.19
Holden High School       92.74
Huang High School        78.81
Johnson High School      78.28
Pena High School         92.20
Rodriguez High School    77.74
Shelton High School      92.62
Thomas High School       92.91
Wilson High School       93.25
Wright High School       93.44
dtype: float64

In [27]:
# Calculate overall passing rate according to school
oprs = (perc_pass_math_schools + perc_pass_reading_byschool)/2
oprs

Bailey High School       71.965
Cabrera High School      91.710
Figueroa High School     71.090
Ford High School         71.630
Griffin High School      91.550
Hernandez High School    71.470
Holden High School       91.685
Huang High School        71.065
Johnson High School      71.065
Pena High School         91.940
Rodriguez High School    70.905
Shelton High School      91.255
Thomas High School       91.560
Wilson High School       92.090
Wright High School       91.860
dtype: float64

In [28]:
#creating a summary of the school data
school_summary_df = pd.DataFrame({"School Type":type_of_school,
                                 "Total Students":total_student_per_school,
                                 "Total School Budget":bg_per_school,
                                 "Budget per student":bg_per_stu,
                                 "Average Math Score Schoolwise":ams,
                                 "Average Reading Score Schoolwise":ars,
                                 "% Passing Math Schoolwise":perc_pass_math_schools,
                                 "% Passing Reading Schoolwise":perc_pass_reading_byschool,
                                 "Overall Passing rate":oprs})
school_summary_df

Unnamed: 0,School Type,Total Students,Total School Budget,Budget per student,Average Math Score Schoolwise,Average Reading Score Schoolwise,% Passing Math Schoolwise,% Passing Reading Schoolwise,Overall Passing rate
Bailey High School,District,4976,3124928.0,628.0,77.05,81.03,64.63,79.3,71.965
Cabrera High School,Charter,1858,1081356.0,582.0,83.06,83.98,89.56,93.86,91.71
Figueroa High School,District,2949,1884411.0,639.0,76.71,81.16,63.75,78.43,71.09
Ford High School,District,2739,1763916.0,644.0,77.1,80.75,65.75,77.51,71.63
Griffin High School,Charter,1468,917500.0,625.0,83.35,83.82,89.71,93.39,91.55
Hernandez High School,District,4635,3022020.0,652.0,77.29,80.93,64.75,78.19,71.47
Holden High School,Charter,427,248087.0,581.0,83.8,83.81,90.63,92.74,91.685
Huang High School,District,2917,1910635.0,655.0,76.63,81.18,63.32,78.81,71.065
Johnson High School,District,4761,3094650.0,650.0,77.07,80.97,63.85,78.28,71.065
Pena High School,Charter,962,585858.0,609.0,83.84,84.04,91.68,92.2,91.94


In [29]:
# identify per school summary variable
summary_per_school = school_summary_df[["School Type",  "Total School Budget","Budget per student",
                                         "Average Math Score Schoolwise", "Average Reading Score Schoolwise", 
                                         "% Passing Math Schoolwise", "% Passing Reading Schoolwise", 
                                         "Overall Passing rate"]]

# Format so the Dollar sign is showing 
summary_per_school["Total School Budget"] = summary_per_school["Total School Budget"].map("${:,.2f}".format)
summary_per_school["Budget per student"] = summary_per_school["Budget per student"].map("${:,.2f}".format)

# Make sure data frame displays
summary_per_school

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_per_school["Total School Budget"] = summary_per_school["Total School Budget"].map("${:,.2f}".format)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_per_school["Budget per student"] = summary_per_school["Budget per student"].map("${:,.2f}".format)


Unnamed: 0,School Type,Total School Budget,Budget per student,Average Math Score Schoolwise,Average Reading Score Schoolwise,% Passing Math Schoolwise,% Passing Reading Schoolwise,Overall Passing rate
Bailey High School,District,"$3,124,928.00",$628.00,77.05,81.03,64.63,79.3,71.965
Cabrera High School,Charter,"$1,081,356.00",$582.00,83.06,83.98,89.56,93.86,91.71
Figueroa High School,District,"$1,884,411.00",$639.00,76.71,81.16,63.75,78.43,71.09
Ford High School,District,"$1,763,916.00",$644.00,77.1,80.75,65.75,77.51,71.63
Griffin High School,Charter,"$917,500.00",$625.00,83.35,83.82,89.71,93.39,91.55
Hernandez High School,District,"$3,022,020.00",$652.00,77.29,80.93,64.75,78.19,71.47
Holden High School,Charter,"$248,087.00",$581.00,83.8,83.81,90.63,92.74,91.685
Huang High School,District,"$1,910,635.00",$655.00,76.63,81.18,63.32,78.81,71.065
Johnson High School,District,"$3,094,650.00",$650.00,77.07,80.97,63.85,78.28,71.065
Pena High School,Charter,"$585,858.00",$609.00,83.84,84.04,91.68,92.2,91.94


In [None]:
Top Performing Schools by Passing Rate

In [30]:
top_schools = summary_per_school.sort_values(["Overall Passing rate"],ascending = False).head(5)
top_schools

Unnamed: 0,School Type,Total School Budget,Budget per student,Average Math Score Schoolwise,Average Reading Score Schoolwise,% Passing Math Schoolwise,% Passing Reading Schoolwise,Overall Passing rate
Wilson High School,Charter,"$1,319,574.00",$578.00,83.27,83.99,90.93,93.25,92.09
Pena High School,Charter,"$585,858.00",$609.00,83.84,84.04,91.68,92.2,91.94
Wright High School,Charter,"$1,049,400.00",$583.00,83.68,83.96,90.28,93.44,91.86
Cabrera High School,Charter,"$1,081,356.00",$582.00,83.06,83.98,89.56,93.86,91.71
Holden High School,Charter,"$248,087.00",$581.00,83.8,83.81,90.63,92.74,91.685


In [None]:
Bottom 5 Performing Schools by Passing Rate

In [31]:
bottom_five_schools = summary_per_school.sort_values(["Overall Passing rate"],ascending = False).tail(5)
bottom_five_schools

Unnamed: 0,School Type,Total School Budget,Budget per student,Average Math Score Schoolwise,Average Reading Score Schoolwise,% Passing Math Schoolwise,% Passing Reading Schoolwise,Overall Passing rate
Hernandez High School,District,"$3,022,020.00",$652.00,77.29,80.93,64.75,78.19,71.47
Figueroa High School,District,"$1,884,411.00",$639.00,76.71,81.16,63.75,78.43,71.09
Huang High School,District,"$1,910,635.00",$655.00,76.63,81.18,63.32,78.81,71.065
Johnson High School,District,"$3,094,650.00",$650.00,77.07,80.97,63.85,78.28,71.065
Rodriguez High School,District,"$2,547,363.00",$637.00,76.84,80.74,64.07,77.74,70.905


In [None]:
Math Scores 9th Through 12th

In [32]:
ams = round(merge_df.groupby(["grade"]).mean()["math_score"],2)
ams

grade
10th    78.94
11th    79.08
12th    78.99
9th     78.94
Name: math_score, dtype: float64

In [None]:
Reading Scores 9th through 12th

In [33]:
ars = round(merge_df.groupby(["grade"]).mean()["reading_score"],2)
ars

grade
10th    81.87
11th    81.89
12th    81.82
9th     81.91
Name: reading_score, dtype: float64

In [None]:
Student Scores by School Related to Spending

In [34]:
# Create bins where data will be placed
bins = [0, 585, 615, 645, 675]
names = ["<$585", "$585-615", "$615-645", "$645-675"]

In [35]:
# Categorize the spending per school 
summary_per_school["Spending Ranges (Per Student)"] = pd.cut(bg_per_stu,bins,labels = names)
summary_per_school

Unnamed: 0,School Type,Total School Budget,Budget per student,Average Math Score Schoolwise,Average Reading Score Schoolwise,% Passing Math Schoolwise,% Passing Reading Schoolwise,Overall Passing rate,Spending Ranges (Per Student)
Bailey High School,District,"$3,124,928.00",$628.00,77.05,81.03,64.63,79.3,71.965,$615-645
Cabrera High School,Charter,"$1,081,356.00",$582.00,83.06,83.98,89.56,93.86,91.71,<$585
Figueroa High School,District,"$1,884,411.00",$639.00,76.71,81.16,63.75,78.43,71.09,$615-645
Ford High School,District,"$1,763,916.00",$644.00,77.1,80.75,65.75,77.51,71.63,$615-645
Griffin High School,Charter,"$917,500.00",$625.00,83.35,83.82,89.71,93.39,91.55,$615-645
Hernandez High School,District,"$3,022,020.00",$652.00,77.29,80.93,64.75,78.19,71.47,$645-675
Holden High School,Charter,"$248,087.00",$581.00,83.8,83.81,90.63,92.74,91.685,<$585
Huang High School,District,"$1,910,635.00",$655.00,76.63,81.18,63.32,78.81,71.065,$645-675
Johnson High School,District,"$3,094,650.00",$650.00,77.07,80.97,63.85,78.28,71.065,$645-675
Pena High School,Charter,"$585,858.00",$609.00,83.84,84.04,91.68,92.2,91.94,$585-615


In [None]:
Scores Based on Spending Ranges

In [None]:
# Average Math score by Spending = amss
# Average REading Score by Spending = arss
# Students that pass math based on spending = spms
# Students that pass reading based on spending = sprs
# The overall pass rate based on spending = oprs

In [36]:
amss = round(summary_per_school.groupby(["Spending Ranges (Per Student)"]).mean()["Average Math Score Schoolwise"],2)
arss = round(summary_per_school.groupby(["Spending Ranges (Per Student)"]).mean()["Average Reading Score Schoolwise"],2)
spms = round(summary_per_school.groupby(["Spending Ranges (Per Student)"]).mean()["% Passing Math Schoolwise"],2)
sprs= round(summary_per_school.groupby(["Spending Ranges (Per Student)"]).mean()["% Passing Reading Schoolwise"],2)
oprs = (spms + sprs)/2

In [37]:
scores_based_spending = pd.DataFrame({"Average Math Score (Spending)":amss,
                                     "Average Reading Score (Spending)":arss,
                                     "% Passing Math (Spending)":spms,
                                     "%Passing Reading (Spending)":sprs ,
                                     "Overall Passing rate (Spending)":oprs})

# Minor data Munging
scores_based_spending = scores_based_spending[["Average Math Score (Spending)","Average Reading Score (Spending)","% Passing Math (Spending)",
                         "%Passing Reading (Spending)","Overall Passing rate (Spending)"]]

# display the dataframe
scores_based_spending

Unnamed: 0_level_0,Average Math Score (Spending),Average Reading Score (Spending),% Passing Math (Spending),%Passing Reading (Spending),Overall Passing rate (Spending)
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,83.45,83.94,90.35,93.32,91.835
$585-615,83.6,83.88,90.78,92.41,91.595
$615-645,79.08,81.89,73.02,83.21,78.115
$645-675,77.0,81.03,63.97,78.43,71.2


In [None]:
Scores by School Size

In [38]:
# Create bins based on school size
bins_size = [0,1500,3000,5000]
labels_size = ["Small","Medium","Large"]

In [39]:
# categorise the dataframe
summary_per_school["Size of the school"] = pd.cut(school_summary_df["Total Students"],bins_size,labels = labels_size)
summary_per_school

Unnamed: 0,School Type,Total School Budget,Budget per student,Average Math Score Schoolwise,Average Reading Score Schoolwise,% Passing Math Schoolwise,% Passing Reading Schoolwise,Overall Passing rate,Spending Ranges (Per Student),Size of the school
Bailey High School,District,"$3,124,928.00",$628.00,77.05,81.03,64.63,79.3,71.965,$615-645,Large
Cabrera High School,Charter,"$1,081,356.00",$582.00,83.06,83.98,89.56,93.86,91.71,<$585,Medium
Figueroa High School,District,"$1,884,411.00",$639.00,76.71,81.16,63.75,78.43,71.09,$615-645,Medium
Ford High School,District,"$1,763,916.00",$644.00,77.1,80.75,65.75,77.51,71.63,$615-645,Medium
Griffin High School,Charter,"$917,500.00",$625.00,83.35,83.82,89.71,93.39,91.55,$615-645,Small
Hernandez High School,District,"$3,022,020.00",$652.00,77.29,80.93,64.75,78.19,71.47,$645-675,Large
Holden High School,Charter,"$248,087.00",$581.00,83.8,83.81,90.63,92.74,91.685,<$585,Small
Huang High School,District,"$1,910,635.00",$655.00,76.63,81.18,63.32,78.81,71.065,$645-675,Medium
Johnson High School,District,"$3,094,650.00",$650.00,77.07,80.97,63.85,78.28,71.065,$645-675,Large
Pena High School,Charter,"$585,858.00",$609.00,83.84,84.04,91.68,92.2,91.94,$585-615,Small


In [None]:
# Average Math score by size = avg_mss
# Average REading Score by size = avg_rss
# Percent of Students that pass math based on size = pspms
# percent of Students that pass reading based on size = psprs
# The overall pass rate based on size = opr

In [40]:
avg_mss = summary_per_school.groupby(["Size of the school"]).mean()["Average Math Score Schoolwise"]
avg_rss = summary_per_school.groupby(["Size of the school"]).mean()["Average Reading Score Schoolwise"]
pspms = summary_per_school.groupby(["Size of the school"]).mean()["% Passing Math Schoolwise"]
psprs = summary_per_school.groupby(["Size of the school"]).mean()["% Passing Reading Schoolwise"]
opr = summary_per_school.groupby(["Size of the school"]).mean()["Overall Passing rate"]

In [None]:
# create dataframe for scores by on school size

In [41]:
scores_school_size = pd.DataFrame({"Average Math Score":avg_mss,
                                  "Average Reading Score":avg_rss,
                                  "% Passing Math":pspms,
                                  "% Passing Reading":psprs,
                                  "Overall Passing rate":opr})
scores_school_size = scores_school_size[["Average Math Score","Average Reading Score",
                                        "% Passing Math","% Passing Reading",]]

In [None]:
Scores based on School 

In [None]:
# average students math score by school type = amst
# average students reading score by school type = arst
# Percent of students passing math by type = ppmt
# percent or student pass reading by type = pprt
# overall passing rate by type = ovpr


In [42]:
amst = summary_per_school.groupby(["School Type"]).mean()["Average Math Score Schoolwise"]
arst = summary_per_school.groupby(["School Type"]).mean()["Average Reading Score Schoolwise"]
ppmt = summary_per_school.groupby(["School Type"]).mean()["% Passing Math Schoolwise"]
pprt = summary_per_school.groupby(["School Type"]).mean()["% Passing Reading Schoolwise"]
ovpr = summary_per_school.groupby(["School Type"]).mean()["Overall Passing rate"]

In [43]:
scores_school_type = pd.DataFrame({"Average Math Score":amst,
                                  "Average Reading Score":arst,
                                  "% Passing Math":ppmt,
                                  "% Passing Reading":pprt,
                                  "Overall Passing rate":ovpr})
scores_school_type = scores_school_type[["Average Math Score","Average Reading Score",
                                        "% Passing Math","% Passing Reading","Overall Passing rate"]]
scores_school_type

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing rate
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.4725,83.8975,90.36125,93.05125,91.70625
District,76.955714,80.965714,64.302857,78.322857,71.312857
