In [1]:
# Dependencies
import pandas as pd
import numpy as np

In [2]:
# CVS File location
schools_rdata = "raw_data/schools_complete.csv"
students_rdata = "raw_data/students_complete.csv"

In [3]:
# format decimals
pd.options.display.float_format = '{:,.2f}'.format

# read Schools data
schools_pd = pd.read_csv(schools_rdata)
schools_pd.head()

# rename column 'name' to 'school'
school = schools_pd.rename(columns={"name": "school"})
school.head()

Unnamed: 0,School ID,school,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [4]:
# read Students data
students_pd = pd.read_csv(students_rdata)

In [5]:
# summary of data
total_schools = len(school)
total_schools

total_students = students_pd["name"].count()
total_students

total_budget = school["budget"].sum()
total_budget

avg_math_score = students_pd["math_score"].mean()
avg_math_score

avg_reading_score = students_pd["reading_score"].mean()
avg_reading_score

# assumption: the passing grade is 65 or a D in an American high school

passed_math = students_pd.loc[students_pd["math_score"] >= 65] ["math_score"].count()
passed_math

percent_passed_math = (passed_math/total_students) * 100
percent_passed_math

passed_reading =  students_pd.loc[students_pd["reading_score"] >= 65] ["reading_score"].count()
passed_reading

percent_passed_reading = (passed_reading/total_students) * 100
percent_passed_reading

overall_passing_rate = (percent_passed_reading + percent_passed_math) / 2
overall_passing_rate

district_summary1 = pd.DataFrame({"Total Schools": [total_schools],
                                "Total Students": [total_students],
                                "Total Budget": [total_budget],
                                "Average Math Score": [avg_math_score],
                                "Average Reading Score": [avg_reading_score],
                                "% Passing Math": [percent_passed_math],
                                "% Passing Reading": [percent_passed_reading],
                                "% Overall Passing Rate": [overall_passing_rate]})
district_summary1


district_summary = district_summary1[["Total Schools","Total Students","Total Budget","Average Math Score","Average Reading Score","% Passing Math","% Passing Reading", "% Overall Passing Rate"]]
district_summary.head()

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,24649428,78.99,81.88,84.73,96.2,90.46


In [6]:
# school summary

school.rename(columns = {'name':'school'}, inplace=True)

merged_df = students_pd.merge(school, how='left', on='school')
merged_df.head(10)

grouped_school = merged_df.groupby(['school'])
grouped_school.count().head(10)

groupby_school = merged_df['budget'].groupby(merged_df['school'])
groupby_school

groupby_school.mean().head(10)

school_summary1 = merged_df.groupby(['school'])
school_summary1

# school types
school_types = school.set_index('school')['type']

# total students by school
student_byschool = grouped_school['Student ID'].count()

# school budget
school_budget = school.set_index('school')['budget']

# per student budget
student_budget = school.set_index('school')['budget']/school.set_index('school')['size']

# average scores by school
avg_mathscores = grouped_school['math_score'].mean()
avg_readscores = grouped_school['reading_score'].mean()

# % passing scores
perc_pass_math = (merged_df[merged_df['math_score'] >= 65].groupby('school')['Student ID'].count()/student_byschool) * 100 
perc_pass_read = (merged_df[merged_df['reading_score'] >= 65].groupby('school')['Student ID'].count()/student_byschool ) * 100
overall = (merged_df[(merged_df['reading_score'] >= 65) & (merged_df['math_score'] >= 65)].groupby('school')['Student ID'].count()/student_byschool) * 100

school_summary1 = pd.DataFrame({
    "School Type": school_types,
    "Total Students": student_byschool,
    "Per Student Budget": student_budget,
    "Total School Budget": school_budget,
    "Average Math Score": avg_mathscores,
    "Average Reading Score": avg_readscores,
    '% Passing Math': perc_pass_math,
    '% Passing Reading': perc_pass_read,
    "Overall Passing Rate": overall
})

school_summary1

school_summary = school_summary1[['School Type', 
                          'Total Students', 
                          'Total School Budget', 
                          'Per Student Budget', 
                          'Average Math Score', 
                          'Average Reading Score',
                          '% Passing Math',
                          '% Passing Reading',
                          'Overall Passing Rate']]

school_summary

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Bailey High School,District,4976,3124928,628.0,77.05,81.03,77.91,94.55,73.75
Cabrera High School,Charter,1858,1081356,582.0,83.06,83.98,100.0,100.0,100.0
Figueroa High School,District,2949,1884411,639.0,76.71,81.16,77.18,94.54,72.84
Ford High School,District,2739,1763916,644.0,77.1,80.75,78.2,93.87,73.57
Griffin High School,Charter,1468,917500,625.0,83.35,83.82,100.0,100.0,100.0
Hernandez High School,District,4635,3022020,652.0,77.29,80.93,77.73,94.61,73.55
Holden High School,Charter,427,248087,581.0,83.8,83.81,100.0,100.0,100.0
Huang High School,District,2917,1910635,655.0,76.63,81.18,77.72,94.48,73.26
Johnson High School,District,4761,3094650,650.0,77.07,80.97,77.97,94.48,73.75
Pena High School,Charter,962,585858,609.0,83.84,84.04,100.0,100.0,100.0


In [7]:
# extract the top performing schools
top_five_schools = school_summary.sort_values("Overall Passing Rate", ascending = False)
top_five_schools.head(5)

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Cabrera High School,Charter,1858,1081356,582.0,83.06,83.98,100.0,100.0,100.0
Griffin High School,Charter,1468,917500,625.0,83.35,83.82,100.0,100.0,100.0
Holden High School,Charter,427,248087,581.0,83.8,83.81,100.0,100.0,100.0
Pena High School,Charter,962,585858,609.0,83.84,84.04,100.0,100.0,100.0
Shelton High School,Charter,1761,1056600,600.0,83.36,83.73,100.0,100.0,100.0


In [8]:
# lower five performing schools
lower_five_schools = top_five_schools.tail()
lower_five_schools = lower_five_schools.sort_values("Overall Passing Rate")
lower_five_schools

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Figueroa High School,District,2949,1884411,639.0,76.71,81.16,77.18,94.54,72.84
Huang High School,District,2917,1910635,655.0,76.63,81.18,77.72,94.48,73.26
Hernandez High School,District,4635,3022020,652.0,77.29,80.93,77.73,94.61,73.55
Ford High School,District,2739,1763916,644.0,77.1,80.75,78.2,93.87,73.57
Rodriguez High School,District,3999,2547363,637.0,76.84,80.74,77.94,94.62,73.62


In [9]:
# math scores by grade
twelfthgrade = students_pd.loc[students_pd["grade"] == "12th"].groupby("school")["math_score"].mean()
eleventhgrade = students_pd.loc[students_pd["grade"] == "11th"].groupby("school")["math_score"].mean()
tenthgrade = students_pd.loc[students_pd["grade"] == "10th"].groupby("school")["math_score"].mean()
ninthgrade = students_pd.loc[students_pd["grade"] == "9th"].groupby("school")["math_score"].mean()

math_scores = pd.DataFrame({
        "9th": ninthgrade,
        "10th": tenthgrade,
        "11th": eleventhgrade,
        "12th": twelfthgrade
})

math_scores = math_scores[["9th", "10th", "11th", "12th"]]
math_scores.index.name = " "
math_scores

Unnamed: 0,9th,10th,11th,12th
,,,,
Bailey High School,77.08,77.0,77.52,76.49
Cabrera High School,83.09,83.15,82.77,83.28
Figueroa High School,76.4,76.54,76.88,77.15
Ford High School,77.36,77.67,76.92,76.18
Griffin High School,82.04,84.23,83.84,83.36
Hernandez High School,77.44,77.34,77.14,77.19
Holden High School,83.79,83.43,85.0,82.86
Huang High School,77.03,75.91,76.45,77.23
Johnson High School,77.19,76.69,77.49,76.86


In [10]:
# reading scores by grade
twelfthgrade = students_pd.loc[students_pd["grade"] == "12th"].groupby("school")["reading_score"].mean()
eleventhgrade = students_pd.loc[students_pd["grade"] == "11th"].groupby("school")["reading_score"].mean()
tenthgrade = students_pd.loc[students_pd["grade"] == "10th"].groupby("school")["reading_score"].mean()
ninthgrade = students_pd.loc[students_pd["grade"] == "9th"].groupby("school")["reading_score"].mean()

reading_scores = pd.DataFrame({
        "9th": ninthgrade,
        "10th": tenthgrade,
        "11th": eleventhgrade,
        "12th": twelfthgrade
})

reading_scores = reading_scores[["9th", "10th", "11th", "12th"]]
reading_scores.index.name = " "
reading_scores

Unnamed: 0,9th,10th,11th,12th
,,,,
Bailey High School,81.3,80.91,80.95,80.91
Cabrera High School,83.68,84.25,83.79,84.29
Figueroa High School,81.2,81.41,80.64,81.38
Ford High School,80.63,81.26,80.4,80.66
Griffin High School,83.37,83.71,84.29,84.01
Hernandez High School,80.87,80.66,81.4,80.86
Holden High School,83.68,83.32,83.82,84.7
Huang High School,81.29,81.51,81.42,80.31
Johnson High School,81.26,80.77,80.62,81.23


In [11]:
# Scores By School Spending
# create four bins for data
bins = [0, 580, 605, 630, 655]

# create names for four bins
group_names = ["Less than $580", "Medium $580-605", "Average $605-630", "Above Average $630-655"]


pd.cut(school_summary["Per Student Budget"], bins, labels=group_names)

school_summary["Spending Budget (Per Student)"] = pd.cut(school_summary["Per Student Budget"], bins, labels=group_names)
school_summary

spending_group = school_summary.groupby("Spending Budget (Per Student)")["Average Math Score","Average Reading Score","% Passing Math","% Passing Reading","Overall Passing Rate"]
spending_group.max()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Spending Budget (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Less than $580,83.27,83.99,100.0,100.0,100.0
Medium $580-605,83.8,83.98,100.0,100.0,100.0
Average $605-630,83.84,84.04,100.0,100.0,100.0
Above Average $630-655,83.42,83.85,100.0,100.0,100.0


In [12]:
# Scores by School Size
bins = [0, 500, 2750, 5000]

group_names = ["Small (<500)", "Medium (500-2750)", "Large(2750-5000)"]

pd.cut(school_summary["Total Students"], bins, labels=group_names)

school_summary["School Size"] = pd.cut(school_summary["Total Students"], bins, labels=group_names)
school_summary

size_group = school_summary.groupby("School Size")["Average Math Score","Average Reading Score","% Passing Math","% Passing Reading","Overall Passing Rate"]
size_group.max()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<500),83.8,83.81,100.0,100.0,100.0
Medium (500-2750),83.84,84.04,100.0,100.0,100.0
Large(2750-5000),77.29,81.18,77.97,94.62,73.75


In [13]:
# Scores by School Type (District/Charter)
bins = ["Charter", "District"]

group_names = ["Charter", "District"]

size_group = school_summary.groupby("School Type")["Average Math Score","Average Reading Score","% Passing Math","% Passing Reading","Overall Passing Rate"]
size_group.max()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.84,84.04,100.0,100.0,100.0
District,77.29,81.18,78.2,94.62,73.75
