In [107]:
#add the pandas dependency
import pandas as pd

In [108]:
# Relative paths of the two input CSV files that the loading cells read.
school_data_to_load, student_data_to_load = (
    "resources/schools_complete.csv",
    "resources/students_complete.csv",
)

In [109]:
# Load the school records from the configured CSV path into a DataFrame.
school_data_df = pd.read_csv(filepath_or_buffer=school_data_to_load)

In [110]:
# Load the student records from the configured CSV path into a DataFrame.
student_data_df = pd.read_csv(filepath_or_buffer=student_data_to_load)

In [111]:
# Substrings removed from student names during cleaning: honorific prefixes
# (each ends with a space) followed by professional suffixes (each starts with a space).
name_prefixes = ["Dr. ", "Mr. ", "Ms. ", "Mrs. ", "Miss "]
name_suffixes = [" MD", " DDS", " DVM", " PhD"]
prefixes_suffixes = name_prefixes + name_suffixes

In [112]:
# Remove every honorific prefix and professional suffix from the student names.
# regex=False treats each entry as a literal substring. Under regex matching the
# "." in "Dr. ", "Mr. ", "Ms. " and "Mrs. " is a wildcard that matches any
# character, and the default for `regex` differs between pandas versions, so it
# is set explicitly here.
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(
        word, "", regex=False
    )

  This is separate from the ipykernel package so we can avoid doing imports until


In [113]:
# Write the cleaned student records to a new CSV file. With to_csv's defaults
# the row index is written as well, as an unnamed first column.
clean_student_data_path = "resources/clean_student_data.csv"
student_data_df.to_csv(clean_student_data_path)

In [114]:
# Combine student and school records into one dataset, one row per student,
# by joining on the shared "school_name" column. pd.merge performs an inner
# join by default, so only rows whose school name appears in both frames are kept.
# The key is passed once; listing it twice in `on` was redundant.
school_data_complete_df = pd.merge(student_data_df, school_data_df, on="school_name")

In [115]:
# Total number of students: count of non-null "Student ID" values in the merged data.
student_ids = school_data_complete_df["Student ID"]
student_count = student_ids.count()
student_count

39170

In [116]:
# Total number of schools: size of the set of distinct school names in the merged data.
unique_school_names = set(school_data_complete_df["school_name"])
school_count = len(unique_school_names)
school_count

15

In [117]:
# Total budget across all schools, summed from the school table (one row per
# school is assumed here; the merged table would repeat each budget per student).
school_budgets = school_data_df["budget"]
total_budget = school_budgets.sum()
print(total_budget)

24649428


In [118]:
# Mean reading score over all students in the merged data.
reading_scores = school_data_complete_df["reading_score"]
average_reading_score = reading_scores.mean()
average_reading_score

81.87784018381414

In [119]:
# Mean math score over all students in the merged data.
math_scores = school_data_complete_df["math_score"]
average_math_score = math_scores.mean()
average_math_score

78.98537145774827

In [120]:
# Keep only the students whose math score is at least 70 (the passing mark),
# then count them through their non-null "student_name" values.
is_passing_math = school_data_complete_df["math_score"] >= 70
passing_math = school_data_complete_df[is_passing_math]
passing_math_count = passing_math["student_name"].count()
passing_math_count

29370

In [121]:
# Keep only the students whose reading score is at least 70 (the passing mark),
# then count them through their non-null "student_name" values.
is_passing_reading = school_data_complete_df["reading_score"] >= 70
passing_reading = school_data_complete_df[is_passing_reading]
passing_reading_count = passing_reading["student_name"].count()
passing_reading_count

33610

In [122]:
# Percentage of all students who passed math.
passing_math_fraction = passing_math_count / student_count
passing_math_percentage = passing_math_fraction * 100
# Percentage of all students who passed reading.
passing_reading_fraction = passing_reading_count / student_count
passing_reading_percentage = passing_reading_fraction * 100

In [123]:
# Students who passed both subjects: math score and reading score each at least 70.
# The count uses the non-null "student_name" values of the filtered rows.
passed_math_mask = school_data_complete_df["math_score"] >= 70
passed_reading_mask = school_data_complete_df["reading_score"] >= 70
passing_math_reading = school_data_complete_df[passed_math_mask & passed_reading_mask]
overall_passing_math_rading_count = passing_math_reading["student_name"].count()

In [124]:
# Overall passing percentage: students who passed both math and reading,
# as a share of all students.
overall_passing_fraction = overall_passing_math_rading_count / student_count
overall_passing_percentage = overall_passing_fraction * 100
overall_passing_percentage

65.17232575950983

In [125]:
# Collect the district-wide metrics into a single record, then wrap it in a
# one-row DataFrame. The key order here is the initial column order.
district_summary = {
    "Total Schools": school_count,
    "Total Students": student_count,
    "Total Budget": total_budget,
    "Average Reading Score": average_reading_score,
    "Average Math Score": average_math_score,
    "% Passing Reading": passing_reading_percentage,
    "% Passing Math": passing_math_percentage,
    "% Overall Passing": overall_passing_percentage,
}
district_summary_df = pd.DataFrame([district_summary])

In [126]:
# Display the one-row district summary with its raw (unformatted) numeric values.
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Reading Score,Average Math Score,% Passing Reading,% Passing Math,% Overall Passing
0,15,39170,24649428,81.87784,78.985371,85.805463,74.980853,65.172326


In [127]:
# Define a function that converts a count of students who passed math into a
# percentage of the total student count.

def passing_math_percent(pass_math_count, student_count):
    """Return the percentage of students who passed math.

    Args:
        pass_math_count: Number of students who passed math.
        student_count: Total number of students; converted to float before dividing.

    Returns:
        pass_math_count divided by student_count, multiplied by 100.

    Raises:
        ZeroDivisionError: If student_count is 0 (for plain Python numbers).
    """
    total_students = float(student_count)
    passing_fraction = pass_math_count / total_students
    return passing_fraction * 100

In [128]:
# Call the helper with the district-wide math passing count and total student count.
passing_math_percent(passing_math_count, student_count)

74.9808526933878

In [129]:
# Display format applied to each summary column: thousands separators for the
# student count, dollars with cents for the budget, one decimal for the average
# scores, and whole numbers for the percentages.
# After this cell the formatted columns hold strings rather than numbers, so it
# is meant to run once on a freshly built summary.
column_formats = {
    "Total Students": "{:,}",
    "Total Budget": "${:,.2f}",
    "Average Reading Score": "{:.1f}",
    "Average Math Score": "{:.1f}",
    "% Passing Reading": "{:.0f}",
    "% Passing Math": "{:.0f}",
    "% Overall Passing": "{:.0f}",
}
for column_name, format_spec in column_formats.items():
    district_summary_df[column_name] = district_summary_df[column_name].map(format_spec.format)

In [130]:
# Display the district summary after the display formatting has been applied.
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Reading Score,Average Math Score,% Passing Reading,% Passing Math,% Overall Passing
0,15,39170,"$24,649,428.00",81.9,79.0,86,75,65


In [131]:
# Desired column order for the district summary: within each pair of
# subject columns, math comes before reading.
new_column_order = [
    "Total Schools",
    "Total Students",
    "Total Budget",
    "Average Math Score",
    "Average Reading Score",
    "% Passing Math",
    "% Passing Reading",
    "% Overall Passing",
]

In [132]:
# Rebuild the summary with its columns arranged in the order listed in new_column_order.
district_summary_df = district_summary_df.loc[:, new_column_order]

In [133]:
# Display the district summary with the reordered columns.
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65
