In [1]:
# Dependencies and Setup
import pandas as pd

In [2]:
# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

In [3]:
# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

In [4]:
#examine the data
# school_data.head()

In [5]:
#examine the data
# student_data.head()

In [28]:
# Combine the data into a single dataset.  
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
school_data_complete

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [7]:
# assign summary data to variables
schools_total = school_data_complete["School ID"].nunique()
student_total = school_data_complete["Student ID"].nunique()
total_budget = school_data_complete["budget"].unique().sum()
average_math = school_data_complete["math_score"].mean(axis=0)
average_reading = school_data_complete["reading_score"].mean(axis=0)
passing_math_count = school_data_complete.math_score[(school_data_complete.math_score  >= 70)].count()
passing_math_percent = passing_math_count / student_total * 100
passing_reading_count = school_data_complete.reading_score[(school_data_complete.reading_score >= 70)].count()
passing_reading_percent = passing_reading_count / student_total * 100 
passing_both_count = len(school_data_complete[(school_data_complete['math_score'] >= 70) & (school_data_complete['reading_score'] >= 70) ])
passing_both_percent = (passing_both_count / student_total) * 100


In [8]:
# create summary dataframe
combined_df = pd.DataFrame({"Total Schools": [schools_total],
                            "Total Students": [student_total],
                            "Total Budget":   [total_budget], # is this value correct?
                            "Average Math Score": [f'{round(average_math,1)}%'],
                            "Average Reading Score": [f'{round(average_reading, 1)}%'],
                            "Students Passing Math": [f'{round(passing_math_percent, 1)}%'],
                            "Students Passing Reading": [f'{round(passing_reading_percent, 1)}%'],
                            "Overall Passing": [f'{round(passing_both_percent, 1)}%']
                           
                           })
# formatting
combined_df['Total Budget'] = combined_df['Total Budget'].astype(int).apply(lambda x: "${:,}".format(x))
combined_df['Total Students'] = combined_df['Total Students'].astype(int).apply(lambda x: "{:,}".format(x))

# show df
combined_df


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Students Passing Math,Students Passing Reading,Overall Passing
0,15,39170,"$24,649,428",79.0%,81.9%,75.0%,85.8%,65.2%


In [34]:
# Group by schools
groupby_school = school_data_complete.groupby(['school_name'])
print(groupby_school.head())

# Create series for df
school_type = groupby_school['type'].unique()
size = groupby_school['size'].unique()
budget_per_school = groupby_school['budget'].mean()


       Student ID       student_name gender grade         school_name  \
0               0       Paul Bradley      M   9th   Huang High School   
1               1       Victor Smith      M  12th   Huang High School   
2               2    Kevin Rodriguez      M  12th   Huang High School   
3               3  Dr. Richard Scott      M  12th   Huang High School   
4               4         Bonnie Ray      F   9th   Huang High School   
...           ...                ...    ...   ...                 ...   
37535       37535         Norma Mata      F  10th  Thomas High School   
37536       37536        Cody Miller      M  11th  Thomas High School   
37537       37537        Erik Snyder      M   9th  Thomas High School   
37538       37538     Tanya Martinez      F   9th  Thomas High School   
37539       37539      Noah Erickson      M   9th  Thomas High School   

       reading_score  math_score  School ID      type  size   budget  
0                 66          79          0  Distric

school_name
Bailey High School       3124928
Cabrera High School      1081356
Figueroa High School     1884411
Ford High School         1763916
Griffin High School       917500
Hernandez High School    3022020
Holden High School        248087
Huang High School        1910635
Johnson High School      3094650
Pena High School          585858
Rodriguez High School    2547363
Shelton High School      1056600
Thomas High School       1043130
Wilson High School       1319574
Wright High School       1049400
Name: budget, dtype: int64

In [56]:
schools_summary_df = pd.concat([school_type, size, budget_per_school], axis = 1)
# schools_summary_df.head()
schools_summary_df['type'] = schools_summary_df['type'].str[0]
schools_summary_df['size'] = schools_summary_df['size'].astype(int)
schools_summary_df.head()

Unnamed: 0_level_0,type,size,budget
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,District,4976,3124928
Cabrera High School,Charter,1858,1081356
Figueroa High School,District,2949,1884411
Ford High School,District,2739,1763916
Griffin High School,Charter,1468,917500
