In [1]:
# ********** GENERAL SET UP ****************************************************

# Import libraries
import pandas as pd
import numpy as ny


In [2]:
# Relative locations of the raw CSV inputs
students_csv = "raw_data/students_complete.csv"  # one row per student
schools_csv = "raw_data/schools_complete.csv"    # one row per school

In [3]:
# Part 1: load the schools CSV into a DataFrame and preview its first rows
schools_df = pd.read_csv(filepath_or_buffer=schools_csv)
schools_df.head(n=5)

Unnamed: 0,School ID,name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [4]:
# Part 2: load the students CSV into a DataFrame and preview its first rows
students_df = pd.read_csv(filepath_or_buffer=students_csv)
students_df.head(n=5)

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [5]:
# ********** SET COMMON COLUMN AND MERGE FILES *********************************

In [6]:
# Relabel the "name" column of schools_df as "school" so that both DataFrames
# share a "school" column to merge on
schools_df = schools_df.rename({"name": "school"}, axis="columns")
schools_df.head(n=5)

Unnamed: 0,School ID,school,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [7]:
# Join the school columns onto every student row, matching on the shared
# "school" column, to get one combined DataFrame
combined_df = schools_df.merge(students_df, on="school")
combined_df.head(n=5)


Unnamed: 0,School ID,school,type,size,budget,Student ID,name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84


In [8]:
# ********** GET GENERAL STATS FROM DATA ***************************************
# Number of schools = count of non-null "School ID" values
total_schools = schools_df.loc[:, "School ID"].count()
total_schools

15

In [9]:
# Number of students = count of non-null "Student ID" values
total_students = students_df.loc[:, "Student ID"].count()
total_students

39170

In [10]:
# Combined budget across all schools
total_budget = schools_df.loc[:, "budget"].sum()
total_budget

24649428

In [11]:
# Mean math score over all students
ave_math_score = students_df.loc[:, "math_score"].mean()
ave_math_score


78.98537145774827

In [12]:
# Mean reading score over all students
ave_reading_score = students_df.loc[:, "reading_score"].mean()
ave_reading_score

81.87784018381414

In [13]:
# ********** CREATE BINS AND GROUPS SO CAN CALC PERCENTAGES********************
# Start with "% passing math" first____________________________________________
# Create bins
# pd.cut uses right-closed intervals by default, i.e. (low, high]. The lowest
# edge is therefore -1 rather than 0, so that a score of exactly 0 lands in "F"
# instead of falling outside every bin and becoming NaN (which would silently
# drop that student from the group counts).
# For scores in 0-100: F = 0-60, D = 61-70, C = 71-80, B = 81-90, A = 91-100.
# NOTE(review): the upper edge of each bin is inclusive, so 60 is an "F" and 70
# is a "D" -- confirm this matches the intended grading scale.
bins = [-1,60,70,80,90,100]
score_range =["F","D","C","B","A"]
# Preview the letter grade assigned to the first few math scores
pd.cut(x=combined_df["math_score"], bins=bins, labels=score_range).head(n=5)
                   

0    C
1    D
2    F
3    F
4    B
Name: math_score, dtype: category
Categories (5, object): [F < D < C < B < A]

In [14]:
# Store each student's math letter grade as a new "Math Grade Group" column
combined_df["Math Grade Group"] = pd.cut(
    x=combined_df["math_score"],
    bins=bins,
    labels=score_range,
)
combined_df.head(n=5)


Unnamed: 0,School ID,school,type,size,budget,Student ID,name,gender,grade,reading_score,math_score,Math Grade Group
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79,C
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61,D
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60,F
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58,F
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84,B


In [15]:
# Group every student row by its "Math Grade Group" label
math_grade_group = combined_df.groupby(by="Math Grade Group")

# Number of students that fall into each grade group
grade_group_counts = math_grade_group["Math Grade Group"].count()
print(grade_group_counts)

Math Grade Group
F    3562
D    7252
C    9751
B    9771
A    8834
Name: Math Grade Group, dtype: int64


In [16]:
###################################################################################################
# Math grade-group counts per school.
# Grouping by BOTH "school" and "Math Grade Group" in a single groupby yields
# the counts for every school at once, so the filter-then-group steps used for
# one school below do not have to be repeated school by school.
# Result: one row per school, one column per grade group (0 where a school has
# no students in a group).
school_math_grade_counts = (
    combined_df.groupby(["school", "Math Grade Group"])
    .size()
    .unstack(fill_value=0)
)

# Single-school version of the same calculation: keep only the rows for
# Bailey High School, then count its students in each grade group.
Bailey_filter = combined_df.loc[combined_df["school"] == "Bailey High School"]
print(Bailey_filter.count())

Bailey_math_grade_group = Bailey_filter.groupby("Math Grade Group")
print(Bailey_math_grade_group["Math Grade Group"].count())



School ID           4976
school              4976
type                4976
size                4976
budget              4976
Student ID          4976
name                4976
gender              4976
grade               4976
reading_score       4976
math_score          4976
Math Grade Group    4976
dtype: int64
Math Grade Group
F     625
D    1135
C    1093
B    1126
A     997
Name: Math Grade Group, dtype: int64


In [17]:
# Average of each numeric column within every math grade group.
# numeric_only=True limits the mean to numeric columns; without it, pandas 2.x
# raises a TypeError on the text columns (school, type, name, gender, grade).
math_grade_group.mean(numeric_only=True)


Unnamed: 0_level_0,School ID,size,budget,Student ID,reading_score,math_score
Math Grade Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,6.953116,4059.609489,2609577.0,19511.761931,80.899495,57.517687
D,6.945119,3680.766271,2352352.0,19531.18064,81.664368,66.055985
C,6.974156,3134.985232,1983284.0,19573.217106,81.969131,75.475849
B,7.01965,3154.723467,1996162.0,19692.039607,82.134173,85.453178
A,6.973964,3170.097351,2007500.0,19551.107992,82.063278,94.975436
