In [1]:
# Dependencies
import pandas as pd
import numpy as np

# Files to Load
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# create indiv dfs
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the school and student dfs into a single dataset
# using a left join to preserve school data, and leave out extraneous student data
schools_and_students_df = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

schools_and_students_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


## District Summary

* Create a dataframe to hold the following calculations:

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Clean up formatting

In [2]:
#number of schools
#listing out all the unique schools
list_of_schools = schools_and_students_df['school_name'].unique()
#using len to find the number of schools w/in the unique list
num_schools = len(list_of_schools)
num_schools

15

In [3]:
#number of students
#using length of student ID column
num_students = len(schools_and_students_df['Student ID'])
num_students

39170

In [4]:
#total budget
list_of_budgets = schools_and_students_df['budget'].unique()
#ensuring the unique values match the number of schools
print(len(list_of_budgets))
#finding the total budget amount
total_budget = sum(list_of_budgets)
total_budget

15


24649428

In [5]:
#avg mathematics score
avg_maths = schools_and_students_df['math_score'].mean()
avg_maths 

78.98537145774827

In [6]:
#avg reading
avg_reading = schools_and_students_df['reading_score'].mean()
avg_reading 

81.87784018381414

In [7]:
#% passing maths
# number of students passing/total students
#boolean mask to filter students >70
passing_maths_mask = schools_and_students_df['math_score'] >= 70
#creating a df with that mask
students_passing_maths_df = schools_and_students_df.loc[passing_maths_mask, :]
#finding the length of the df of students passing (=number of students passing)
students_passing_maths = len(schools_and_students_df.loc[passing_maths_mask, :])
#applying % formula
maths_percent_passing = students_passing_maths/num_students * 100
maths_percent_passing 

74.9808526933878

In [8]:
#% passing reading
# number of students passing/total students
#boolean mask to filter students >70
passing_reading_mask = schools_and_students_df['reading_score'] >= 70
#creating a df with that mask
students_passing_reading_df = schools_and_students_df.loc[passing_reading_mask, :]
#finding the length of the df of students passing (=number of students passing)
students_passing_reading = len(schools_and_students_df.loc[passing_reading_mask, :])
#applying % formula
reading_percent_passing = students_passing_reading/num_students * 100
reading_percent_passing 

85.80546336482001

In [9]:
#% passing both
# number of students passing/total students
#boolean mask to filter students >70
passing_both_mask = (schools_and_students_df['math_score'] >= 70) & (schools_and_students_df['reading_score'] >= 70)
#creating a df with that mask
students_passing_both_df = schools_and_students_df.loc[passing_both_mask, :]
#finding the length of the df of students passing (=number of students passing)
students_passing_both = len(schools_and_students_df.loc[passing_both_mask, :])
#applying % formula
overall_percent_passing = students_passing_both/num_students * 100
overall_percent_passing 

65.17232575950983

In [10]:
#district summary
#creating a dictionary of lists to hold the columns
summary_dict = {'total_schools': [num_schools], 'total_students': [num_students], 'total_budget': [total_budget], 'average_math_score': [avg_maths], 'average_reading_score': [avg_reading], 'percent_passing_math': [maths_percent_passing], 'percent_passing_reading': [reading_percent_passing], 'overall_percent_passing': [overall_percent_passing]}
#creating the df
district_summary_df = pd.DataFrame(summary_dict)

In [11]:
#creating a copy df for formatting
district_summary_df_F = district_summary_df.copy()
#formatting the columns
#integers needing commas
district_summary_df_F.iloc[:, 0:2] = district_summary_df_F.iloc[:, 0:2].applymap('{:,}'.format)
#currency requiring decimal places and currency sign
district_summary_df_F.iloc[:, 2:3] = district_summary_df_F.iloc[:, 2:3].applymap('${:,.2f}'.format)
#decimals needing rounding
district_summary_df_F.iloc[:, 3:5] = district_summary_df_F.iloc[:, 3:5].applymap('{:.2f}'.format)
#decimals needing percent signs and rounding
district_summary_df_F.iloc[:, 5:8] = district_summary_df_F.iloc[:, 5:8].applymap('{:.2f}%'.format)
district_summary_df_F

Unnamed: 0,total_schools,total_students,total_budget,average_math_score,average_reading_score,percent_passing_math,percent_passing_reading,overall_percent_passing
0,15,39170,"$24,649,428.00",78.99,81.88,74.98%,85.81%,65.17%


## School Summary

* Create an overview df that includes:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)

In [12]:
#setting lists and a function for appending to list
name_list = []
type_list = []
total_students_list = []
total_budget_list = []
per_student_budget_list = []
avg_maths_list = []
avg_reading_list = []
perc_maths_list = []
perc_reading_list = []
perc_overall_list = []

def append_to_list(my_var, my_var_list):
    for x in range(0, len(my_var)):
        my_var_list.append(my_var[x])


In [13]:
#creating a groupby to collect info about the school
#grouped by object
grouped_schools_df = schools_and_students_df.groupby('school_name')
#collecting school name
school_name = grouped_schools_df['school_name'].unique()
append_to_list(school_name, name_list)
#school type
school_type = grouped_schools_df['type'].unique()
append_to_list(school_type, type_list)
#total students
total_students = grouped_schools_df['Student ID'].count()
append_to_list(total_students, total_students_list)
#total budget
total_budget = grouped_schools_df['budget'].unique()
#converting total budget to a number
total_budget = total_budget.astype('float64')
append_to_list(total_budget, total_budget_list)
#budget per student
budget_per_student = total_budget/total_students
append_to_list(budget_per_student, per_student_budget_list)
#finding avg maths scores - sum of scores over total students
maths_scores_sum = grouped_schools_df['math_score'].sum()
avg_maths_score = maths_scores_sum/total_students
append_to_list(avg_maths_score, avg_maths_list)
#finding avg reading scores - sum of scores over total students
reading_scores_sum = grouped_schools_df['reading_score'].sum()
avg_reading_score = reading_scores_sum/total_students
append_to_list(avg_reading_score, avg_reading_list)

In [14]:
#new groupby for maths w/ mask
grouped_maths = students_passing_maths_df.groupby('school_name')
#applying .count() to the groupby object to get the number of students passing
num_passing_maths = grouped_maths['math_score'].count()
#perc for maths - dividing by total students since all students have a maths score
perc_maths = num_passing_maths/total_students * 100
#setting number as decimal to allow for continued calcuations
perc_maths = perc_maths.astype('float64')
append_to_list(perc_maths, perc_maths_list)

In [15]:
#new groupby for reading w/ mask
grouped_reading = students_passing_reading_df.groupby('school_name')
#applying .count() to the groupby object to get the number of students passing
num_passing_reading = grouped_reading['reading_score'].count()
#perc for reading - dividing by total students since all students have a reading score
perc_reading = num_passing_reading/total_students * 100
#setting number as decimal to allow for continued calcuations
perc_reading = perc_reading.astype('float64')
append_to_list(perc_reading, perc_reading_list)

In [16]:
#new grouby for overall w/ mask
grouped_overall = students_passing_both_df.groupby('school_name')
#counting either number of reading scores or maths scores would work, as the columns are the same length
#applying .count() to the groupby object to get the number of students passing
num_passing_overall = grouped_overall['math_score'].count()
#perc for reading - dividing by total students since all students have scores for both
perc_passing_overall = num_passing_overall/total_students * 100
#setting number as decimal to allow for continued calcuations
perc_passing_overall = perc_passing_overall.astype('float64')
append_to_list(perc_passing_overall, perc_overall_list)

In [17]:
#creating a dictionary to hold the school values
schools_dict = {'schoolName': name_list, 'schoolType': type_list, 'totalStudents': total_students_list, 'totalBudget': total_budget_list, 'perStudentBudget': per_student_budget_list, 'averageMathScore': avg_maths_list, 'averageReadingScore': avg_reading_list, 'percentPassingMath': perc_maths_list, 'percentPassingReading': perc_reading_list, 'percentPassingOverall': perc_overall_list} 
schools_dict_df = pd.DataFrame(schools_dict)

#removing the brackets from the string columns
schools_dict_df['schoolName'] = schools_dict_df['schoolName'].str[0]
schools_dict_df['schoolType'] = schools_dict_df['schoolType'].str[0]

#setting index as schoolName
schools_dict_df = schools_dict_df.set_index('schoolName')

In [18]:
#formatting the schools df
#creating a new DF
schools_dict_df_F = schools_dict_df.copy()

#formatting the columns
#integers needing commas (total students)
schools_dict_df_F.iloc[:, 1] = schools_dict_df.iloc[: , 1].apply('{:,}'.format)
#currency needing commas, decimals and dollar sign
schools_dict_df_F.iloc[:, 2:4] = schools_dict_df.iloc[: , 2:4].applymap('${:,.2f}'.format)

schools_dict_df_F

Unnamed: 0_level_0,schoolType,totalStudents,totalBudget,perStudentBudget,averageMathScore,averageReadingScore,percentPassingMath,percentPassingReading,percentPassingOverall
schoolName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.680064,81.93328,54.642283
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,91.334769
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,53.204476
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.309602,79.299014,54.289887
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,90.599455
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.752967,80.862999,53.527508
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,92.505855,96.252927,89.227166
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922,81.316421,53.513884
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.057551,81.222432,53.539172
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595,95.945946,90.540541


## Top Performing Schools (By % Overall Passing)

* Sort and display the top five performing schools by % overall passing.

In [19]:
#applying .sort_values() on percentPassing, ascending=False so top schools appear at the top
#resetting index so it can be grabbed in next line to show top 5 schools
top_schools = schools_dict_df.sort_values('percentPassingOverall', ascending=False).reset_index(drop=False)
#choosing top 5 values (index 0 to 4) to get the top 5 schools
top_schools = top_schools.iloc[0:5, :]
#setting index as school name for organisation
top_schools = top_schools.set_index('schoolName')
top_schools

Unnamed: 0_level_0,schoolType,totalStudents,totalBudget,perStudentBudget,averageMathScore,averageReadingScore,percentPassingMath,percentPassingReading,percentPassingOverall
schoolName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
Thomas High School,Charter,1635,1043130.0,638.0,83.418349,83.84893,93.272171,97.308869,90.948012
Griffin High School,Charter,1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,90.599455
Wilson High School,Charter,2283,1319574.0,578.0,83.274201,83.989488,93.867718,96.539641,90.582567
Pena High School,Charter,962,585858.0,609.0,83.839917,84.044699,94.594595,95.945946,90.540541


## Bottom Performing Schools (By % Overall Passing)

* Sort and display the five worst-performing schools by % overall passing.

In [20]:
#same sort_values funct as above, but ascending=True so the bottom schools appear at the top
bottom_schools = schools_dict_df.sort_values('percentPassingOverall', ascending=True).reset_index(drop=False)
bottom_schools = bottom_schools.iloc[0:5, :]
bottom_schools = bottom_schools.set_index('schoolName')
bottom_schools

Unnamed: 0_level_0,schoolType,totalStudents,totalBudget,perStudentBudget,averageMathScore,averageReadingScore,percentPassingMath,percentPassingReading,percentPassingOverall
schoolName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,2547363.0,637.0,76.842711,80.744686,66.366592,80.220055,52.988247
Figueroa High School,District,2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,53.204476
Huang High School,District,2917,1910635.0,655.0,76.629414,81.182722,65.683922,81.316421,53.513884
Hernandez High School,District,4635,3022020.0,652.0,77.289752,80.934412,66.752967,80.862999,53.527508
Johnson High School,District,4761,3094650.0,650.0,77.072464,80.966394,66.057551,81.222432,53.539172


### Ranked Schools
* Looking into schools in-between the top and bottom 5
* Looking into budget in schools to see if there is overlap between budget and high scores

In [21]:
ranked_schools = schools_dict_df.sort_values('percentPassingOverall', ascending=False).reset_index(drop=False)
ranked_schools

Unnamed: 0,schoolName,schoolType,totalStudents,totalBudget,perStudentBudget,averageMathScore,averageReadingScore,percentPassingMath,percentPassingReading,percentPassingOverall
0,Cabrera High School,Charter,1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
1,Thomas High School,Charter,1635,1043130.0,638.0,83.418349,83.84893,93.272171,97.308869,90.948012
2,Griffin High School,Charter,1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,90.599455
3,Wilson High School,Charter,2283,1319574.0,578.0,83.274201,83.989488,93.867718,96.539641,90.582567
4,Pena High School,Charter,962,585858.0,609.0,83.839917,84.044699,94.594595,95.945946,90.540541
5,Wright High School,Charter,1800,1049400.0,583.0,83.682222,83.955,93.333333,96.611111,90.333333
6,Shelton High School,Charter,1761,1056600.0,600.0,83.359455,83.725724,93.867121,95.854628,89.892107
7,Holden High School,Charter,427,248087.0,581.0,83.803279,83.814988,92.505855,96.252927,89.227166
8,Bailey High School,District,4976,3124928.0,628.0,77.048432,81.033963,66.680064,81.93328,54.642283
9,Ford High School,District,2739,1763916.0,644.0,77.102592,80.746258,68.309602,79.299014,54.289887


In [22]:
ranked_budget_schools = schools_dict_df.sort_values('totalBudget', ascending=False).reset_index(drop=False)
ranked_budget_schools

Unnamed: 0,schoolName,schoolType,totalStudents,totalBudget,perStudentBudget,averageMathScore,averageReadingScore,percentPassingMath,percentPassingReading,percentPassingOverall
0,Bailey High School,District,4976,3124928.0,628.0,77.048432,81.033963,66.680064,81.93328,54.642283
1,Johnson High School,District,4761,3094650.0,650.0,77.072464,80.966394,66.057551,81.222432,53.539172
2,Hernandez High School,District,4635,3022020.0,652.0,77.289752,80.934412,66.752967,80.862999,53.527508
3,Rodriguez High School,District,3999,2547363.0,637.0,76.842711,80.744686,66.366592,80.220055,52.988247
4,Huang High School,District,2917,1910635.0,655.0,76.629414,81.182722,65.683922,81.316421,53.513884
5,Figueroa High School,District,2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,53.204476
6,Ford High School,District,2739,1763916.0,644.0,77.102592,80.746258,68.309602,79.299014,54.289887
7,Wilson High School,Charter,2283,1319574.0,578.0,83.274201,83.989488,93.867718,96.539641,90.582567
8,Cabrera High School,Charter,1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
9,Shelton High School,Charter,1761,1056600.0,600.0,83.359455,83.725724,93.867121,95.854628,89.892107


## Math Scores by Grade

* Create df (clean up formatting) listing average Math Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

In [23]:
#lists to hold the values for each grade and school
mscores9 = []
mschools9 = []
mscores10 = []
mschools10 = []
mscores11 = []
mschools11 = []
mscores12 = []
mschools12 = []

#loop to go through the original df to extract the scores by grade
def loop_collector(scores, schools, grade):
    for x in range(0, (len(schools_and_students_df))):
        if schools_and_students_df.iloc[x, 3] == grade:
            scores.append(schools_and_students_df.iloc[x, 6])
            schools.append(schools_and_students_df.iloc[x, 4])

#calling the function for each of the years
#creating a series for each of the years, where the index is the schools
loop_collector(mscores9, mschools9, '9th')
series9 = pd.Series(np.array(mscores9), index=mschools9, dtype='float64')

loop_collector(mscores10, mschools10, '10th')
series10 = pd.Series(np.array(mscores10), index=mschools10, dtype='float64')

loop_collector(mscores11, mschools11, '11th')
series11 = pd.Series(np.array(mscores11), index=mschools11, dtype='float64')

loop_collector(mscores12, mschools12, '12th')
series12 = pd.Series(np.array(mscores12), index=mschools12, dtype='float64')


#grouping the series by schools, finding the avg for each school
grouped9maths = series9.groupby(mschools9).mean()
grouped10maths = series10.groupby(mschools10).mean()
grouped11maths = series11.groupby(mschools11).mean()
grouped12maths = series12.groupby(mschools12).mean()

#creating a DF
maths_scores_df = pd.DataFrame({'9th': grouped9maths, '10th': grouped10maths, '11th': grouped11maths, '12th': grouped12maths})
maths_scores_df

Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248
Pena High School,83.625455,83.372,84.328125,84.121547


## Reading Score by Grade 

* Perform the same operations as above for reading scores

In [24]:
#lists to hold the values for each grade and school
rscores9 = []
rschools9 = []
rscores10 = []
rschools10 = []
rscores11 = []
rschools11 = []
rscores12 = []
rschools12 = []

#loop to go through the original df to extract the scores by grade
def loop_collector(scores, schools, grade):
    for x in range(0, (len(schools_and_students_df))):
        if schools_and_students_df.iloc[x, 3] == grade:
            scores.append(schools_and_students_df.iloc[x, 5])
            schools.append(schools_and_students_df.iloc[x, 4])

#calling the function for each of the years
#creating a series for each of the years, where the index is the schools
loop_collector(rscores9, rschools9, '9th')
series9 = pd.Series(np.array(rscores9), index=rschools9, dtype='float64')

loop_collector(rscores10, rschools10, '10th')
series10 = pd.Series(np.array(rscores10), index=rschools10, dtype='float64')

loop_collector(rscores11, rschools11, '11th')
series11 = pd.Series(np.array(rscores11), index=rschools11, dtype='float64')

loop_collector(rscores12, rschools12, '12th')
series12 = pd.Series(np.array(rscores12), index=rschools12, dtype='float64')

#grouping the series by schools, finding the avg for each school
grouped9reading = series9.groupby(rschools9).mean()
grouped10reading = series10.groupby(rschools10).mean()
grouped11reading = series11.groupby(rschools11).mean()
grouped12reading = series12.groupby(rschools12).mean()

#creating a DF
reading_scores_df = pd.DataFrame({'9th': grouped9reading, '10th': grouped10reading, '11th': grouped11reading, '12th': grouped12reading})
reading_scores_df

Unnamed: 0,9th,10th,11th,12th
Bailey High School,81.303155,80.907183,80.945643,80.912451
Cabrera High School,83.676136,84.253219,83.788382,84.287958
Figueroa High School,81.198598,81.408912,80.640339,81.384863
Ford High School,80.632653,81.262712,80.403642,80.662338
Griffin High School,83.369193,83.706897,84.288089,84.013699
Hernandez High School,80.86686,80.660147,81.39614,80.857143
Holden High School,83.677165,83.324561,83.815534,84.698795
Huang High School,81.290284,81.512386,81.417476,80.305983
Johnson High School,81.260714,80.773431,80.616027,81.227564
Pena High School,83.807273,83.612,84.335938,84.59116


## Scores by School Spending

* Create df breaking down school performances based on average spending ranges (per student). Include:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [25]:
#using .describe to come up with sensible bins, linked to quartiles
schools_dict_df['perStudentBudget'].describe()
bins = [0,591.5,628, 641.5, 656]
amts = ['<$592', '$593-629', '$630-642', '>$642']

spending_amt = []
maths_scores = []
rdng_scores = []
perc_maths = []
perc_rdng = []
perc_overall = []

#function for collecting data on the schools, based on the spending range, taken from school_dict_df
def loop_collector(cut_number1, cut_number2=657):  
    for x in range(0, len(schools_dict_df)):
        if (schools_dict_df.iloc[x, 3] > cut_number1) & (schools_dict_df.iloc[x, 3] <= cut_number2):
            spending_amt.append(schools_dict_df.iloc[x, 3])
            maths_scores.append(schools_dict_df.iloc[x, 4])
            rdng_scores.append(schools_dict_df.iloc[x, 5])
            maths_perc = schools_dict_df.iloc[x, 6]
            reading_perc = schools_dict_df.iloc[x, 7]
            overall_perc = schools_dict_df.iloc[x, 7]
            perc_maths.append(schools_dict_df.iloc[x, 6])
            perc_rdng.append(schools_dict_df.iloc[x, 7])
            perc_overall.append(overall_perc)
    
#calling the function for the different ranges            
loop_collector(0, 591.5)
loop_collector(591.5, 628)
loop_collector(628, 641.5)
loop_collector(641.5)

#creating a series (which will form columns) for each category of value
series_spending_list = pd.Series(np.array(spending_amt), dtype='float64')
series_maths_avgs = pd.Series(np.array(maths_scores), dtype='float64')
series_reading_avgs = pd.Series(np.array(rdng_scores), dtype='float64')
series_maths_percs = pd.Series(np.array(perc_maths), dtype='float64')
series_reading_percs = pd.Series(np.array(perc_rdng), dtype='float64')
series_overall_percs = pd.Series(np.array(perc_overall), dtype='float64')

#creating a df of all the different series
spending_ranges_df = pd.DataFrame({'spendingRanges(perStudent)': series_spending_list, 'averageMathScore': series_maths_avgs, 'averageReadingScore': series_reading_avgs, 'percentPassingMath': series_maths_percs, 'percentPassingReading': series_reading_percs, 'percentPassingOverall': series_overall_percs})
#seperating the df into bins
spending_ranges_df['spendingRanges(perStudent)'] = pd.cut(spending_ranges_df['spendingRanges(perStudent)'], bins, labels = amts)
#grouping by the spending ranges and finding the avg of each column
spending_ranges_df.groupby('spendingRanges(perStudent)').mean()

Unnamed: 0_level_0,averageMathScore,averageReadingScore,percentPassingMath,percentPassingReading,percentPassingOverall
spendingRanges(perStudent),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$592,83.455399,83.933814,93.460096,96.610877,96.610877
$593-629,81.899826,83.155286,87.133538,92.718205,92.718205
$630-642,78.990942,81.917212,75.209078,86.089386,86.089386
>$642,77.023555,80.957446,66.70101,80.675217,80.675217


## Scores by School Size

* Perform the same operations as above, based on school size.

In [26]:
#creating bins and sizes
bins = [426, 1000, 2000, 5000]
sizes = ['small(<1000)', 'medium(1000-2000)', 'large(>2000)']

school_size = []
maths_scores = []
rdng_scores = []
perc_maths = []
perc_rdng = []
perc_overall = []

#function for collecting data on the schools, based on the size range, taken from school_dict_df
def loop_collector(cut_number1, cut_number2):
    
    for x in range(0, len(schools_dict_df)):
        if (schools_dict_df.iloc[x, 1] > cut_number1) & (schools_dict_df.iloc[x, 1] <= cut_number2):
            school_size.append(schools_dict_df.iloc[x, 1])
            maths_scores.append(schools_dict_df.iloc[x, 4])
            rdng_scores.append(schools_dict_df.iloc[x, 5])
            maths_perc = schools_dict_df.iloc[x, 6]
            reading_perc = schools_dict_df.iloc[x, 7]
            overall_perc = schools_dict_df.iloc[x, 8]
            perc_maths.append(schools_dict_df.iloc[x, 6])
            perc_rdng.append(schools_dict_df.iloc[x, 7])
            perc_overall.append(overall_perc)
    
            
#calling the function for the different ranges 
loop_collector(426, 1000)
loop_collector(1000, 2000)
loop_collector(2000, 5000)

#creating a series (which will form columns) for each category of value
series_school_size_list = pd.Series(np.array(school_size), dtype='float64')
series_maths_avgs = pd.Series(np.array(maths_scores), dtype='float64')
series_reading_avgs = pd.Series(np.array(rdng_scores), dtype='float64')
series_maths_percs = pd.Series(np.array(perc_maths), dtype='float64')
series_reading_percs = pd.Series(np.array(perc_rdng), dtype='float64')
series_overall_percs = pd.Series(np.array(perc_overall), dtype='float64')

#creating a df of all the different series
spending_ranges_df = pd.DataFrame({'schoolSize': series_school_size_list, 'averageMathScore': series_maths_avgs, 'averageReadingScore': series_reading_avgs, 'percentPassingMath': series_maths_percs, 'percentPassingReading': series_reading_percs, 'percentPassingOverall': series_overall_percs})
#cutting the df into bins
spending_ranges_df['schoolSize'] = pd.cut(spending_ranges_df['schoolSize'], bins, labels = sizes)
#creating a groupby for school size and finding the avg of all the schools 
spending_ranges_df.groupby('schoolSize').mean()

Unnamed: 0_level_0,averageMathScore,averageReadingScore,percentPassingMath,percentPassingReading,percentPassingOverall
schoolSize,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
small(<1000),83.821598,83.929843,93.550225,96.099437,89.883853
medium(1000-2000),83.374684,83.864438,93.599695,96.79068,90.621535
large(>2000),77.746417,81.344493,69.963361,82.766634,58.286003


## Scores by School Type

* Perform the same operations as above, based on school type

In [27]:
#lists for the different variables
school_type = []
maths_scores = []
rdng_scores = []
perc_maths = []
perc_rdng = []
perc_overall = []

#for loop to append to lists, from school_dict_df
for x in range(0, len(schools_dict_df)):
    school_type.append(schools_dict_df.iloc[x, 0])
    maths_scores.append(schools_dict_df.iloc[x, 4])
    rdng_scores.append(schools_dict_df.iloc[x, 5])
    maths_perc = schools_dict_df.iloc[x, 6]
    reading_perc = schools_dict_df.iloc[x, 7]
    overall_perc = schools_dict_df.iloc[x, 8]
    perc_maths.append(schools_dict_df.iloc[x, 6])
    perc_rdng.append(schools_dict_df.iloc[x, 7])
    perc_overall.append(overall_perc)
    
#series to form the columns from the above lists
series_school_type_list = pd.Series(np.array(school_type), dtype='object')
series_maths_avgs = pd.Series(np.array(maths_scores), dtype='float64')
series_reading_avgs = pd.Series(np.array(rdng_scores), dtype='float64')
series_maths_percs = pd.Series(np.array(perc_maths), dtype='float64')
series_reading_percs = pd.Series(np.array(perc_rdng), dtype='float64')
series_overall_percs = pd.Series(np.array(perc_overall), dtype='float64')

#creating a df from the series
school_types_df = pd.DataFrame({'schoolType': series_school_type_list, 'averageMathScore': series_maths_avgs, 'averageReadingScore': series_reading_avgs, 'percentPassingMath': series_maths_percs, 'percentPassingReading': series_reading_percs, 'percentPassingOverall': series_overall_percs})
#creating a groupby object, finding the avg for each school type
grouped_school_types_df = school_types_df.groupby('schoolType')
grouped_school_types_df.mean()

Unnamed: 0_level_0,averageMathScore,averageReadingScore,percentPassingMath,percentPassingReading,percentPassingOverall
schoolType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,93.62083,96.586489,90.432244
District,76.956733,80.966636,66.548453,80.799062,53.672208
