# Import Data

In [None]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# Locations of the raw school and student CSV files
school_file = "../UTAMCB201904DATA3/04-Pandas/Homework/Instructions/PyCitySchools/Resources/schools_complete.csv"
student_file = "../UTAMCB201904DATA3/04-Pandas/Homework/Instructions/PyCitySchools/Resources/students_complete.csv"

# Load each CSV into its own DataFrame
school_data = pd.read_csv(school_file)
student_data = pd.read_csv(student_file)

# Attach the matching school's columns to every student row.
# A left join keeps every student row; the join key is the single shared
# "school_name" column (listing it twice adds nothing).
mydata = pd.merge(student_data, school_data, how="left", on="school_name")

# Per-student budget: the school's budget divided by the school's size
mydata["Per Student Budget"] = mydata["budget"] / mydata["size"]


# District Summary

In [None]:
# District-wide totals and average scores
TtlSchoolCnt = school_data["school_name"].count()
TtlStudentsCnt = student_data["student_name"].count()
TtlBudget = school_data["budget"].sum()
AvgMathScore = student_data["math_score"].mean()
AvgReadScore = student_data["reading_score"].mean()

# Percentage of students scoring 70 or higher in each subject
math_passers = student_data.loc[student_data["math_score"] >= 70, "math_score"]
reading_passers = student_data.loc[student_data["reading_score"] >= 70, "reading_score"]
PcntPassMath = (math_passers.count() / TtlStudentsCnt) * 100
PcntPassReading = (reading_passers.count() / TtlStudentsCnt) * 100

# Overall rate is the plain average of the two subject pass percentages
OverallPassRate = (PcntPassMath + PcntPassReading) / 2

# Collect everything into a one-row results table
district_row = {
    "Total Schools": [TtlSchoolCnt],
    "Total Students": [TtlStudentsCnt],
    "Total Budget": [TtlBudget],
    "Average Math Score": [AvgMathScore],
    "Average Reading Score": [AvgReadScore],
    "% Passing Math": [PcntPassMath],
    "% Passing Reading": [PcntPassReading],
    "% Overall Passing Rate": [OverallPassRate],
}
TtlResults = pd.DataFrame(district_row)

# Display formatting: thousands separators for the count, currency for the budget
as_thousands = "{:,}".format
as_dollars = "${:,.2f}".format
TtlResults["Total Students"] = TtlResults["Total Students"].map(as_thousands)
TtlResults["Total Budget"] = TtlResults["Total Budget"].map(as_dollars)
TtlResults

# School Summary

In [None]:
# Build the per-school summary table
# [Dataframe 1] One row per school: student count, budget figures and mean scores
school_aggregations = {
    'Student ID': 'count',
    'budget': 'first',
    'reading_score': 'mean',
    'math_score': 'mean',
    'Per Student Budget': 'first',
}
schSummary = mydata.groupby('school_name').agg(school_aggregations)

# Show both budget columns as currency strings
as_dollars = "${:,.2f}".format
schSummary["Per Student Budget"] = schSummary["Per Student Budget"].map(as_dollars)
schSummary["budget"] = schSummary["budget"].map(as_dollars)

# [Series 1] Number of students per school scoring 70 or higher in math
schMathPass = mydata[mydata['math_score'] >= 70].groupby('school_name')['math_score'].count()

# [Series 2] Number of students per school scoring 70 or higher in reading
schReadPass = mydata[mydata['reading_score'] >= 70].groupby('school_name')['reading_score'].count()

# Combine [Dataframe 1], [Series 1] and [Series 2] into one Dataframe.
# Each merge suffixes the clashing score column: *_x is the mean score,
# *_y is the passing count.
SummyAndMath = pd.merge(schSummary, schMathPass, on='school_name')
SummyAndMathAndRead = pd.merge(SummyAndMath, schReadPass, on='school_name')

# Turn the passing counts into percentages of each school's student count
student_counts = SummyAndMathAndRead["Student ID"]
SummyAndMathAndRead['math_score_y'] = (SummyAndMathAndRead["math_score_y"] / student_counts) * 100
SummyAndMathAndRead['reading_score_y'] = (SummyAndMathAndRead["reading_score_y"] / student_counts) * 100

# Rename columns (positional: follows the aggregation order, then the two merged series)
SummyAndMathAndRead.columns = [
    'Total Students',
    'Total School Budget',
    'Average Reading Score',
    'Average Math Score',
    'Per Student Budget',
    '% Passing Math',
    '% Passing Reading',
]

# Overall pass rate is the average of the two subject pass percentages
SummyAndMathAndRead['% Overall Passing Rate'] = (
    SummyAndMathAndRead['% Passing Math'] + SummyAndMathAndRead['% Passing Reading']
) / 2

# Look up each school's type, index the table by school name and
# relabel the type column for display
schtype = school_data[['school_name', 'type']]
schSummyResults = (
    pd.merge(SummyAndMathAndRead, schtype, on='school_name')
    .set_index('school_name')
    .rename(columns={'type': 'School Type'})
)

# Arrange columns
schSummyResults = schSummyResults[[
    'School Type',
    'Total Students',
    'Total School Budget',
    'Average Reading Score',
    'Average Math Score',
    'Per Student Budget',
    '% Passing Math',
    '% Passing Reading',
    '% Overall Passing Rate',
]]

# Blank out the index name so no header is shown above the school names
schSummyResults.index.name = ""


# Top Performing Schools (By Overall Passing Rate)

In [None]:
# Five schools with the highest overall passing rate, best first
schSummyResults.sort_values(by='% Overall Passing Rate', ascending=False).head(5)

# Bottom Performing Schools (By Overall Passing Rate)

In [None]:
# Five schools with the lowest overall passing rate, worst first
schSummyResults.sort_values(by='% Overall Passing Rate', ascending=True).head(5)

# Math Scores by Grade

In [None]:
# Average math score for every school/grade pair, pivoted so that each
# grade becomes its own column. Unstacking the score series directly yields
# plain grade labels as column names, so no round trip through a record
# array (and no renaming of stringified tuple labels) is needed.
mathSummary = (
    student_data.groupby(['school_name', 'grade'])['math_score']
    .mean()
    .unstack('grade')
)

# Put the grades in school order; the grouped labels sort as text,
# which would place '9th' after '12th'
mathSummary = mathSummary[['9th', '10th', '11th', '12th']]

# Blank out the axis names so only school names and grade headers are shown
mathSummary.index.name = ""
mathSummary.columns.name = None
mathSummary

# Reading Scores by Grade

In [None]:
# Average reading score for every school/grade pair, pivoted so that each
# grade becomes its own column. Unstacking the score series directly yields
# plain grade labels as column names, so no round trip through a record
# array (and no renaming of stringified tuple labels) is needed.
readSummary = (
    student_data.groupby(['school_name', 'grade'])['reading_score']
    .mean()
    .unstack('grade')
)

# Put the grades in school order; the grouped labels sort as text,
# which would place '9th' after '12th'
readSummary = readSummary[['9th', '10th', '11th', '12th']]

# Blank out the axis names so only school names and grade headers are shown
readSummary.index.name = ""
readSummary.columns.name = None
readSummary

# Scores by School Spending

In [None]:
# Per-student spending bin edges and the label for each range
spending_bins = [0, 585, 615, 645, 675]
group_names = ["<$585", "$585-615", "$615-645", "$645-675"]

# Tag every student row with the spending range of its school
mydata["Bins"] = pd.cut(mydata["Per Student Budget"], spending_bins, labels=group_names)

# "Bins" is categorical, so every groupby below states observed=False
# explicitly: all spending ranges stay in each result (a range with no
# passing students counts as 0 rather than disappearing from the merge), and
# the outcome does not depend on the pandas version's default.

# [Dataframe 1] Student count and mean scores per spending range
binSummary = mydata.groupby('Bins', observed=False).agg({
    'Student ID': 'count',
    'math_score': 'mean',
    'reading_score': 'mean'
})

# [Series 1] Number of students per spending range scoring 70 or higher in math
binschMathPass = mydata[mydata['math_score'] >= 70].groupby('Bins', observed=False)['math_score'].count()

# [Series 2] Number of students per spending range scoring 70 or higher in reading
binschReadPass = mydata[mydata['reading_score'] >= 70].groupby('Bins', observed=False)['reading_score'].count()

# Combine [Dataframe 1], [Series 1] and [Series 2] into one Dataframe.
# Each merge suffixes the clashing score column: *_x is the mean score,
# *_y is the passing count, which is then converted to a percentage.
binresults = pd.merge(binSummary, binschMathPass, on='Bins')
binSummyResults = pd.merge(binresults, binschReadPass, on='Bins')
binSummyResults['math_score_y'] = (binSummyResults["math_score_y"] / binSummyResults["Student ID"]) * 100
binSummyResults['reading_score_y'] = (binSummyResults["reading_score_y"] / binSummyResults["Student ID"]) * 100

# Rename columns (positional) and calculate overall pass rate as the
# average of the two subject pass percentages
binSummyResults.columns = ['Student ID', 'Average Math Score',
                           'Average Reading Score', '% Passing Math', '% Passing Reading']
binSummyResults['% Overall Passing Rate'] = (
    binSummyResults['% Passing Math'] + binSummyResults['% Passing Reading']
) / 2

# Set index name
binSummyResults.index.name = "Spending Ranges (Per Student)"

# Keep only the score columns, in display order
binSummyResults = binSummyResults[['Average Math Score', 'Average Reading Score',
                                   '% Passing Math', '% Passing Reading',
                                   '% Overall Passing Rate']]
binSummyResults


# Scores by School Size

In [None]:
# School-size bin edges and the label for each range
size_bins = [0, 1000, 2000, 5000]
size_group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

# Tag every student row with the size range of its school
mydata["Size Bins"] = pd.cut(mydata["size"], size_bins, labels=size_group_names)

# "Size Bins" is categorical, so every groupby below states observed=False
# explicitly: all size ranges stay in each result (a range with no passing
# students counts as 0 rather than disappearing from the merge), and the
# outcome does not depend on the pandas version's default.

# [Dataframe 1] Student count and mean scores per size range
sizebinSummary = mydata.groupby('Size Bins', observed=False).agg({
    'Student ID': 'count',
    'math_score': 'mean',
    'reading_score': 'mean'
})

# [Series 1] Number of students per size range scoring 70 or higher in math
sizebinschMathPass = mydata[mydata['math_score'] >= 70].groupby('Size Bins', observed=False)['math_score'].count()

# [Series 2] Number of students per size range scoring 70 or higher in reading
sizebinschReadPass = mydata[mydata['reading_score'] >= 70].groupby('Size Bins', observed=False)['reading_score'].count()

# Combine [Dataframe 1], [Series 1] and [Series 2] into one Dataframe.
# Each merge suffixes the clashing score column: *_x is the mean score,
# *_y is the passing count, which is then converted to a percentage.
sizebinpreResults = pd.merge(sizebinSummary, sizebinschMathPass, on='Size Bins')
sizebinResults = pd.merge(sizebinpreResults, sizebinschReadPass, on='Size Bins')
sizebinResults['math_score_y'] = (sizebinResults["math_score_y"] / sizebinResults["Student ID"]) * 100
sizebinResults['reading_score_y'] = (sizebinResults["reading_score_y"] / sizebinResults["Student ID"]) * 100

# Rename columns (positional) and calculate overall pass rate as the
# average of the two subject pass percentages
sizebinResults.columns = ['Student ID', 'Average Math Score',
                          'Average Reading Score', '% Passing Math', '% Passing Reading']
sizebinResults['% Overall Passing Rate'] = (
    sizebinResults['% Passing Math'] + sizebinResults['% Passing Reading']
) / 2

# Set index name
sizebinResults.index.name = "School Size"

# Keep only the score columns, in display order
sizebinResults = sizebinResults[['Average Math Score', 'Average Reading Score',
                                 '% Passing Math', '% Passing Reading',
                                 '% Overall Passing Rate']]
sizebinResults

# Scores by School Type

In [None]:
# [Dataframe 1] Student count and mean scores for each school type
type_aggregations = {
    'Student ID': 'count',
    'math_score': 'mean',
    'reading_score': 'mean',
}
schTypeSummary = mydata.groupby('type').agg(type_aggregations)

# [Series 1] Number of students per school type scoring 70 or higher in math
schTypeMathPass = mydata[mydata['math_score'] >= 70].groupby('type')['math_score'].count()

# [Series 2] Number of students per school type scoring 70 or higher in reading
schTypeReadPass = mydata[mydata['reading_score'] >= 70].groupby('type')['reading_score'].count()

# Combine [Dataframe 1], [Series 1] and [Series 2] into one Dataframe.
# Each merge suffixes the clashing score column: *_x is the mean score,
# *_y is the passing count.
schTypepreResults = pd.merge(schTypeSummary, schTypeMathPass, on='type')
schTypeResults = pd.merge(schTypepreResults, schTypeReadPass, on='type')

# Turn the passing counts into percentages of each type's student count
type_student_counts = schTypeResults["Student ID"]
schTypeResults['math_score_y'] = (schTypeResults["math_score_y"] / type_student_counts) * 100
schTypeResults['reading_score_y'] = (schTypeResults["reading_score_y"] / type_student_counts) * 100

# Rename columns (positional: follows the aggregation order, then the two merged series)
schTypeResults.columns = [
    'Total Students',
    'Average Math Score',
    'Average Reading Score',
    '% Passing Math',
    '% Passing Reading',
]

# Overall pass rate is the average of the two subject pass percentages
schTypeResults['% Overall Passing Rate'] = (
    schTypeResults['% Passing Math'] + schTypeResults['% Passing Reading']
) / 2

# Keep only the score columns, in display order
schTypeResults = schTypeResults[[
    'Average Math Score',
    'Average Reading Score',
    '% Passing Math',
    '% Passing Reading',
    '% Overall Passing Rate',
]]

# Set index name
schTypeResults.index.name = "School Type"
schTypeResults