In [None]:
# Dependencies
import pandas as pd
from pathlib import Path

In [2]:
# To load files
school_complete_load = Path("Resources\schools_complete.csv")
student_complete_load = Path("Resources\students_complete.csv")

# To read files 
school_data = pd.read_csv(school_complete_load)
student_data = pd.read_csv(student_complete_load)

In [3]:
# Merge 
school_data_merged = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
school_data_merged

Unnamed: 0,Student ID,student_name,gender,year,school_name,reading_score,maths_score,School ID,type,size,budget
0,0,Paul Bradley,M,9,Huang High School,96,94,0,Government,2917,1910635
1,1,Victor Smith,M,12,Huang High School,90,43,0,Government,2917,1910635
2,2,Kevin Rodriguez,M,12,Huang High School,41,76,0,Government,2917,1910635
3,3,Richard Scott,M,12,Huang High School,89,86,0,Government,2917,1910635
4,4,Bonnie Ray,F,9,Huang High School,87,69,0,Government,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12,Thomas High School,51,48,14,Independent,1635,1043130
39166,39166,Dawn Bell,F,10,Thomas High School,81,89,14,Independent,1635,1043130
39167,39167,Rebecca Tanner,F,9,Thomas High School,99,99,14,Independent,1635,1043130
39168,39168,Desiree Kidd,F,10,Thomas High School,72,77,14,Independent,1635,1043130


In [4]:
# Number of unique schools
total_unique_schools = school_data_merged['school_name'].nunique()

# Number of students 
total_students = len(school_data_merged)

# Budget 
total_budget = school_data['budget'].sum()

# Mean maths score 
mean_maths_score = school_data_merged['maths_score'].mean()

# Mean reading scores 
mean_reading_score = school_data_merged['reading_score'].mean()

# Percentage students who passed maths 
students_passed_maths = school_data_merged[school_data_merged['maths_score']>=50]
percentage_passed_math = (len(students_passed_maths)/total_students) * 100 

# Percentage students who passed reading 
students_passed_reading = school_data_merged[school_data_merged['reading_score']>=50]
percentage_passed_reading = (len(students_passed_reading)/total_students) * 100

# Percentage of overall passing (both maths and reading)
students_passed_both = school_data_merged[(school_data_merged['maths_score']>=50) & (school_data_merged['reading_score']>=50)]
percentage_passed_both = (len(students_passed_both)/total_students) * 100 

LGA_summary = pd.DataFrame({
    'Total Schools': '{:,}'.format(total_unique_schools),
    'Total Students': [total_students],
    'Total Budget': ['${:,.2f}'.format(total_budget)],
    'Average Maths Score': ['{:,.2f}%'.format(mean_maths_score)],
    'Average Reading Score': ['{:,.2f}%'.format(mean_reading_score)],
    '% Passing Maths': ['{:,.2f}%'.format(percentage_passed_math)],
    '% Passing Reading': ['{:,.2f}%'.format(percentage_passed_reading)],
    '% Overall Passing': ['{:,.2f}%'.format(percentage_passed_both)]
})

LGA_summary


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",70.34%,69.98%,86.08%,84.43%,72.81%


In [24]:
def percentage_passed(series):
    return(series>= 50).mean() * 100 

school_data_merged['% Passing Maths'] = percentage_passed(school_data_merged['maths_score'])
school_data_merged['% Passing Reading'] = percentage_passed(school_data_merged['reading_score'])

grouped_school_data = school_data_merged.groupby('school_name').agg({
    'type': 'first',  
    'Student ID': 'count',  
    'budget': 'first',  
    'maths_score': 'mean',  
    'reading_score': 'mean', 
    '% Passing Maths': lambda x: percentage_passed(x),
    '% Passing Reading': lambda x: percentage_passed(x)
})

# Rename columns for clarity
grouped_school_data.rename(columns={
    'type': 'School Type',
    'Student ID': 'Total Students',
    'budget': 'Total School Budget',
    'maths_score': 'Average Maths Score',
    'reading_score': 'Average Reading Score',
}, inplace=True)

# Calculate per student budget
grouped_school_data['Per Student Budget'] = grouped_school_data['Total School Budget'] / grouped_school_data['Total Students']

overall_passing_per_school = ((grouped_school_data['% Passing Maths'] >= 50) & (grouped_school_data['% Passing Reading'] >= 50)).astype(int) * 100
grouped_school_data['% Overall Passing'] = overall_passing_per_school


# Reset the index to make the school_name a separate column
grouped_school_data.reset_index(inplace=True)

# Display the School Summary DataFrame
school_summary = grouped_school_data[['school_name', 'School Type', 'Total Students', 'Total School Budget', 'Per Student Budget',
                                      'Average Maths Score', 'Average Reading Score', '% Passing Maths',
                                      '% Passing Reading', '% Overall Passing']]
school_summary.rename(columns={'school_name': 'School Name'}, inplace=True)
school_summary


Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
0,Bailey High School,Government,4976,3124928,628.0,86.078632,84.426857,100.0,100.0,100
1,Cabrera High School,Independent,1858,1081356,582.0,86.078632,84.426857,100.0,100.0,100
2,Figueroa High School,Government,2949,1884411,639.0,86.078632,84.426857,100.0,100.0,100
3,Ford High School,Government,2739,1763916,644.0,86.078632,84.426857,100.0,100.0,100
4,Griffin High School,Independent,1468,917500,625.0,86.078632,84.426857,100.0,100.0,100
5,Hernandez High School,Government,4635,3022020,652.0,86.078632,84.426857,100.0,100.0,100
6,Holden High School,Independent,427,248087,581.0,86.078632,84.426857,100.0,100.0,100
7,Huang High School,Government,2917,1910635,655.0,86.078632,84.426857,100.0,100.0,100
8,Johnson High School,Government,4761,3094650,650.0,86.078632,84.426857,100.0,100.0,100
9,Pena High School,Independent,962,585858,609.0,86.078632,84.426857,100.0,100.0,100
