In [1]:
# import required dependencies
import pandas as pd
import os

In [3]:
# Create the path and import the data
full_student_data = os.path.join('Resources/new_full_student_data.csv')
student_df = pd.read_csv(full_student_data)

In [4]:
# Verify that the data was properly imported
student_df.head()

Unnamed: 0,student_id,student_name,grade,school_name,reading_score,math_score,school_type,school_budget
0,103880842,Travis Martin,9th,Sullivan High School,59.0,88.2,Public,961125
1,45069750,Michael Brown,9th,Dixon High School,94.7,73.5,Charter,870334
2,45024902,Gabriela Lucero,9th,Wagner High School,89.0,70.4,Public,846745
3,62582498,Susan Richardson,9th,Silva High School,69.7,80.3,Public,991918
4,16437227,Sherry Davis,11th,Bowers High School,,27.5,Public,848324


In [7]:
# Check for null values
student_df.isna().sum()

student_id          0
student_name        0
grade               0
school_name         0
reading_score    1968
math_score        982
school_type         0
school_budget       0
dtype: int64

In [8]:
# Drop rows with null values and verify removal
student_df = student_df.dropna()
student_df.isna().sum()

student_id       0
student_name     0
grade            0
school_name      0
reading_score    0
math_score       0
school_type      0
school_budget    0
dtype: int64

In [9]:
# Check for duplicate rows
student_df.duplicated().sum()

1836

In [10]:
# Drop duplicate rows and verify removal
student_df = student_df.drop_duplicates()
student_df.duplicated().sum()

0

In [11]:
# Check data types
student_df.dtypes

student_id         int64
student_name      object
grade             object
school_name       object
reading_score    float64
math_score       float64
school_type       object
school_budget      int64
dtype: object

In [15]:
# Examine the grade column to understand why it is not an int
student_df['grade']

0         9th
1         9th
2         9th
3         9th
5         9th
         ... 
19508    10th
19509    12th
19511    11th
19512    11th
19513    12th
Name: grade, Length: 14831, dtype: object

In [20]:
# Remove 'th' suffixes by replacing with and empty string
student_df['grade'] = student_df['grade'].str.replace('th', '')

In [21]:
# Remove the non-numeric characters and verify the contents of the column
student_df['grade']

0         9
1         9
2         9
3         9
5         9
         ..
19508    10
19509    12
19511    11
19512    11
19513    12
Name: grade, Length: 14831, dtype: object

In [22]:
# Change the grade column to the int type and verify column types
student_df['grade'] = student_df['grade'].astype(int)
student_df.dtypes

student_id         int64
student_name      object
grade              int32
school_name       object
reading_score    float64
math_score       float64
school_type       object
school_budget      int64
dtype: object

In [23]:
student_df.head()

Unnamed: 0,student_id,student_name,grade,school_name,reading_score,math_score,school_type,school_budget
0,103880842,Travis Martin,9,Sullivan High School,59.0,88.2,Public,961125
1,45069750,Michael Brown,9,Dixon High School,94.7,73.5,Charter,870334
2,45024902,Gabriela Lucero,9,Wagner High School,89.0,70.4,Public,846745
3,62582498,Susan Richardson,9,Silva High School,69.7,80.3,Public,991918
5,74579444,Cynthia Johnson,9,Montgomery High School,63.5,76.9,Charter,893368


In [24]:
# Display summary statistics for the DataFrame
student_df.describe()

Unnamed: 0,student_id,grade,reading_score,math_score,school_budget
count,14831.0,14831.0,14831.0,14831.0,14831.0
mean,69752960.0,10.355539,72.357865,64.675733,893742.749107
std,34529090.0,1.097728,15.22459,15.844093,53938.066467
min,10009060.0,9.0,10.5,3.7,817615.0
25%,39844330.0,9.0,62.2,54.5,846745.0
50%,69659780.0,10.0,73.8,65.3,893368.0
75%,99274490.0,11.0,84.0,76.0,956438.0
max,129999700.0,12.0,100.0,100.0,991918.0


In [25]:
# Display the mean math score using the mean function
student_df.mean()

  


student_id       6.975296e+07
grade            1.035554e+01
reading_score    7.235787e+01
math_score       6.467573e+01
school_budget    8.937427e+05
dtype: float64

In [28]:
# Store the minimum reading score as min_reading_score
min_reading_score = student_df['reading_score'].min()
min_reading_score

10.5

In [46]:
# Use loc to display the grade column
student_df.loc[0:19513, ['grade']]

Unnamed: 0,grade
0,9
1,9
2,9
3,9
5,9
...,...
19508,10
19509,12
19511,11
19512,11


In [42]:
# Use `iloc` to display the first 3 rows and columns 3, 4, and 5.
student_df.iloc[:3, [3,4,5]]

Unnamed: 0,school_name,reading_score,math_score
0,Sullivan High School,59.0,88.2
1,Dixon High School,94.7,73.5
2,Wagner High School,89.0,70.4


In [139]:
# Select the rows for grade nine and display their summary statistics using `loc` and `describe`.


In [82]:
# Store the row with the minimum overall reading score as `min_reading_row`
# using `loc` and the `min_reading_score` found in Deliverable 3.
min_reading_row = student_df.loc[student_df["reading_score"] == min_reading_score]
min_reading_score = student_df["reading_score"].min()
min_reading_row

Unnamed: 0,student_id,student_name,grade,school_name,reading_score,math_score,school_type,school_budget
3706,81758630,Matthew Thomas,10,Dixon High School,10.5,58.4,Charter,870334


In [154]:
# Use loc with conditionals to select all reading scores from 10th graders at Dixon High School.


In [142]:
# Find the mean reading score for all students in grades 11 and 12 combined.


In [141]:
# Use groupby and mean to find the average reading and math scores for each school type.
avg_student_scores_by_grade = student_df.groupby(by='school_type').mean()
avg_student_scores_by_grade.loc[:, ["school_budget"]]

Unnamed: 0_level_0,school_budget
school_type,Unnamed: 1_level_1
Charter,872625.656236
Public,911195.558251


In [107]:
# Use the `groupby`, `count`, and `sort_values` functions to find the
# total number of students at each school and sort from most students to least students.


In [112]:
# Find the average math score by grade for each school type by using the groupby and mean functions.
avg_by_school = student_df.groupby(['school_type', 'grade']).mean()
avg_by_school.loc[:, ["math_score"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,math_score
school_type,grade,Unnamed: 2_level_1
Charter,9,70.077874
Charter,10,66.443206
Charter,11,68.024735
Charter,12,60.212121
Public,9,63.771066
Public,10,63.764121
Public,11,59.314337
Public,12,63.568319
