In [5]:
import pandas as pd

# load cps.csv file into a DataFrame
df = pd.read_csv(r'C:\Users\pranay\OneDrive\Documents\datasets\covidd\cps.csv')

# create new DataFrame with desired columns
new_df = df[['School_ID', 'Short_Name', 'Is_High_School', 'Zip', 'Student_Count_Total', 'College_Enrollment_Rate_School', 'Grades_Offered_All', 'School_Hours']].copy()

# generate new columns for lowest and highest grades offered and starting hour
new_df['Lowest_Grade_Offered'] = new_df['Grades_Offered_All'].str.split('-').str[0]
new_df['Highest_Grade_Offered'] = new_df['Grades_Offered_All'].str.split('-').str[1]
new_df['Starting_Hour'] = new_df['School_Hours'].str.split('-').str[0]

# replace missing numeric values with mean for that column
new_df.fillna(new_df.mean(), inplace=True)

# display first 10 rows of new DataFrame
print(new_df.head(10))




   School_ID             Short_Name  Is_High_School    Zip  \
0     609952                 GREENE           False  60609   
1     609869               LANGFORD           False  60636   
2     609896               DRUMMOND           False  60622   
3     610590  BRONZEVILLE CLASSICAL           False  60609   
4     610087                  BLAIR           False  60638   
5     610503    FRAZIER PROSPECTIVE           False  60624   
6     400164  INSTITUTO - LOZANO HS            True  60608   
7     610059                  MAYER           False  60614   
8     610206                  TWAIN           False  60638   
9     609872                  PEREZ           False  60608   

   Student_Count_Total  College_Enrollment_Rate_School    Grades_Offered_All  \
0                  415                       58.084302        PK,K,1,2,3,4,5   
1                  241                       58.084302  PK,K,1,2,3,4,5,6,7,8   
2                  346                       58.084302  PK,K,1,2,3,4,5,6,7,8 

  new_df.fillna(new_df.mean(), inplace=True)


In [6]:
# calculate mean and standard deviation of College Enrollment Rate for High Schools
high_schools = new_df[new_df['Is_High_School'] == True]
cer_mean = high_schools['College_Enrollment_Rate_School'].mean()
cer_std = high_schools['College_Enrollment_Rate_School'].std()
print(f"Mean of College Enrollment Rate for High Schools: {cer_mean}")
print(f"Standard Deviation of College Enrollment Rate for High Schools: {cer_std}")



Mean of College Enrollment Rate for High Schools: 58.08430232558146
Standard Deviation of College Enrollment Rate for High Schools: 25.068653822753426


In [7]:
# calculate mean and standard deviation of Student_Count_Total for non-High Schools
non_high_schools = new_df[new_df['Is_High_School'] == False]
sct_mean = non_high_schools['Student_Count_Total'].mean()
sct_std = non_high_schools['Student_Count_Total'].std()
print(f"Mean of Student_Count_Total for non-High Schools: {sct_mean}")
print(f"Standard Deviation of Student_Count_Total for non-High Schools: {sct_std}")



Mean of Student_Count_Total for non-High Schools: 521.5450733752621
Standard Deviation of Student_Count_Total for non-High Schools: 268.63577025549176


In [8]:
# display distribution of starting hours for all schools
print(new_df['Starting_Hour'].value_counts())

07:45 AM        66
08:00 AM        63
08:45 AM        60
08:30 AM        43
08:15 AM        38
                ..
07:30 AM         1
7:45             1
07:45 AM         1
8:15/8:45 AM     1
8:15             1
Name: Starting_Hour, Length: 106, dtype: int64


In [9]:
#Number of schools outside of the Loop Neighborhood (i.e., outside of zip codes 60601, 60602, 60603, 60604, 60605, 60606, 60607, and 60616): 
loop_zip_codes = [60601, 60602, 60603, 60604, 60605, 60606, 60607, 60616]
non_loop_zip_count = new_df.loc[~new_df['Zip'].isin(loop_zip_codes)].shape[0]
print('Number of schools outside of the Loop Neighborhood: ', non_loop_zip_count)

Number of schools outside of the Loop Neighborhood:  634
