In [23]:
import pandas as pd
import numpy as np

In [24]:
# 1: Loading data into python
# Read the learning-modalities CSV from the working directory and preview it.
csv_path = "School_Learning_Modalities.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,District NCES ID,District Name,Week,Learning Modality,Operational Schools,Student Count,City,State,ZIP Code
0,100005,Albertville City,08/28/2022 12:00:00 AM,In Person,6,5824.0,Albertville,AL,35950
1,100006,Marshall County,08/28/2022 12:00:00 AM,In Person,15,5764.0,Guntersville,AL,35976
2,100007,Hoover City,08/28/2022 12:00:00 AM,In Person,18,14061.0,Hoover,AL,35243
3,100008,Madison City,08/28/2022 12:00:00 AM,In Person,11,11695.0,Madison,AL,35758
4,100011,Leeds City,08/28/2022 12:00:00 AM,In Person,4,2076.0,Leeds,AL,35094


In [25]:
# 2: Print count of rows and columns
# DataFrame.shape is ordered (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(781148, 9)

In [26]:
# 3: Provide print out of column names
# Pull the labels out of the column Index as a plain array for display.
column_names = df.columns.values
column_names

array(['District NCES ID', 'District Name', 'Week', 'Learning Modality',
       'Operational Schools', 'Student Count', 'City', 'State',
       'ZIP Code'], dtype=object)

In [27]:
# 4: Clean the column names
# Collapse every run of non-alphanumeric characters into a single underscore,
# then lower-case the labels. regex=True must be passed explicitly: since
# pandas 2.0 str.replace treats the pattern as a literal string by default,
# which would leave the names untouched and break the later cells that look
# columns up by their snake_case names.
df.columns = (
    df.columns
    .str.replace(r'[^A-Za-z0-9]+', '_', regex=True)
    .str.lower()
)
df.columns.values

array(['district_nces_id', 'district_name', 'week', 'learning_modality',
       'operational_schools', 'student_count', 'city', 'state',
       'zip_code'], dtype=object)

In [28]:
# 5: Clean strings that might exist within each column
# 6: Replace white space and special characters
# Every free-text column gets the same treatment: each run of non-alphanumeric
# characters becomes one underscore and the text is lower-cased. regex=True is
# required because pandas >= 2.0 matches the pattern literally by default,
# which would silently leave the values unchanged.
for text_col in ['district_name', 'learning_modality', 'city']:
    df[text_col] = (
        df[text_col]
        .str.replace(r'[^A-Za-z0-9]+', '_', regex=True)
        .str.lower()
    )

In [29]:
# 7: Convert column type to correct type
# Text columns move to pandas' dedicated string dtype; week becomes datetime64[ns].
column_types = {
    name: 'string'
    for name in ('district_name', 'learning_modality', 'city', 'state')
}
column_types['week'] = 'datetime64[ns]'
df = df.astype(column_types)
df.dtypes

district_nces_id                int64
district_name                  string
week                   datetime64[ns]
learning_modality              string
operational_schools             int64
student_count                 float64
city                           string
state                          string
zip_code                        int64
dtype: object

In [30]:
# 8: Look for and remove duplicate rows
# drop_duplicates() returns a new frame rather than modifying df in place, so
# the result has to be assigned back; otherwise the duplicates are only hidden
# in this cell's output and remain in df for every later cell.
df = df.drop_duplicates()
df

Unnamed: 0,district_nces_id,district_name,week,learning_modality,operational_schools,student_count,city,state,zip_code
0,100005,albertville_city,2022-08-28,in_person,6,5824.0,albertville,AL,35950
1,100006,marshall_county,2022-08-28,in_person,15,5764.0,guntersville,AL,35976
2,100007,hoover_city,2022-08-28,in_person,18,14061.0,hoover,AL,35243
3,100008,madison_city,2022-08-28,in_person,11,11695.0,madison,AL,35758
4,100011,leeds_city,2022-08-28,in_person,4,2076.0,leeds,AL,35094
...,...,...,...,...,...,...,...,...,...
781143,5900187,hannahville_indian_school,2021-08-01,in_person,1,163.0,wilson,BI,49896
781144,5900190,tiospaye_topa_school,2021-08-01,in_person,1,157.0,laplant,BI,57652
781145,5900193,seba_dalkai_boarding_school,2021-08-01,hybrid,1,65.0,winslow,BI,86047
781146,5900197,noli_school,2021-08-01,in_person,1,123.0,san_jacinto,BI,92581


In [31]:
# 9: Count number of missing values per column
# Flag each null cell, then total the flags column by column.
missing_per_column = df.isnull().sum()
missing_per_column

district_nces_id         0
district_name            0
week                     0
learning_modality        0
operational_schools      0
student_count          718
city                     0
state                    0
zip_code                 0
dtype: int64

In [32]:
# 10: create new column "modality_inperson"
def modality(mode):
    """Flag whether a cleaned learning-modality label means in-person.

    Parameters
    ----------
    mode : str or pandas.Series
        A single label, or a whole column of labels.

    Returns
    -------
    bool or pandas.Series
        True where the label is exactly 'in_person'. Because ``==``
        broadcasts, passing a Series gives an element-wise boolean Series.
    """
    return mode == 'in_person'


# Compare the whole column in one vectorised step instead of appending to a
# Python list row by row. astype(bool) keeps the column a plain bool dtype even
# when learning_modality uses the nullable 'string' dtype.
df['modality_inperson'] = modality(df['learning_modality']).astype(bool)

In [33]:
# Display the frame to check that the modality_inperson column is present
df

Unnamed: 0,district_nces_id,district_name,week,learning_modality,operational_schools,student_count,city,state,zip_code,modality_inperson
0,100005,albertville_city,2022-08-28,in_person,6,5824.0,albertville,AL,35950,True
1,100006,marshall_county,2022-08-28,in_person,15,5764.0,guntersville,AL,35976,True
2,100007,hoover_city,2022-08-28,in_person,18,14061.0,hoover,AL,35243,True
3,100008,madison_city,2022-08-28,in_person,11,11695.0,madison,AL,35758,True
4,100011,leeds_city,2022-08-28,in_person,4,2076.0,leeds,AL,35094,True
...,...,...,...,...,...,...,...,...,...,...
781143,5900187,hannahville_indian_school,2021-08-01,in_person,1,163.0,wilson,BI,49896,True
781144,5900190,tiospaye_topa_school,2021-08-01,in_person,1,157.0,laplant,BI,57652,True
781145,5900193,seba_dalkai_boarding_school,2021-08-01,hybrid,1,65.0,winslow,BI,86047,False
781146,5900197,noli_school,2021-08-01,in_person,1,123.0,san_jacinto,BI,92581,True
