In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob

In [None]:
# Load the district metadata table
district = pd.read_csv(
    "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
)
# Show how many values are missing in each column, then the (rows, columns) shape
display(district.isna().sum())
print(district.shape)

In [None]:
# Work on a copy so the raw district table stays untouched
district_df1 = district.copy()
# Label districts whose state is missing with the placeholder 'NIL' (rows are kept, not dropped).
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: with pandas Copy-on-Write the in-place call acts on a temporary object and
# leaves district_df1 unchanged, and recent pandas 2.x versions warn about it.
district_df1['state'] = district_df1['state'].fillna('NIL')
district_df1.head()

In [None]:
# Build the engagement-data file paths for every district of one state
def get_state_id(df, state_col, ele, district_col):
    """Return the engagement CSV paths of all districts that belong to one state.

    Parameters
    ----------
    df : DataFrame holding one row per district.
    state_col : name of the column that stores the state.
    ele : state value to select; a KeyError is raised if no row has this value.
    district_col : name of the column that stores the district identifier.

    Returns
    -------
    list of str, one path per selected row, in row order.
    """
    # Directory prefix and extension wrapped around each district identifier
    base_dir = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'
    extension = '.csv'

    # Rows of the requested state only
    state_rows = df.groupby(state_col).get_group(ele)
    district_ids = state_rows[district_col].astype('str').to_list()

    return [base_dir + district_id + extension for district_id in district_ids]

In [None]:
# Replace a two-digit month string with its three-letter abbreviation
def replace_month(data,month):
    """Replace every cell of `data` equal to `month` with its 3-letter month name.

    Parameters
    ----------
    data : pandas object supporting ``replace(..., inplace=True)``; modified in place.
    month : two-digit month string, "01" to "12". Any other value leaves `data` untouched.

    Returns
    -------
    The same `data` object that was passed in.
    """
    month_dict={
               "01":"Jan",
               "02":"Feb",
               "03":"Mar",
               "04":"Apr",
               "05":"May",
               "06":"Jun",
               "07":"Jul",
               "08":"Aug",
               "09":"Sep",
               "10":"Oct",
               "11":"Nov",
               "12":"Dec"}

    # Direct lookup instead of scanning every key for a match
    if month in month_dict:
        data.replace(to_replace=month,value=month_dict[month],inplace=True)
    return data

In [None]:
# Number of distinct districts per state, largest first
district_state = (
    district.groupby(['state'])['district_id']
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'state': 'State', 'district_id': 'Count'})
)
district_state

In [None]:
# Accumulator: get_state_district_count appends one row count per call
count_lst=[]

# Function to count the complete engagement records of one state
def get_state_district_count(state):
    """Append the number of complete engagement rows for `state` to `count_lst`.

    Reads the engagement CSV of every district of the state (paths come from
    get_state_id), discards the 'lp_id' and 'engagement_index' columns and counts
    the rows that have no missing value in the remaining columns.

    A state with no matching engagement file contributes 0, so `count_lst` stays
    aligned with the list of states being iterated.

    Parameters
    ----------
    state : state value as stored in district_df1['state'].

    Returns
    -------
    The module-level list `count_lst`, after the new count has been appended.
    """
    # Set membership keeps the per-file check cheap
    wanted_files = set(get_state_id(district_df1,'state',state,'district_id'))

    path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
    all_files = glob.glob(path + "/*.csv")

    # Rows are counted file by file; this matches counting after concatenation as long
    # as every engagement file has the same columns.
    count = 0
    for filename in all_files:
        if filename not in wanted_files:
            continue
        df = pd.read_csv(filename, index_col=None, header=0)
        # Drop the columns that should not influence whether a row counts as complete
        df = df.drop(columns=['lp_id','engagement_index'])
        count += len(df.dropna(axis=0))

    count_lst.append(count)
    return count_lst

In [None]:
# List of States in district_info.csv
district_state_lst=district_state['State'].to_list()

# Start from an empty accumulator so that re-running this cell does not stack a
# second set of values behind the first one
count_lst.clear()

# One value per state: the number of engagement rows left after dropping missing values
for state in district_state_lst:
    get_state_district_count(state)

# Creating DataFrame to store the data extracted, largest count first
district_count_df = pd.DataFrame(list(zip(district_state_lst,count_lst)),columns=['State','District Count']).sort_values(by=['District Count'],ascending=False)
# top 10 states by that count
top_ten_state_district=district_count_df[:10].State.to_list()

In [None]:
# First ten states of district_state (already ordered by number of districts, descending)
top_ten_state = district_state['State'].head(10).to_list()

# Add every state of top_ten_state_district that is not in the list yet
for state in top_ten_state_district:
    if state in top_ten_state:
        continue
    top_ten_state.append(state)
print(top_ten_state)

Previously, we mentioned that we would drop the data with missing State values, as it would be challenging to fill in that information accurately. This leaves us with 176 districts. To look for any difference from the general trend observed previously, we will observe the behaviour of the top 10 States based on district count and state count in the dataset provided (shown in the list above), comprising 152 districts. This sample will be used to test the hypothesis of <b><i>whether the difference in State practices had affected the trend starting from July 2020</i></b>.

In [None]:
def get_graph(state):
    """Plot daily and monthly mean/median of `pct_access` for one state.

    Loads the engagement CSV of every district of the state (paths come from
    get_state_id), drops rows with missing values, and draws a 2x2 figure:
    daily mean, daily median, monthly mean and monthly median of `pct_access`.

    Parameters
    ----------
    state : state value as stored in district_df1['state']; also used in the figure title.

    Returns
    -------
    The result of ``plt.show()``.

    Raises
    ------
    ValueError
        From ``pd.concat`` when no engagement file matches the state.
    """
    # Set membership keeps the per-file check cheap
    wanted_files = set(get_state_id(district_df1,'state',state,'district_id'))

    path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
    all_files = glob.glob(path + "/*.csv")

    frames = [
        pd.read_csv(filename, index_col=None, header=0)
        for filename in all_files
        if filename in wanted_files
    ]
    engagement_df = pd.concat(frames, ignore_index=True)

    engage_df = (
        engagement_df
        # Drop unnecessary columns first
        .drop(columns=['lp_id','engagement_index'])
        # Drop rows with missing values
        .dropna(axis=0)
        # Order rows by date so the unsorted monthly grouping below meets the months in
        # chronological order (assumes ISO 'YYYY-MM-DD' strings within one calendar year;
        # the month slice below relies on the same layout)
        .sort_values('time', kind='stable')
        # Characters 5-6 of the date string hold the month
        .assign(month=lambda frame: frame['time'].str[5:7])
    )

    # Turn the two-digit months into three-letter names
    month_codes=["01","02","03","04","05","06","07","08","09","10","11","12"]
    for month in month_codes:
        replace_month(engage_df,month)

    # Aggregate only pct_access: the frame also holds string columns, and pandas >= 2.0
    # raises a TypeError when mean()/median() are asked to reduce them.
    by_day = engage_df.groupby(['time'])[['pct_access']]
    daily_mean = by_day.mean()
    daily_median = by_day.median()

    # sort=False keeps the months in order of first appearance
    by_month = engage_df.groupby(['month'],sort=False)[['pct_access']]
    monthly_mean = by_month.mean()
    monthly_median = by_month.median()

    # Daily panels hide the x ticks: one tick per date would be unreadable
    fig,axes = plt.subplots(2,2,figsize=(16,10))
    sns.lineplot(ax=axes[0,0],data=daily_mean,x='time',y='pct_access').xaxis.set_ticks([])
    sns.lineplot(ax=axes[0,1],data=daily_median,x='time',y='pct_access').xaxis.set_ticks([])
    sns.lineplot(ax=axes[1,0],data=monthly_mean,x='month',y='pct_access')
    sns.lineplot(ax=axes[1,1],data=monthly_median,x='month',y='pct_access')
    axes[0,0].set_title("Daily Mean")
    axes[0,1].set_title("Daily Median")
    axes[1,0].set_title("Monthly Mean")
    axes[1,1].set_title("Monthly Median")

    fig.suptitle("Observation for "+ str(state), fontsize=30)

    return plt.show()

In [None]:
# Draw the four-panel engagement figure for each state in top_ten_state
for state in top_ten_state:
    get_graph(state)

In our hypothesis, we assumed that students' engagement with learning tools and digital platforms was solely due to education moving online as a result of the Coronavirus mitigation measures. Hence, we hypothesized that the trend reversal observed around July 2020 (Monthly Mean) and August 2020 (Monthly Median) may be due to differing practices across states, especially states with a high frequency of districts in this dataset (namely Utah and California) that made it mandatory for students to access their learning remotely. Because of their high frequency in the dataset provided, these states would produce the trend reversal.

From the results obtained through this sample, there seems to be insufficient evidence to support our hypothesis that <b><i>the difference in State practices had affected the trend starting from July 2020</i></b>. The trends above are similar to the trend observed at the national level. Contrary to our expectation, even states like Massachusetts, which we expected to show a continuously declining trend based on our assumption and hypothesis, behaved in a similar manner to the trend observed at the national level. The only discrepancy between this sample and the national-level trend is that the trend reversal in the monthly mean occurs only in August 2020 instead of July 2020. However, this discrepancy is within our expectation, since the pattern had already been picked up in the monthly median graph for the national-level trend.
    
All in all, the results obtained suggest that the political factor (differing practices amongst the States) may not be a cause of the increase in engagement with learning tools and digital platforms amongst students in the United States. This still leaves us with the intriguing observation starting from August 2020. Deeper analysis may be needed to explain this observation. Perhaps engaging other features in the dataset may provide us with the answer we are looking for.