In [None]:
import numpy as np 
import pandas as pd

import os
import glob

import matplotlib.pyplot as plt
import seaborn as sns
import re
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.impute import SimpleImputer
from datetime import datetime

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, file_name) for file_name in names_in_folder]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the two metadata tables shipped with the dataset.
products_csv = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
districts_csv = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"

# Load them unchanged; cleaning happens in later cells.
products = pd.read_csv(filepath_or_buffer=products_csv)
districts = pd.read_csv(filepath_or_buffer=districts_csv)

In [None]:
def district_id_from_path(csv_path):
    """Return the file name of ``csv_path`` without its final extension.

    The engagement files are read as one CSV per district and the file stem is
    used as the ``district_id``. ``os.path.splitext`` removes only the trailing
    extension, whereas ``str.strip(".csv")`` removes any leading/trailing
    '.', 'c', 's' or 'v' characters and can therefore corrupt the identifier.
    """
    return os.path.splitext(os.path.basename(csv_path))[0]


path = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data"
files = glob.glob(path + "/*.csv")
# One frame per file, each tagged with the district id taken from its file name.
list_of_df = [pd.read_csv(file).assign(district_id=district_id_from_path(file)) for file in files]

In [None]:
# Stack all per-district frames into one long table with a fresh 0..n-1 index.
engagement = pd.concat(objs=list_of_df, axis=0, ignore_index=True)

# Data Cleaning and Preprocessing

* Deal with missing values
* Convert data type 
* Remove punctuations
* Split the sub-categories

In [None]:
# Keep only districts whose state is known. drop=True discards the old index
# instead of inserting it as an extra 'index' column, which would otherwise be
# imputed like real data and would make this cell non-idempotent on re-run.
districts = districts[districts.state.notna()].reset_index(drop=True)

# Impute the remaining missing values on a copy so `districts` keeps its gaps.
districts_imputed = districts.copy()
# NOTE: despite its name, `mean_imputer` fills every column with its most
# frequent value (the mode); 'mean' and 'median' are only valid for numeric data.
mean_imputer = SimpleImputer(strategy='most_frequent')
districts_imputed.iloc[:, :] = mean_imputer.fit_transform(districts_imputed)
# Sanity check: number of missing values left per column.
districts_imputed.isna().sum()


At a glance, `county_connections_ratio` takes only three values: NaN, `[0.18, 1[` and `[1, 2[`. However, `[1, 2[` occurs for a single data point only, so it carries little information. We therefore keep only the rows with `[0.18, 1[`.

In [None]:
# Keep only the rows in the '[0.18, 1[' bracket. The explicit .copy() makes the
# result an independent frame, so the column assignments done in later cells do
# not write into a slice of another frame (the cause of SettingWithCopyWarning).
districts_imputed = districts_imputed[districts_imputed['county_connections_ratio'] == '[0.18, 1['].copy()

In [None]:
# Parse the timestamps and cast district_id to an integer
# (the original author notes this matches the districts table).
engagement['time'] = pd.to_datetime(engagement['time'])
engagement['district_id'] = engagement['district_id'].astype(str).astype(int)

# Work on a copy so the raw `engagement` frame keeps its missing values.
engagement_imputed = engagement.copy()
# The imputer is applied to the whole frame, so every timestamp is temporarily
# replaced by its ordinal day number and converted back in the next cell.
engagement_imputed['time'] = engagement_imputed['time'].apply(lambda stamp: stamp.toordinal())
# NOTE: despite its name, `mean_imputer_eng` fills each column with its most
# frequent value; strategy could also be 'mean' or 'median'.
mean_imputer_eng = SimpleImputer(strategy='most_frequent')
engagement_imputed.iloc[:, :] = mean_imputer_eng.fit_transform(engagement_imputed)
# Sanity check: number of missing values left per column.
engagement_imputed.isna().sum()

In [None]:
# Undo the ordinal encoding: cast the imputer output to plain integers first
# (datetime.fromordinal only accepts ints), then map each day number to a datetime.
engagement_imputed['time'] = (
    engagement_imputed['time']
    .astype(int)
    .apply(datetime.fromordinal)
)

In [None]:
# Kept for backward compatibility: a later cell assigns columns on a filtered
# frame and relies on pandas' chained-assignment warning being silenced.
pd.options.mode.chained_assignment = None

# Remove punctuation around provider names: surrounding spaces first, then
# surrounding dots (same order as the previous loop). The vectorised .str
# accessor leaves missing names untouched and avoids chained indexing
# (`df[col][k] = ...`), which may write to a temporary copy and does nothing
# under pandas Copy-on-Write.
products['Provider/Company Name'] = (
    products['Provider/Company Name'].str.strip(' ').str.strip('.')
)

# Split "MAIN - Sub category" into its two parts. Missing values stay missing, and
# a value without the ' - ' separator yields NaN for the sub-category instead of
# raising IndexError.
function_parts = products['Primary Essential Function'].str.split(' - ')
products['funct_main'] = function_parts.str[0]
products['funct_sub'] = function_parts.str[1]

# Synchronize two spellings of the same sub-category.
products['funct_sub'] = products['funct_sub'].replace({'Sites, Resources & References' : 'Sites, Resources & Reference'})
# The combined column is no longer needed once it has been split.
products = products.drop(columns="Primary Essential Function")

# Exploratory Data Analysis

**1. District**

In [None]:
# Number of (imputed) districts per state, as a two-column frame.
districts_by_state = (
    districts_imputed['state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count_districts')
)

# Full state name -> two-letter code expected by plotly's built-in 'USA-states' map.
us_state = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District Of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}
# Series.replace with a dict swaps matching names for their code and leaves any
# other value as it is.
# https://stackoverflow.com/questions/40075106/replace-values-in-pandas-series-with-dictionary
districts_by_state['state_abbrev'] = districts_by_state['state'].replace(us_state)

layout = {
    'title_text': "Number of School Districts per State",
    'geo_scope': 'usa',
}

# Single choropleth trace: one coloured state per row of districts_by_state.
state_trace = go.Choropleth(
    locations=districts_by_state.state_abbrev,
    zmax=1,
    z = districts_by_state.count_districts,
    locationmode = 'USA-states',
    marker_line_color='white',
    geo='geo',
    colorscale = 'BuPu',
)

fig = go.Figure()
fig.add_trace(state_trace)
fig.update_layout(layout)
fig.show()

# Districts per state as a horizontal bar chart. The two columns are named
# explicitly so the cell works on every pandas version: value_counts().reset_index()
# yields ('index', 'state') before pandas 2.0 but ('state', 'count') from 2.0 on.
state_counts = (
    districts['state']
    .value_counts()
    .rename_axis('state_name')
    .reset_index(name='n_districts')
)

plt.figure(figsize = (15, 8))
sns.set_style("white")
a = sns.barplot(data = state_counts, x = 'n_districts', y = 'state_name', palette='rocket')
plt.xticks([])
plt.yticks(fontname = 'arial', fontsize = 14, color = '#283655')
plt.ylabel('')
plt.xlabel('')
a.spines['left'].set_linewidth(1.5)
for w in ['right', 'top', 'bottom']:
    a.spines[w].set_visible(False)

# Write each bar's count just past its right end (the x tick labels are hidden).
for p in a.patches:
    width = p.get_width()
    plt.text(0.5 + width, p.get_y() + 0.55 * p.get_height(), f'{int(width)}',
             ha = 'center', va = 'center', fontname = 'arial', fontsize = 15, color = '#283655')

plt.show()


As you can see in the plot above, the available data does not cover all the states in the U.S. (only 19 of 50). The states with the most school districts are UT (29) and CT (26), while some states have only one school district (FL, TN, NY, AZ). (GitHub does not render interactive plots, so I also supply a static picture.)

In [None]:
# Districts per locale type. Column names are set explicitly instead of relying on
# the ('index', 'locale') names that value_counts().reset_index() only produced
# before pandas 2.0.
locale_counts = (
    districts_imputed['locale']
    .value_counts()
    .rename_axis('locale')
    .reset_index(name='count')
)

fig = px.pie(locale_counts, values = 'count', names = 'locale', width = 650, height = 650)

# Donut styling: labels inside the ring, white separators between the slices.
fig.update_traces(textposition = 'inside', 
                  textinfo = 'percent + label', 
                  hole = 0.7, 
                  marker = dict(colors = ['#90afc5','#336b87','#2a3132','#763626'], line = dict(color = 'white', width = 2)))

# Caption placed in the hole of the donut.
fig.update_layout(annotations = [dict(text = ' The count of districts <br>in each type <br>of areas', 
                                      x = 0.5, y = 0.5, font_size = 20, showarrow = False, 
                                      font_family = 'arial',
                                      font_color = '#283655')],
                  showlegend = False)

fig.show()

Looking at the count plot, we can see that more than 50% of the school districts are located in suburbs.


Since the values of pct_black/hispanic, pct_free/reduced and pp_total_raw are given as intervals, we replace each of them by the midpoint of its interval. As mentioned above, county_connections_ratio is now the same for every data point, so we drop this column.

In [None]:
# Replace every interval string by its lower bound plus a fixed offset
# (0.1 for the two share columns, 1000 for pp_total_raw).
def _interval_start(interval):
    """Return the text between the opening bracket and the first comma."""
    return interval.split(',')[0][1:]


for share_col in ('pct_black/hispanic', 'pct_free/reduced'):
    districts_imputed[share_col] = districts_imputed[share_col].apply(
        lambda interval: float(_interval_start(interval)) + 0.1
    )

districts_imputed['pp_total_raw'] = districts_imputed['pp_total_raw'].apply(
    lambda interval: int(_interval_start(interval)) + 1000
)

# The ratio column no longer carries information, so it is removed.
districts_imputed.drop('county_connections_ratio', axis = 1, inplace = True)

districts_imputed.head(5)

In [None]:
# Average of each district characteristic per locale type.
dist_area_group = (
    districts_imputed
    .groupby('locale')
    .agg({'pct_black/hispanic': 'mean', 'pct_free/reduced': 'mean', 'pp_total_raw': 'mean'})
    .reset_index()
)

colors = ['#90afc5', '#336b87', '#763626']
# Every column after 'locale' gets its own panel and its own colour.
metric_names = dist_area_group.columns.tolist()[1:]

fig = plt.figure(figsize = (12,10))
for panel_no, (metric, shade) in enumerate(zip(metric_names, colors)):
    plt.subplot(2, 2, panel_no + 1)
    sns.set_style("white")
    plt.title(metric, size = 16, fontname = 'arial', y = 1.09, color = shade)
    plt.grid(color = 'gray', linestyle = ':', axis = 'y', zorder = 0,  dashes = (1,7))
    a = sns.barplot(data = dist_area_group, x = 'locale', y = metric, color = shade)
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'arial', size = 12)
    plt.yticks([])

    # Only the bottom spine stays visible, drawn slightly thicker.
    for side in ['right', 'top', 'left']:
        a.spines[side].set_visible(False)
    a.spines['bottom'].set_linewidth(1.4)

    # The first two panels show fractions (printed as percentages), the last one
    # a dollar amount; the text sits slightly below the top of each bar.
    shows_fraction = panel_no < 2
    for bar in a.patches:
        bar_height = bar.get_height()
        if shows_fraction:
            bar_label = f'{int(bar_height*100)} %'
            label_y = bar_height - 0.03
        else:
            bar_label = f'{int(bar_height)} $'
            label_y = bar_height - 1000
        a.annotate(bar_label, (bar.get_x() + bar.get_width() / 2, label_y),
                   ha = 'center', va = 'center',
                   size = 12,
                   xytext = (0, 5),
                   textcoords = 'offset points',
                   color = 'white',
                   fontname = 'arial')

plt.figtext(0.07, 1.05, 'Characteristics of school districts by locale', fontsize = 20, fontname = 'arial', color = '#283655')
fig.tight_layout(pad = 3)

plt.show()

As we notice that:

* The highest share of students identified as Black or Hispanic is in cities.
* 50% of the students in cities and towns are eligible for free or reduced-price lunch.
* The highest total expenditure per student is in rural areas.

**2. Product**

In [None]:
# Number of products per provider, top 15. The columns are named explicitly
# because value_counts().reset_index() produced ('index', <column name>) before
# pandas 2.0 but (<column name>, 'count') afterwards.
provider_counts = (
    products['Provider/Company Name']
    .value_counts()
    .rename_axis('provider')
    .reset_index(name='n_products')
    .head(15)
)

plt.figure(figsize = (15, 8))
plt.title('TOP-15 of learning providers/companies')
a = sns.barplot(data = provider_counts, x = 'n_products', y = 'provider', palette='rocket')
plt.xticks([])
plt.yticks(fontname = 'arial', fontsize = 12, color = '#283655')
plt.ylabel('')
plt.xlabel('')

a.spines['left'].set_linewidth(1.5)
for w in ['right', 'top', 'bottom']:
    a.spines[w].set_visible(False)

# Write each bar's count just past its right end (the x tick labels are hidden).
for p in a.patches:
    width = p.get_width()
    plt.text(0.5 + width, p.get_y() + 0.55 * p.get_height(), f'{int(width)}',
             ha = 'center', va = 'center', fontname = 'arial', fontsize = 15, color = '#283655')
plt.show()

In [None]:
# Products per education sector (at most 15 rows). Column names are set explicitly
# rather than relying on the ('index', 'Sector(s)') names that
# value_counts().reset_index() only produced before pandas 2.0.
sector_counts = (
    products['Sector(s)']
    .value_counts()
    .rename_axis('sector')
    .reset_index(name='count')
    .head(15)
)

fig = px.pie(sector_counts, values = 'count', names = 'sector', width = 650, height = 650)

# Donut styling with the labels drawn inside the ring.
fig.update_traces(textposition = 'inside', 
                  textinfo = 'percent + label', 
                  hole = 0.7,marker = dict(colors = ['#336b87','#2a3132','#763626']))

# Caption placed in the hole of the donut.
fig.update_layout(annotations = [dict(text = 'Sector of education <br>where the product is used', 
                                      x = 0.5, y = 0.5, font_size = 20, showarrow = False, 
                                      font_family = 'arial',
                                      font_color = '#283655')],
                  showlegend = False)

fig.show()

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16,4))

# Left panel: number of products per main function category.
sns.countplot(data=products, x='funct_main', palette ='rocket', ax=ax[0])
ax[0].set_title('Main Categories in Primary Functions')

# Right panel: the 'LC' main category broken down by sub-category.
lc_products = products[products['funct_main'] == 'LC']
sns.countplot(data=lc_products, x='funct_sub', palette ='rocket', ax=ax[1])
ax[1].set_title('Sub-Categories in Primary Function LC')
# Rotate the sub-category names so the long labels do not overlap.
ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=90)
plt.show()

**3. Engagement of students**

First, I remove weekends because there are no classes on those days; this avoids distracting dips in the plots.

In [None]:
# dayofweek counts Monday as 0 and Sunday as 6, so values below 5 are Monday-Friday.
engagement_imputed['weekday'] = engagement_imputed['time'].dt.dayofweek
engagement_only_weekday = engagement_imputed.loc[engagement_imputed['weekday'].lt(5)]

In [None]:
# LP IDs of every product whose sub-category is "Virtual Classroom"
is_virtual_classroom = products['funct_sub'] == 'Virtual Classroom'
vrclass_lp_id = products.loc[is_virtual_classroom, 'LP ID'].unique()
vrclass_lp_id

#function to annotate the interval of x-axis. Refer https://stackoverflow.com/questions/38677467/how-to-annotate-a-range-of-the-x-axis-in-matplotlib
def annotation_line( ax, xmin, xmax, y, text, ytext=150, linecolor='black', linewidth=1, fontsize=12 ):
    """Mark the x-range [xmin, xmax] on ``ax`` with a horizontal bracket and a caption.

    Parameters
    ----------
    ax : object exposing ``annotate`` and ``get_ylim`` (e.g. a matplotlib Axes)
    xmin, xmax : start and end of the range, in data coordinates
    y : height of the bracket, in data coordinates
    text : caption centred over the range
    ytext : height of the caption in data coordinates; 0 means "place it
        automatically, 1/20 of the y-axis span above ``y``"
    linecolor, linewidth : colour and width of the bracket
    fontsize : font size of the caption
    """
    # Two overlaid arrows drawn between the same end points: '|-|' first, then '<->'.
    for arrow_style in ('|-|', '<->'):
        ax.annotate('', xy=(xmin, y), xytext=(xmax, y), xycoords='data', textcoords='data',
                    arrowprops={'arrowstyle': arrow_style, 'color': linecolor, 'linewidth': linewidth})

    if ytext == 0:
        y_low, y_high = ax.get_ylim()
        ytext = y + (y_high - y_low) / 20

    # Midpoint expressed as "start + half the span".
    xcenter = xmin + (xmax - xmin) / 2
    ax.annotate(text, xy=(xcenter, ytext), ha='center', va='center', fontsize=fontsize)
    

In [None]:
# Daily mean pct_access of each Virtual Classroom product (weekdays only)
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 6))
for i in vrclass_lp_id:
    product_rows = engagement_only_weekday[engagement_only_weekday.lp_id==i]
    temp = product_rows.groupby('time').pct_access.mean().to_frame().reset_index(drop = False)
    # Legend entry: name of the product that carries this LP ID.
    product_label = products[products['LP ID']==i]['Product Name'].values[0]
    sns.lineplot(data=temp, x = temp.time, y= temp.pct_access, label = product_label, palette='rocket')

# Arrow pointing at 2020-03-11 on the x-axis, with its label 150 points above.
ax.annotate('WHO declared the pandemic',
            xy =( np.datetime64('2020-03-11'), 0),
            xycoords='data',
            xytext=(0, 150),
            textcoords='offset points',
            size = 13,
            ha='center',
            va='center',
            arrowprops=dict(arrowstyle='->', color='black'))

# Bracket spanning the summer break.
annotation_line( ax=ax, text='Summer break',
                 xmin=np.datetime64('2020-06-25'), xmax=np.datetime64('2020-08-10'),
                 y=3, ytext=5, linewidth=2, linecolor='black', fontsize=14 )

ax.set_title('Percentage students have at least one page-load event on a given day with the Virtual Class products')
plt.legend()
plt.show()

With the plot above, we can notice that:

* Zoom and Meet are the two most prevalent software for online classes.
* Homeschooling starts at the beginning after WHO declared the pandemic.
* Summer break is during July and August, thus there are almost no activities.
* The noticeable increase in use of Zoom and Meet products after summer break due to the new wave of pandemic.
* There are a few drop points throughout the year. These might be the national holidays.
* During the winter term, only about 15% of students in the districts had at least one page-load event for Zoom or Meet. This suggests that not all students had to attend classes virtually; some may have had classes on campus. Compared with the [State-by-State Map of Where School Buildings Are Opened or Closed](https://www.edweek.org/leadership/map-where-are-schools-closed/2020/07), this seems to hold true, because a lot of schools offered in-person lectures.

In [None]:
# Daily mean engagement_index of each Virtual Classroom product (weekdays only)
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 6))
for i in vrclass_lp_id:
    product_rows = engagement_only_weekday[engagement_only_weekday.lp_id==i]
    temp = product_rows.groupby('time').engagement_index.mean().to_frame().reset_index(drop = False)
    # Legend entry: name of the product that carries this LP ID.
    product_label = products[products['LP ID']==i]['Product Name'].values[0]
    sns.lineplot(data=temp, x = temp.time, y= temp.engagement_index, label = product_label, palette='rocket')

# Arrow pointing at 2020-03-11 on the x-axis, with its label 150 points above.
ax.annotate('WHO declared the pandemic',
            xy =( np.datetime64('2020-03-11'), 0),
            xycoords='data',
            xytext=(0, 150),
            textcoords='offset points',
            size = 13,
            ha='center',
            va='center',
            arrowprops=dict(arrowstyle='->', color='black'))

# Bracket spanning the summer break.
annotation_line( ax=ax, text='Summer break',
                 xmin=np.datetime64('2020-06-25'), xmax=np.datetime64('2020-08-10'),
                 y=1000, ytext=1500, linewidth=2, linecolor='black', fontsize=14 )

ax.set_title('Total page-load events per 1000 students on a given day with the Virtual Class products')
plt.legend()
plt.show()

As observed, pct_access for Zoom and Meet is roughly similar, but Meet's engagement_index is more than 4 times that of Zoom in the last quarter of 2020. For example, 1000 page-load events per 1000 students for Zoom on a given day means that, on average, each student uses Zoom once that day. In contrast, Meet is used 4 or 5 times daily on average per student.

In [None]:
# Give products a key with the same name as in the engagement table so they can be merged.
products['lp_id'] =products['LP ID'].copy()
sub = products[products.funct_main == 'LC'].funct_sub.unique()

# Three panels per row. The grid is never smaller than the previous fixed 3x3
# layout and grows if there are more than nine sub-categories, instead of
# failing with an IndexError.
n_cols = 3
n_rows = max(3, -(-len(sub) // n_cols))  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize = (18, 8))
panels = ax.ravel()  # row-major order: same fill order as the former i/j counters

for k, panel in zip(sub, panels):
    lp_ids = products[products.funct_sub == k]['LP ID'].unique()
    temp = engagement_only_weekday[engagement_only_weekday['lp_id'].isin(lp_ids)]
    # Mean daily pct_access per product, highest first; keep the five best.
    temp = temp.groupby('lp_id').pct_access.mean().sort_values(ascending = False).to_frame().reset_index(drop = False)
    temp = temp.merge(products[['lp_id', 'Product Name']], on='lp_id').head()
    sns.barplot(data = temp, x='pct_access', y='Product Name', palette='rocket', ax=panel)
    panel.set_title(f'Top 5 in \n{k}', fontsize=12)
    # Shared x-range so the panels can be compared directly.
    panel.set_xlim([0, 20])

# Remove whichever panels stayed empty rather than hard-coding ax[2, 1] and ax[2, 2].
for unused_panel in panels[len(sub):]:
    fig.delaxes(unused_panel)
plt.tight_layout()
plt.show()

Looking more closely at the most common category, LC (Learning & Curriculum), we see 5 sub-categories and the top 5 most accessed products of each. We notice that most products are accessed on average by less than 5% of students on a daily basis, except Google Docs, YouTube and Canvas. It is hard to say anything definite about the trend in YouTube usage, because it serves both studying and entertainment. In contrast, Google Docs and Canvas make a lot of sense, since they are used for education. The average pct_access of Career Planning and Job Search is very low, which might be because it is only relevant to senior students.