We will use Python for data processing, understanding and insights.

Import python libraries for data load and later import other libraries as needed

In [None]:
import numpy as np 
import pandas as pd 
import glob
import os 
import matplotlib.pyplot as plt 
import seaborn as sns 

**Data Definition:**

A) District information data
* The district file districts_info.csv includes information about the characteristics of school districts, including data from NCES (2018-19), FCC (Dec 2018), and Edunomics Lab. 
 * In this data set, we removed the identifiable information about the school districts. We also used an open source tool ARX (Prasser et al. 2020) to transform several data fields and reduce the risks of re-identification. 
 * For data generalization purposes some data points are released with a range where the actual value falls under. 
  * Additionally, there are many missing data marked as 'NaN' indicating that the data was suppressed to maximize anonymization of the dataset.

In [None]:
# Load the district characteristics file into a DataFrame.
districts_data = pd.read_csv(
    '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
)

In [None]:
# Print column dtypes and non-null counts for the district table.
districts_data.info()

Several features (e.g. state and locale) appear to have missing values; we examine them further below.

In [None]:
# Preview the first 10 district rows.
districts_data.head(10)

In [None]:
# Summary statistics for every column, including non-numeric ones (include='all').
districts_data.describe(include='all')

In [None]:
# Count missing values per column before any rows are dropped.
districts_data.isna().sum()

In [None]:
# Keep only rows with at least 6 non-missing values (thresh=6); the frame is modified in place.
districts_data.dropna(thresh=6, inplace=True)

The cell above drops rows that have fewer than six non-missing values (`thresh=6`), i.e. rows where most of the district information is missing, as they are not useful for further analysis.

In [None]:
# Re-count missing values per column after the sparse rows were dropped.
districts_data.isna().sum()

**Q: What is the distribution count of locale feature?**

In [None]:
# Number of districts in each locale category.
districts_data['locale'].value_counts()

In [None]:
# Pie chart of the share of districts per locale.
locale_counts = districts_data['locale'].value_counts()

# Size the explode list from the data: a hard-coded 4-element list makes
# matplotlib raise as soon as the number of locale categories differs.
locale_counts.plot(
    kind='pie',
    explode=[0.05] * len(locale_counts),
    fontsize=14,
    autopct='%3.1f%%',
    figsize=(5, 5),
    shadow=True,
    startangle=135,
    legend=False,
    cmap='summer',
)

plt.ylabel('Locale Type')

We can see that locale 'Suburb' has the most data points, followed by 'Rural', 'City' and 'Town'.

In [None]:
# Histogram of districts per state, coloured by locale.
sns.displot(
    data=districts_data,
    x='state',
    hue='locale',
    height=8,
    aspect=3,
)

In [None]:
# One histogram panel per locale, with states on the y-axis.
sns.displot(
    data=districts_data,
    y='state',
    hue='locale',
    col='locale',
    height=4,
    aspect=.6,
)

* The plot above shows that the states 'Illinois', 'Utah', 'Connecticut', 'Massachusetts' and 'Ohio' have the majority of data points for locale 'Suburb'.
* The states 'Connecticut', 'New York', 'Illinois' and 'Indiana' have the majority of data points for locale 'Rural'.
* The states 'California', 'Utah', 'Washington' and 'D.C.' have the majority of data points for locale 'City'.
* The state 'Utah' has the majority of data points for locale 'Town'.

In [None]:
# 'pct_black/hispanic' is released as a range string (presumably of the form
# "[lo, hi[" -- TODO confirm against the raw file). Split it at the comma into
# its two bounds.
pct_black_hispanic = districts_data['pct_black/hispanic'].str.split(",", n=1, expand=True)

# Strip the literal '[' characters. regex=False is required: a lone '[' is not a
# valid regular expression, so regex=True raises re.error on pandas >= 2.0
# (older versions silently treated single characters as literals).
# NOTE(review): despite their names, 'pct_black' and 'pct_hispanic' hold the
# first and second part of the same range string, not separate demographics.
districts_data['pct_black'] = pct_black_hispanic[0].str.replace('[', '', regex=False)
districts_data['pct_hispanic'] = pct_black_hispanic[1].str.replace('[', '', regex=False)

districts_data['pct_black'] = pd.to_numeric(districts_data['pct_black'])
districts_data['pct_hispanic'] = pd.to_numeric(districts_data['pct_hispanic'])

# Represent each range by the mean of its two parts.
districts_data['pct_black_and_hispanic'] = (districts_data['pct_black'] + districts_data['pct_hispanic']) / 2

In [None]:
# Kernel density of the pct_black_and_hispanic midpoint, one curve per locale.
sns.displot(
    data=districts_data,
    x='pct_black_and_hispanic',
    hue='locale',
    kind='kde',
)

The plot above shows that locales 'Suburb' and 'Rural' appear to have the majority of data points for pct_black_and_hispanic.


In [None]:

# 'pct_free/reduced' is released as a range string (presumably "[lo, hi[" --
# TODO confirm). Split it at the comma into its two bounds.
pct_free_reduced = districts_data['pct_free/reduced'].str.split(",", n=1, expand=True)

# regex=False: a lone '[' is an invalid regular expression and makes
# str.replace raise re.error on pandas >= 2.0 when regex=True.
# NOTE(review): 'pct_free' / 'pct_reduced' are the first and second part of the
# same range string, not two different measures.
districts_data['pct_free'] = pct_free_reduced[0].str.replace('[', '', regex=False)
districts_data['pct_reduced'] = pct_free_reduced[1].str.replace('[', '', regex=False)

districts_data['pct_free'] = pd.to_numeric(districts_data['pct_free'])
districts_data['pct_reduced'] = pd.to_numeric(districts_data['pct_reduced'])

In [None]:

# Fill missing bounds with the column median. The result is assigned back
# explicitly: calling fillna(inplace=True) on a column selected with [] is
# chained assignment, which warns on pandas 2.x and does not update the
# DataFrame once Copy-on-Write is enabled.
districts_data['pct_free'] = districts_data['pct_free'].fillna(districts_data['pct_free'].median())
districts_data['pct_reduced'] = districts_data['pct_reduced'].fillna(districts_data['pct_reduced'].median())

# Represent each range by the mean of its two parts.
districts_data['pct_free_and_reduced'] = (districts_data['pct_free'] + districts_data['pct_reduced']) / 2

In [None]:
# Kernel density of the pct_free_and_reduced midpoint, one curve per locale.
sns.displot(
    data=districts_data,
    x='pct_free_and_reduced',
    hue='locale',
    kind='kde',
)

The plot above shows that locale 'Suburb' appears to have the majority of data points for pct_free_and_reduced.

In [None]:

# Histogram of the pct_free_and_reduced midpoint, coloured by state.
sns.displot(
    data=districts_data,
    x="pct_free_and_reduced",
    hue='state',
    height=8,
    aspect=.8,
)

The plot above shows that most states have data points concentrated around average pct_free_and_reduced values.

In [None]:

# 'county_connections_ratio' is released as a range string (presumably
# "[lo, hi[" -- TODO confirm). Split it at the comma into its two bounds.
county_connections_ratio = districts_data['county_connections_ratio'].str.split(",", n=1, expand=True)

# regex=False: a lone '[' is an invalid regular expression and makes
# str.replace raise re.error on pandas >= 2.0 when regex=True.
# NOTE(review): 'county' / 'connections' are the first and second part of the
# same range string, not two different measures.
districts_data['county'] = county_connections_ratio[0].str.replace('[', '', regex=False)
districts_data['connections'] = county_connections_ratio[1].str.replace('[', '', regex=False)

districts_data['county'] = pd.to_numeric(districts_data['county'])
districts_data['connections'] = pd.to_numeric(districts_data['connections'])

In [None]:

# Fill missing bounds with the column median. Assign the result back instead of
# using fillna(inplace=True) on a [] selection: that chained form warns on
# pandas 2.x and does not update the DataFrame once Copy-on-Write is enabled.
districts_data['county'] = districts_data['county'].fillna(districts_data['county'].median())
districts_data['connections'] = districts_data['connections'].fillna(districts_data['connections'].median())

# Represent each range by the mean of its two parts.
districts_data['cc_ratio'] = (districts_data['county'] + districts_data['connections']) / 2

In [None]:
# Histogram of the cc_ratio midpoint, coloured by state.
sns.displot(
    data=districts_data,
    x="cc_ratio",
    hue='state',
    height=5,
    aspect=.8,
)

The plot above shows that, for the majority of states, the data points are concentrated at lower cc_ratio values.

In [None]:
# Histogram of the cc_ratio midpoint, coloured by locale.
sns.displot(
    data=districts_data,
    x="cc_ratio",
    hue='locale',
    height=5,
    aspect=.8,
)

The plot above shows that locale 'Suburb' has the highest counts across the cc_ratio distribution.

In [None]:

# 'pp_total_raw' is released as a range string (presumably "[lo, hi[" --
# TODO confirm). Split it at the comma into its two bounds.
pp_total_raw = districts_data['pp_total_raw'].str.split(",", n=1, expand=True)

# regex=False: a lone '[' is an invalid regular expression and makes
# str.replace raise re.error on pandas >= 2.0 when regex=True.
# NOTE(review): 'pp_local' / 'pp_federal' are the first and second part of the
# same range string, not local vs. federal spending.
districts_data['pp_local'] = pp_total_raw[0].str.replace('[', '', regex=False)
districts_data['pp_federal'] = pp_total_raw[1].str.replace('[', '', regex=False)

districts_data['pp_local'] = pd.to_numeric(districts_data['pp_local'])
districts_data['pp_federal'] = pd.to_numeric(districts_data['pp_federal'])

In [None]:

# Fill missing bounds with the column median. Assign the result back instead of
# using fillna(inplace=True) on a [] selection: that chained form warns on
# pandas 2.x and does not update the DataFrame once Copy-on-Write is enabled.
districts_data['pp_local'] = districts_data['pp_local'].fillna(districts_data['pp_local'].median())
districts_data['pp_federal'] = districts_data['pp_federal'].fillna(districts_data['pp_federal'].median())

# Represent each range by the mean of its two parts.
districts_data['pp_local_and_federal'] = (districts_data['pp_local'] + districts_data['pp_federal']) / 2

In [None]:
# Kernel density of the pp_local_and_federal midpoint, one curve per locale.
sns.displot(
    data=districts_data,
    x='pp_local_and_federal',
    hue='locale',
    kind='kde',
)

The plot above shows that locale 'Suburb' appears to have the majority of data points for pp_local_and_federal.

In [None]:
# Histogram of the pp_local_and_federal midpoint, coloured by state.
sns.displot(
    data=districts_data,
    x="pp_local_and_federal",
    hue='state',
    height=8,
    aspect=.8,
)

In [None]:
# Inspect the frame including the helper columns derived from the range strings.
districts_data.info()

In [None]:
# Remove the raw range strings and their parsed halves; only the midpoint
# columns are kept for the rest of the analysis.
intermediate_columns = [
    'pct_black/hispanic', 'pct_black', 'pct_hispanic',
    'pct_free/reduced', 'pct_free', 'pct_reduced',
    'county_connections_ratio', 'pp_total_raw',
    'county', 'connections',
    'pp_local', 'pp_federal',
]
districts_data = districts_data.drop(columns=intermediate_columns)
districts_data.info()

In [None]:
# Give the midpoint columns the names of the original range columns.
districts_data.rename(
    columns={
        'pct_black_and_hispanic': 'pct_black/hispanic',
        'pct_free_and_reduced': 'pct_free/reduced',
        'cc_ratio': 'county_connections_ratio',
        'pp_local_and_federal': 'pp_total_raw',
    },
    inplace=True,
)

In [None]:
# Confirm the final column names and dtypes of the district table.
districts_data.info()

In [None]:

# Median pct_black/hispanic and pct_free/reduced per state, as grouped bars.
(
    districts_data
    .groupby(['state'])[['pct_black/hispanic', 'pct_free/reduced']]
    .median()
    .plot(kind='bar', figsize=(15, 7), color=['red', 'blue'])
)

In [None]:

# Median pct_black/hispanic and pct_free/reduced per locale, as grouped bars.
(
    districts_data
    .groupby(['locale'])[['pct_black/hispanic', 'pct_free/reduced']]
    .median()
    .plot(kind='bar', figsize=(15, 7), color=['black', 'green'])
)

In [None]:
# Spread of pct_black/hispanic within each locale.
sns.boxplot(
    data=districts_data,
    x='locale',
    y='pct_black/hispanic',
    palette='rainbow',
)

In [None]:
# Spread of pct_free/reduced within each locale.
sns.boxplot(
    data=districts_data,
    x='locale',
    y='pct_free/reduced',
    palette='rainbow',
)

In [None]:
# Spread of county_connections_ratio within each locale.
sns.boxplot(
    data=districts_data,
    x='locale',
    y='county_connections_ratio',
    palette='rainbow',
)

In [None]:
# Spread of pp_total_raw within each locale.
sns.boxplot(
    data=districts_data,
    x='locale',
    y='pp_total_raw',
    palette='rainbow',
)

**Data Definition:**

B) Product information data
* The product file products_info.csv includes information about the characteristics of the top 372 products with most users in 2020. 
 * The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by our team. 
 * Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

In [None]:
# Load the product characteristics file into a DataFrame.
products_data = pd.read_csv(
    '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
)

In [None]:
# Print column dtypes and non-null counts for the product table.
products_data.info()

In [None]:
# Preview the first 10 product rows.
products_data.head(10)

In [None]:
# Summary statistics for every column, including non-numeric ones (include='all').
products_data.describe(include='all')

In [None]:
# Rename 'LP ID' to 'lp_id' (in place); 'lp_id' is the key used later to merge with the engagement data.
products_data.rename(columns={'LP ID':'lp_id'}, inplace=True)

In [None]:
# Count missing values per product column.
products_data.isna().sum()

In [None]:
# Label unknown providers explicitly. Assign the result back rather than using
# fillna(inplace=True) on a [] selection: that chained form warns on pandas 2.x
# and does not update the DataFrame once Copy-on-Write is enabled.
products_data['Provider/Company Name'] = products_data['Provider/Company Name'].fillna("Missing")

In [None]:
# Number of products per sector label before missing values are filled.
products_data['Sector(s)'].value_counts()

In [None]:
# Impute missing sectors with the most frequent label. mode() returns a Series
# (ties are possible), so the first entry is used.
sector_mode = products_data['Sector(s)'].mode()
# Assign back instead of chained fillna(inplace=True), which warns on
# pandas 2.x and does not update the DataFrame once Copy-on-Write is enabled.
products_data['Sector(s)'] = products_data['Sector(s)'].fillna(value=sector_mode[0])

**Q: What is the distribution count for Sectors feature?**

In [None]:
# Number of products per sector label after the mode imputation.
products_data['Sector(s)'].value_counts()

In [None]:
# Pie chart of the share of products per sector label.
sector_counts = products_data['Sector(s)'].value_counts()

# Size the explode list from the data: a hard-coded 5-element list makes
# matplotlib raise as soon as the number of sector labels differs.
sector_counts.plot(
    kind='pie',
    explode=[0.05] * len(sector_counts),
    fontsize=14,
    autopct='%3.1f%%',
    figsize=(5, 5),
    shadow=True,
    startangle=135,
    legend=False,
    cmap='summer',
)

plt.ylabel('Sector(s)')

In [None]:
# Histogram of products per sector label.
sns.displot(
    data=products_data,
    x='Sector(s)',
    height=5,
    aspect=2,
)

The plot above shows that the majority of data points for Sector(s) fall under 'PreK-12'.

In [None]:
# Impute missing primary functions with the most frequent label. mode() returns
# a Series (ties are possible), so the first entry is used.
essential_mode = products_data['Primary Essential Function'].mode()
# Assign back instead of chained fillna(inplace=True), which warns on
# pandas 2.x and does not update the DataFrame once Copy-on-Write is enabled.
products_data['Primary Essential Function'] = products_data['Primary Essential Function'].fillna(value=essential_mode[0])

In [None]:
# Histogram of products per Primary Essential Function (labels on the y-axis).
sns.displot(
    data=products_data,
    y='Primary Essential Function',
    height=8,
    aspect=2,
)

The plot above shows that the majority of data points have an LC-based Primary Essential Function.

**Data Definition:**

C) Engagement data
* The engagement data are aggregated at school district level, and each file in the folder engagement_data represents data from one school district. 
 * The 4-digit file name represents district_id which can be used to link to district information in districts_info.csv. 
 * The lp_id can be used to link to product information in products_info.csv.

In [None]:

# Read every per-district engagement CSV and stack them into one DataFrame.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
print(path)
csv_files = glob.glob(os.path.join(path, "*.csv"))
if not csv_files:
    # Fail here with a clear message; otherwise engagement_data would be
    # undefined and a later cell would die with a NameError.
    raise FileNotFoundError(f"No engagement CSV files found in {path!r}")

data = []
for f in csv_files:
    frame = pd.read_csv(f)
    # The file name without its extension is stored as the district identifier.
    filename = os.path.splitext(f)
    frame['district_id'] = os.path.basename(filename[0])
    data.append(frame)

# Concatenate once, after the loop. Doing this inside the loop rebuilt the
# combined frame for every file, re-copying all previously loaded rows.
engagement_data = pd.concat(data)

In [None]:
# Column dtypes with explicit non-null counts for the combined engagement data.
engagement_data.info(show_counts=True)

In [None]:
# Preview the first 10 engagement rows.
engagement_data.head(10)

In [None]:
# Summary statistics for every column, including non-numeric ones (include='all').
engagement_data.describe(include='all')

In [None]:
# district_id was taken from the file name as a string; convert it to a number
# before it is used as the merge key with districts_data.
engagement_data['district_id'] = pd.to_numeric(engagement_data['district_id'])

In [None]:
# Verify the dtype of district_id after the numeric conversion.
engagement_data.info(show_counts=True)

In [None]:
# Replace missing product ids with 0 and cast the column to int.
# NOTE(review): 0 acts as a placeholder id; whether such rows should be kept or
# dropped before merging with products_data is not evident here -- confirm intent.
engagement_data['lp_id']= engagement_data['lp_id'].fillna(0.0).astype(int)

In [None]:
# Verify the dtype of lp_id after the integer cast.
engagement_data.info(show_counts=True)

In [None]:
# Preview the first 10 engagement rows after the type conversions.
engagement_data.head(10)

In [None]:
# Attach district attributes to every engagement row. The join type is the
# pandas default (inner), so rows without a matching district_id are dropped.
learnPlatform_data = engagement_data.merge(
    districts_data,
    how='inner',
    on=['district_id'],
)

In [None]:
# Column dtypes and non-null counts after merging in the district attributes.
learnPlatform_data.info(show_counts=True)

In [None]:
# Preview the first 10 rows of the engagement + district merge.
learnPlatform_data.head(10)

In [None]:
# Attach product attributes. The join type is the pandas default (inner), so
# rows whose lp_id has no match in products_data are dropped.
learnPlatform_data = learnPlatform_data.merge(
    products_data,
    how='inner',
    on=['lp_id'],
)

In [None]:
# Column dtypes and non-null counts after merging in the product attributes.
learnPlatform_data.info(show_counts=True)

In [None]:
# Fill missing engagement metrics with the column median. Assign the result
# back instead of using fillna(inplace=True) on a [] selection: that chained
# form warns on pandas 2.x and does not update the DataFrame once Copy-on-Write
# is enabled.
learnPlatform_data['engagement_index'] = learnPlatform_data['engagement_index'].fillna(learnPlatform_data['engagement_index'].median())

learnPlatform_data['pct_access'] = learnPlatform_data['pct_access'].fillna(learnPlatform_data['pct_access'].median())

**Data Definition:**

D) LearnPlatform data
* learnPlatform_data is the result of merging engagement_data with districts_data and products_data. 

In [None]:
# Column dtypes and non-null counts of the fully merged and imputed data set.
learnPlatform_data.info(show_counts=True)

In [None]:
# Preview the first 10 rows of the merged data set.
learnPlatform_data.head(10)

In [None]:
# Summary statistics for all columns, transposed so each column becomes a row.
learnPlatform_data.describe(include='all').transpose()

In [None]:
# Convert the 'time' column to datetime so it can be grouped and plotted as dates.
learnPlatform_data['time']= pd.to_datetime(learnPlatform_data['time'])

In [None]:
# Verify the dtype of 'time' after the datetime conversion.
learnPlatform_data.info()

In [None]:
# Median engagement_index for each Primary Essential Function.
(
    learnPlatform_data
    .groupby('Primary Essential Function')[['engagement_index']]
    .median()
    .plot(kind='bar', figsize=(15, 7), color=['blue'])
)

Above plot shows that engagement_index is higher for Primary Essential Function such as 'SDO-School Management Software- Mobile Device Management', 'SDO - Learning Management System (LMS)', 'SDO -School Management Software -SSO' and 'LC- Sites, Resources and References - Encyclopedia' followed by 'LC-Sites, Resources and References - Streaming Services' and 'LC - Study Tools'.

In [None]:
# Median pct_access for each Primary Essential Function.
(
    learnPlatform_data
    .groupby('Primary Essential Function')[['pct_access']]
    .median()
    .plot(kind='bar', figsize=(15, 7), color=['orange'])
)

Above plot shows that pct_access is higher for Primary Essential Function such as 'SDO-School Management Software- Mobile Device Management', 'SDO -School Management Software -SSO' followed by 'LC- Sites, Resources and References - Encyclopedia'.

In [None]:

# Median engagement_index for each sector label.
(
    learnPlatform_data
    .groupby('Sector(s)')[['engagement_index']]
    .median()
    .plot(kind='bar', figsize=(15, 7), color=['green'])
)

In [None]:

# Median pct_access for each sector label.
(
    learnPlatform_data
    .groupby('Sector(s)')[['pct_access']]
    .median()
    .plot(kind='bar', figsize=(15, 7), color=['lightgreen'])
)

In [None]:
# Pairwise correlation of the numeric columns. The selection is explicit
# because DataFrame.corr() no longer drops non-numeric columns silently on
# pandas >= 2.0 and raises on the text columns of this frame.
learnPlatform_data.select_dtypes(include='number').corr()

In [None]:
# Pearson correlation of the numeric columns only: DataFrame.corr() raises on
# the text columns of this frame with pandas >= 2.0 unless they are excluded.
corr = learnPlatform_data.select_dtypes(include='number').corr(method='pearson')
plt.figure(figsize=(15, 15))
# `linewidths` is the keyword documented by seaborn.heatmap for the cell borders.
sns.heatmap(corr, vmax=.8, linewidths=.01, square=True, annot=True, cmap='YlGnBu', linecolor='pink')

In [None]:

# Stacked bars of the median engagement and demographic metrics per locale.
(
    learnPlatform_data
    .groupby(['locale'])[['engagement_index', 'pct_access', 'pct_black/hispanic', 'pct_free/reduced']]
    .median()
    .plot(kind='bar', figsize=(15, 7), stacked=True)
)

In [None]:

# Stacked bars of the median engagement and demographic metrics per sector label.
(
    learnPlatform_data
    .groupby(['Sector(s)'])[['engagement_index', 'pct_access', 'pct_black/hispanic', 'pct_free/reduced']]
    .median()
    .plot(kind='bar', figsize=(15, 7), stacked=True)
)

In [None]:

# Stacked bars of the median engagement and demographic metrics per state.
(
    learnPlatform_data
    .groupby(['state'])[['engagement_index', 'pct_access', 'pct_black/hispanic', 'pct_free/reduced']]
    .median()
    .plot(kind='bar', figsize=(15, 7), stacked=True)
)

In [None]:
# Spread of engagement_index within each sector label.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=learnPlatform_data, x='Sector(s)', y='engagement_index', palette='rainbow', ax=ax)

In [None]:
# Spread of engagement_index within each locale.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=learnPlatform_data, x='locale', y='engagement_index', palette='rainbow', ax=ax)

In [None]:
# Spread of pct_access within each sector label.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=learnPlatform_data, x='Sector(s)', y='pct_access', palette='rainbow', ax=ax)

In [None]:
# Spread of pct_access within each locale.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=learnPlatform_data, x='locale', y='pct_access', palette='rainbow', ax=ax)

In [None]:
# Spread of pct_black/hispanic within each sector label.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=learnPlatform_data, x='Sector(s)', y='pct_black/hispanic', palette='rainbow', ax=ax)

In [None]:
# Spread of pct_free/reduced within each sector label.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=learnPlatform_data, x='Sector(s)', y='pct_free/reduced', palette='rainbow', ax=ax)

In [None]:
# Spread of pp_total_raw within each sector label.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=learnPlatform_data, x='Sector(s)', y='pp_total_raw', palette='rainbow', ax=ax)

In [None]:
# Spread of county_connections_ratio within each sector label.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=learnPlatform_data, x='Sector(s)', y='county_connections_ratio', palette='rainbow', ax=ax)

In [None]:
# Per-date count of non-null values in every column.
# NOTE(review): this is a record count, not an aggregate of the metric values;
# the plots below labelled 'engagement_index' / 'pct_access' therefore show
# how many rows exist per day -- confirm this is the intended measure.
learnPlatform_data_by_date = (
    learnPlatform_data
    .groupby('time')
    .count()
)

In [None]:
# Line plot of the per-date 'engagement_index' column of the grouped frame.
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=learnPlatform_data_by_date, x='time', y='engagement_index', ax=ax)

In [None]:
# Line plot of the per-date 'pct_access' column of the grouped frame.
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=learnPlatform_data_by_date, x='time', y='pct_access', ax=ax)

In [None]:
# Plot every column of the per-date frame as its own line.
learnPlatform_data_by_date.plot(figsize=(15,8))

In [None]:
# Wide-form seaborn line plot: one line per column of the per-date frame.
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=learnPlatform_data_by_date, ax=ax)

In [None]:
from pandas.plotting import lag_plot

# Lag plot (each value against the next one) of the per-date engagement_index column.
fig, ax = plt.subplots(figsize=(15, 8))
lag_plot(learnPlatform_data_by_date.engagement_index, ax=ax)
plt.show()

In [None]:
# Lag plot (each value against the next one) of the per-date pct_access column.
fig, ax = plt.subplots(figsize=(15, 8))
lag_plot(learnPlatform_data_by_date.pct_access, ax=ax)
plt.show()

**Q:What are the top 10 Product Name with respect to Engagement Index?**

In [None]:

# Total engagement_index per product, largest first. Only the plotted column is
# aggregated: .agg('sum') over every column also tries to sum the datetime
# 'time' column (TypeError on pandas >= 2.0) and concatenates the text columns.
top_product_name = (
    learnPlatform_data
    .groupby(by='Product Name', as_index=False)['engagement_index']
    .sum()
    .sort_values(by='engagement_index', ascending=False)
)

figure = plt.figure(figsize=[15, 8])

sns.barplot(x='engagement_index', y='Product Name', data=top_product_name.head(10))
# set plot label
plt.xlabel(xlabel='Engagement Index', size=14)
plt.ylabel(ylabel='Product Name', size=14)
plt.title(label='Top 10 Product Name', size=16)
# Pass the flag positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so `b=True` fails on current releases.
plt.grid(True, axis='x')
plt.show()

**Q:What are the top Sectors with respect to Engagement Index?**

In [None]:

# Total engagement_index per sector label, largest first. Only the plotted
# column is aggregated: .agg('sum') over every column also tries to sum the
# datetime 'time' column (TypeError on pandas >= 2.0) and concatenates text.
top_sectors = (
    learnPlatform_data
    .groupby(by='Sector(s)', as_index=False)['engagement_index']
    .sum()
    .sort_values(by='engagement_index', ascending=False)
)

figure = plt.figure(figsize=[15, 8])

sns.barplot(x='engagement_index', y='Sector(s)', data=top_sectors.head(5))

plt.xlabel(xlabel='Engagement Index', size=14)
plt.ylabel(ylabel='Sectors', size=14)
plt.title(label='Top Sectors', size=16)
# Pass the flag positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so `b=True` fails on current releases.
plt.grid(True, axis='x')
plt.show()

**Q:What are the top 10 Primary Essential Function with respect to Engagement Index?**

In [None]:

# Total engagement_index per Primary Essential Function, largest first. Only
# the plotted column is aggregated: .agg('sum') over every column also tries to
# sum the datetime 'time' column (TypeError on pandas >= 2.0) and concatenates text.
top_primary_essential = (
    learnPlatform_data
    .groupby(by='Primary Essential Function', as_index=False)['engagement_index']
    .sum()
    .sort_values(by='engagement_index', ascending=False)
)

figure = plt.figure(figsize=[15, 10])

sns.barplot(x='engagement_index', y='Primary Essential Function', data=top_primary_essential.head(10))

plt.xlabel(xlabel='Engagement Index', size=14)
plt.ylabel(ylabel='Primary Essential Function', size=14)
plt.title(label='Top 10 Primary Essential Function', size=16)
# Pass the flag positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so `b=True` fails on current releases.
plt.grid(True, axis='x')
plt.show()

**Q:What are the top Primary Essential Function in LC Level with respect to Engagement Index?**

In [None]:

# Rows whose Primary Essential Function label contains "LC - ".
learnPlatform_LC = learnPlatform_data[learnPlatform_data['Primary Essential Function'].str.contains("LC - ")]
# Total engagement_index per function, largest first. Only the plotted column
# is aggregated: .agg('sum') over every column also tries to sum the datetime
# 'time' column (TypeError on pandas >= 2.0) and concatenates text.
top_primary_lc = (
    learnPlatform_LC
    .groupby(by='Primary Essential Function', as_index=False)['engagement_index']
    .sum()
    .sort_values(by='engagement_index', ascending=False)
)

figure = plt.figure(figsize=[15, 8])

sns.barplot(x='engagement_index', y='Primary Essential Function', data=top_primary_lc.head(5))

plt.xlabel(xlabel='Engagement Index', size=14)
plt.ylabel(ylabel='Primary Essential Function', size=14)
plt.title(label='Top 5 Primary Essential Function in LC Level', size=16)
# Pass the flag positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so `b=True` fails on current releases.
plt.grid(True, axis='x')
plt.show()

**Q:What are the top Primary Essential Function in CM Level with respect to Engagement Index?**

In [None]:
# Rows whose Primary Essential Function label contains "CM - ".
learnPlatform_CM = learnPlatform_data[learnPlatform_data['Primary Essential Function'].str.contains("CM - ")]
# Total engagement_index per function, largest first. Only the plotted column
# is aggregated: .agg('sum') over every column also tries to sum the datetime
# 'time' column (TypeError on pandas >= 2.0) and concatenates text.
top_primary_cm = (
    learnPlatform_CM
    .groupby(by='Primary Essential Function', as_index=False)['engagement_index']
    .sum()
    .sort_values(by='engagement_index', ascending=False)
)

figure = plt.figure(figsize=[15, 8])

sns.barplot(x='engagement_index', y='Primary Essential Function', data=top_primary_cm.head(5))

plt.xlabel(xlabel='Engagement Index', size=14)
plt.ylabel(ylabel='Primary Essential Function', size=14)
plt.title(label='Top 5 Primary Essential Function in CM Level', size=16)
# Pass the flag positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so `b=True` fails on current releases.
plt.grid(True, axis='x')
plt.show()

**Q:What are the top Primary Essential Function in SDO Level with respect to Engagement Index?**

In [None]:
# Rows whose Primary Essential Function label contains "SDO - ".
learnPlatform_SDO = learnPlatform_data[learnPlatform_data['Primary Essential Function'].str.contains("SDO - ")]
# Total engagement_index per function, largest first. Only the plotted column
# is aggregated: .agg('sum') over every column also tries to sum the datetime
# 'time' column (TypeError on pandas >= 2.0) and concatenates text.
top_primary_sdo = (
    learnPlatform_SDO
    .groupby(by='Primary Essential Function', as_index=False)['engagement_index']
    .sum()
    .sort_values(by='engagement_index', ascending=False)
)

figure = plt.figure(figsize=[15, 8])

sns.barplot(x='engagement_index', y='Primary Essential Function', data=top_primary_sdo.head(5))

plt.xlabel(xlabel='Engagement Index', size=14)
plt.ylabel(ylabel='Primary Essential Function', size=14)
plt.title(label='Top 5 Primary Essential Function in SDO Level', size=16)
# Pass the flag positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so `b=True` fails on current releases.
plt.grid(True, axis='x')
plt.show()

**Q:What are the top 10 States with respect to Engagement Index?**

In [None]:
# Total engagement_index per state, largest first. Only the plotted column is
# aggregated: .agg('sum') over every column also tries to sum the datetime
# 'time' column (TypeError on pandas >= 2.0) and concatenates the text columns.
top_state = (
    learnPlatform_data
    .groupby(by='state', as_index=False)['engagement_index']
    .sum()
    .sort_values(by='engagement_index', ascending=False)
)

figure = plt.figure(figsize=[15, 10])

sns.barplot(x='engagement_index', y='state', data=top_state.head(10))

plt.xlabel(xlabel='Engagement Index', size=14)
plt.ylabel(ylabel='State', size=14)
plt.title(label='Top 10 State', size=16)
# Pass the flag positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so `b=True` fails on current releases.
plt.grid(True, axis='x')
plt.show()

**Q:What are the top Locale with respect to Engagement Index?**

In [None]:

# Total engagement_index per locale, largest first. Only the plotted column is
# aggregated: .agg('sum') over every column also tries to sum the datetime
# 'time' column (TypeError on pandas >= 2.0) and concatenates the text columns.
top_locale = (
    learnPlatform_data
    .groupby(by='locale', as_index=False)['engagement_index']
    .sum()
    .sort_values(by='engagement_index', ascending=False)
)

figure = plt.figure(figsize=[15, 10])

sns.barplot(x='engagement_index', y='locale', data=top_locale.head(4))

plt.xlabel(xlabel='Engagement Index', size=14)
plt.ylabel(ylabel='Locale', size=14)
plt.title(label='Top Locale', size=16)
# Pass the flag positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so `b=True` fails on current releases.
plt.grid(True, axis='x')
plt.show()