In [None]:
import pandas as pd

**Overview**

LearnPlatform was founded in 2014 with a mission to expand equitable access to education technology for all students and teachers. Currently, its edtech effectiveness system is used by districts and states to continuously improve the safety, equity, and effectiveness of their education technology. 


**Objective**

This analysis uncovers trends in digital learning and explores the impact of Covid-19 on overall engagement across multiple digital products.



**Data**

Three types of data available for analysis:
* Products – all information about a certain product, including company, product type and product name
* Districts – all information about a certain district, including state, Black/Hispanic student ratio and government expenditure
* Engagement


**Key Takeaways**

* There are three major variables impacting the digital product usage the most
* Impact of Covid-19 - since March, the engagement of Google Docs and Google Classroom spiked and trended up further in Fall 2020. Some other products, such as YouTube and Meet, were not used by teachers and students before the pandemic
* Government Expenditure - We found there is a positive relationship between the government expenditure and product engagement.

**Load Datasets**

In [None]:
# Load the product metadata table from the dataset's input directory.
Products = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv')

In [None]:
# Preview the first rows of the product table.
Products.head()

In [None]:
# Positional rename of the product table headers.
# NOTE(review): this assumes the CSV has exactly these six columns in this order - confirm.
product_column_names = [
    'lp_id',
    'URL',
    'Product Name',
    'Provider/Company Name',
    'Sector(s)',
    'Primary Essential Function',
]
Products.columns = product_column_names

In [None]:
# Report how many product rows were loaded.
product_row_count = len(Products)
print('There are total ' + str(product_row_count) + ' samples')

In [None]:
# Share of missing values per product column (0 = none missing, 1 = all missing).
print('missing values: ')
Products.isna().sum() / len(Products)

In [None]:
# Keep only products that have a sector, an essential function and a provider.
required_product_fields = ['Sector(s)', 'Primary Essential Function', 'Provider/Company Name']
Products = Products.dropna(subset=required_product_fields)

In [None]:
# Load the district metadata table from the dataset's input directory.
districts = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')

In [None]:
# Preview the first rows of the district table.
districts.head()

In [None]:
# Remove any leading/trailing '[' or ']' characters from the range text, then
# split it on ',' into two float columns (lower and upper bound).
black_hispanic_text = districts['pct_black/hispanic'].str.strip('[]')
districts['pct_black/hispanic'] = black_hispanic_text
black_hispanic_bounds = black_hispanic_text.str.split(",", expand=True).astype(float)
districts[['pct_black/hispanic min', 'pct_black/hispanic max']] = black_hispanic_bounds



In [None]:
# Remove any leading/trailing '[' or ']' characters from the range text, then
# split it on ',' into two float columns (lower and upper bound).
free_reduced_text = districts['pct_free/reduced'].str.strip('[]')
districts['pct_free/reduced'] = free_reduced_text
free_reduced_bounds = free_reduced_text.str.split(",", expand=True).astype(float)
districts[['pct_free/reduced min', 'pct_free/reduced max']] = free_reduced_bounds



In [None]:
# Remove any leading/trailing '[' or ']' characters from the range text, then
# split it on ',' into two float columns (lower and upper bound).
connections_text = districts['county_connections_ratio'].str.strip('[]')
districts['county_connections_ratio'] = connections_text
connections_bounds = connections_text.str.split(",", expand=True).astype(float)
districts[['county_connections_ratio min', 'county_connections_ratio max']] = connections_bounds



In [None]:
# Remove any leading/trailing '[' or ']' characters from the range text, then
# split it on ',' into two float columns (lower and upper bound).
expenditure_text = districts['pp_total_raw'].str.strip('[]')
districts['pp_total_raw'] = expenditure_text
expenditure_bounds = expenditure_text.str.split(",", expand=True).astype(float)
districts[['expenditure min', 'expenditure max']] = expenditure_bounds



In [None]:
# The textual range columns are no longer needed once min/max columns exist.
raw_range_columns = ['pct_black/hispanic', 'pct_free/reduced', 'county_connections_ratio', 'pp_total_raw']
districts = districts.drop(columns=raw_range_columns)



In [None]:
# Report how many district rows were loaded.
district_row_count = len(districts)
print('there are total '+str(district_row_count) + ' in the data')

In [None]:
# Share of missing values per district column.
print('missing values: ')
districts.isna().sum() / len(districts)

In [None]:
# Imputation rules: the Black/Hispanic bounds are filled with 0, while the
# free/reduced-lunch and expenditure bounds are filled with the column median.
zero_fill_cols = ('pct_black/hispanic min', 'pct_black/hispanic max')
median_fill_cols = (
    'pct_free/reduced min',
    'pct_free/reduced max',
    'expenditure min',
    'expenditure max',
)

for col in districts.columns:
    if col in zero_fill_cols:
        districts[col] = districts[col].fillna(0)
    elif col in median_fill_cols:
        districts[col] = districts[col].fillna(districts[col].median())



In [None]:
# The county connection bounds are not used in the rest of the analysis.
county_connection_columns = ['county_connections_ratio min', 'county_connections_ratio max']
districts = districts.drop(columns=county_connection_columns)


In [None]:
all_files = """1000.csv* 2074.csv* 3160.csv* 4051.csv* 5057.csv* 6250.csv* 7785.csv* 8884.csv*1039.csv* 2106.csv* 3188.csv* 4083.csv* 5150.csv* 6345.csv* 7798.csv* 8902.csv*1044.csv* 2130.csv* 3222.csv* 4165.csv* 5231.csv* 6418.csv* 7829.csv* 8937.csv*1052.csv* 2165.csv* 3228.csv* 4183.csv* 5257.csv* 6512.csv* 7858.csv* 9007.csv*1131.csv* 2167.csv* 3248.csv* 4203.csv* 5380.csv* 6577.csv* 7964.csv* 9043.csv*1142.csv* 2172.csv* 3266.csv* 4314.csv* 5404.csv* 6584.csv* 7970.csv* 9120.csv*1179.csv* 2201.csv* 3301.csv* 4348.csv* 5422.csv* 6640.csv* 7975.csv* 9140.csv*1204.csv* 2209.csv* 3314.csv* 4373.csv* 5479.csv* 6665.csv* 7980.csv* 9230.csv*1270.csv* 2238.csv* 3322.csv* 4408.csv* 5510.csv* 6721.csv* 8017.csv* 9303.csv*1324.csv* 2257.csv* 3371.csv* 4516.csv* 5524.csv* 6762.csv* 8076.csv* 9357.csv*1444.csv* 2285.csv* 3390.csv* 4520.csv* 5527.csv* 6774.csv* 8103.csv* 9463.csv*1450.csv* 2321.csv* 3393.csv* 4550.csv* 5600.csv* 6919.csv* 8127.csv* 9478.csv*1470.csv* 2339.csv* 3412.csv* 4569.csv* 5604.csv* 6998.csv* 8160.csv* 9515.csv*1536.csv* 2393.csv* 3471.csv* 4591.csv* 5627.csv* 7086.csv* 8184.csv* 9536.csv*1549.csv* 2439.csv* 3550.csv* 4602.csv* 5802.csv* 7164.csv* 8256.csv* 9537.csv*1558.csv* 2441.csv* 3558.csv* 4629.csv* 5882.csv* 7177.csv* 8328.csv* 9553.csv*1570.csv* 2517.csv* 3580.csv* 4666.csv* 5890.csv* 7305.csv* 8425.csv* 9589.csv*1584.csv* 2549.csv* 3640.csv* 4668.csv* 5903.csv* 7308.csv* 8433.csv* 9729.csv*1624.csv* 2567.csv* 3668.csv* 4683.csv* 5934.csv* 7342.csv* 8515.csv* 9778.csv*1705.csv* 2598.csv* 3670.csv* 4744.csv* 5970.csv* 7352.csv* 8520.csv* 9812.csv*1712.csv* 2601.csv* 3692.csv* 4749.csv* 5987.csv* 7387.csv* 8539.csv* 9839.csv*1742.csv* 2685.csv* 3710.csv* 4775.csv* 6046.csv* 7457.csv* 8556.csv* 9899.csv*1772.csv* 2729.csv* 3732.csv* 4808.csv* 6049.csv* 7541.csv* 8685.csv* 9927.csv*1791.csv* 2779.csv* 3772.csv* 4921.csv* 6055.csv* 7614.csv* 8702.csv*1857.csv* 2870.csv* 3864.csv* 4929.csv* 6066.csv* 7660.csv* 8723.csv*1877.csv* 2872.csv* 3936.csv* 
4936.csv* 6104.csv* 7675.csv* 8748.csv*1904.csv* 2940.csv* 3959.csv* 4937.csv* 6131.csv* 7723.csv* 8784.csv*1965.csv* 2956.csv* 3986.csv* 4949.csv* 6144.csv* 7741.csv* 8796.csv*2017.csv* 2991.csv* 4029.csv* 5006.csv* 6165.csv* 7752.csv* 8815.csv*2060.csv* 3080.csv* 4031.csv* 5042.csv* 6194.csv* 7767.csv* 8845.csv*"""
# Split the '*'-delimited listing into individual file names. Items still carry
# surrounding whitespace here, and because the listing ends with '*', the final
# item is an empty string.
all_files_Array = all_files.split('*')


In [None]:
import numpy as np

# Directory that holds one engagement CSV per district.
ENGAGEMENT_DIR = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'

# Turn each listed file name into a path under ENGAGEMENT_DIR, in place.
for i, file in enumerate(all_files_Array):
    file = file.strip()
    # Guard against re-running this cell: without it, every run would prepend
    # the directory again and produce paths that do not exist.
    if not file.startswith(ENGAGEMENT_DIR):
        file = ENGAGEMENT_DIR + file
    all_files_Array[i] = file

In [None]:
# Discard the trailing entry that came from the empty string after the final '*'
# (it is only the directory prefix, not a file). Filtering on the '.csv' suffix
# rather than slicing off the last element keeps this cell safe to re-run:
# slicing again would silently drop a real district file.
all_files_Array = [path for path in all_files_Array if path.endswith('.csv')]

In [None]:
# Read every district's engagement file and tag its rows with the district id.
district_frames = []
for file_nm in all_files_Array:
    district_df = pd.read_csv(file_nm, index_col=None, header=0)
    # The file name without its extension is the district id. Taking the last
    # path component keeps this correct if the directory depth ever changes.
    district_id = file_nm.split("/")[-1].split(".")[0]
    district_df["district_id"] = district_id
    district_frames.append(district_df)

# ignore_index avoids repeated row labels (each file starts again at 0).
engagement_df = pd.concat(district_frames, ignore_index=True)
engagement_df.tail()

In [None]:
# Share of missing values per engagement column.
print('checking missing values...')
missing_share = engagement_df.isnull().sum() / len(engagement_df)
print(missing_share)

In [None]:
# Only a very small portion of pct_access and lp_id is missing, so rows missing
# either of them are simply removed.
engagement_df = engagement_df.dropna(subset=['pct_access', 'lp_id'])

In [None]:
# A larger portion of engagement_index is missing, so those rows cannot simply
# be dropped. The values are skewed (most are close to zero, a few are extremely
# large and pull the average up), so the median - not the mean - is used to
# impute the missing entries.
missing_col = ['engagement_index']
for col_name in missing_col:
    median_value = engagement_df.loc[:, col_name].median()
    is_missing = engagement_df.loc[:, col_name].isnull()
    engagement_df.loc[is_missing, col_name] = median_value

In [None]:
# Derive the calendar month of each observation from its timestamp.
engagement_df['month'] = pd.DatetimeIndex(engagement_df.time).month

# Aggregate to one row per (month, district, product). The numeric measures are
# selected explicitly: a bare .sum() would also try to "sum" the string `time`
# column on recent pandas versions, adding an unwanted column that breaks the
# later positional column handling.
engagement_df_mnth = (
    engagement_df
    .groupby(['month', 'district_id', 'lp_id'])[['pct_access', 'engagement_index']]
    .sum()
    .reset_index()
)

In [None]:
# Report the size of the monthly aggregate.
print('after aggregating at monthly level...')
monthly_row_count = len(engagement_df_mnth)
print('there are total ' + str(monthly_row_count) + ' samples in the engagement data')

In [None]:
# Cast the district key to str so it has the same type as the string ids that
# were taken from the engagement file names.
districts['district_id'] = districts['district_id'].map(str)

# Merge monthly engagement with the district data.
eng_dist = engagement_df_mnth.merge(districts, on='district_id', how='inner')
print(eng_dist.head())

In [None]:
# Merge the engagement + district table with the product data on the product id.
eng_dist_prod = eng_dist.merge(Products, on='lp_id', how='inner')
print(eng_dist_prod.head())

In [None]:
# Modelling frame: drop columns that are not used as features or target.
unused_columns = ['Product Name', 'district_id', 'URL', 'pct_access']
df = eng_dist_prod.drop(columns=unused_columns)

**Feature Selection - Random Forest**

In [None]:
# Target variable: the monthly engagement index.
# NOTE(review): `y` is assigned again after rows with null state/locale are
# dropped from `df`, so this first assignment is superseded.
y = df.engagement_index 

In [None]:
# List the columns available for modelling.
df.columns

In [None]:
# Rows without a state or locale cannot be encoded as features; remove them.
df = df.dropna(subset=['state', 'locale'])

In [None]:
# Target variable, taken after the null state/locale rows were removed.
y = df['engagement_index']

In [None]:
# Feature columns, split by how they are preprocessed: the first group is
# scaled, the second group is label-encoded.
numeric_features = [
    'lp_id',
    'pct_black/hispanic min',
    'pct_black/hispanic max',
    'pct_free/reduced min',
    'pct_free/reduced max',
    'expenditure min',
    'expenditure max',
    'month',
]
categorical_features = [
    'state',
    'locale',
    'Provider/Company Name',
    'Sector(s)',
    'Primary Essential Function',
]

X_num = df[numeric_features]
X_cat = df[categorical_features]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features; X_num becomes a NumPy array.
scaler = StandardScaler().fit(X_num)
X_num = scaler.transform(X_num)

In [None]:
from sklearn.preprocessing import LabelEncoder

# X_cat was taken as a column selection of `df`; work on an explicit copy so the
# assignments below write to an independent frame instead of a possible view
# (which triggers pandas' SettingWithCopyWarning).
X_cat = X_cat.copy()

# Replace each categorical column with integer codes, one encoder per column.
for col in X_cat.columns:
    le = LabelEncoder()
    X_cat[col] = le.fit_transform(X_cat[col])

In [None]:
# Final design matrix: scaled numeric columns followed by the encoded
# categorical columns, joined column-wise.
feature_blocks = [X_num, X_cat]
X = np.concatenate(feature_blocks, axis=1)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed so the fitted forest - and therefore the score and the feature
# importances discussed in the text - is the same on every run.
RF = RandomForestRegressor(random_state=42)
RF.fit(X, y)

In [None]:
# The author's note reports that the model performs very well, with an
# R-squared of 0.89.
# NOTE(review): the score below is computed on the same X and y that were passed
# to RF.fit, so it is an in-sample figure.
# The next step is finding which features impact the engagement index the most.
print('score:', RF.score(X, y), sep='\n')


In [None]:
importances = RF.feature_importances_

# Feature names in the same order as the columns of X: numeric block first,
# then the categorical block.
feature_names = [
    'lp_id',
    'pct_black/hispanic min', 'pct_black/hispanic max',
    'pct_free/reduced min', 'pct_free/reduced max',
    'expenditure min', 'expenditure max',
    'month',
    'state', 'locale', 'Provider/Company Name',
    'Sector(s)', 'Primary Essential Function',
]
rounded_importances = [round(score, 4) for score in importances]
ranking = sorted(zip(rounded_importances, feature_names), reverse=True)

print("Feature importance ranking by Random Forest Model:")
for score, name in ranking:
    print (name + ": " + str(score))

There are four major features:
* products
* Seasonality or Covid impact
* geography - state
* government expenditure

**Individual Feature Exploration**

Products

In [None]:

# Merge monthly engagement with the product data on the product id.
eng_products = engagement_df_mnth.merge(Products, on='lp_id', how='inner')
print(eng_products.head())

In [None]:
import matplotlib.pyplot as plt

# Total engagement per month. Only the engagement column is summed: summing the
# whole frame would also process its string columns, which is wasteful and
# version-dependent in pandas.
seasonality_plot = eng_products.groupby('month')['engagement_index'].sum()

plt.figure(figsize = (12,6))

seasonality_plot.plot()
plt.ylabel('total engagement index')
plt.title('Total engagement index by month')
plt.show()

The engagement in digital products began to trend up in March and rose even higher when the Fall term started. This may be due to the impact of Covid-19.

In [None]:
# Next, explore the top products and how their engagement changes over time.
print('Below are the top products:')

# Sum only the engagement column per product (the other columns are not needed).
# The former `top_products.reset_index()` call was removed: its result was
# discarded, so it had no effect.
top_products = eng_products.groupby('Product Name')['engagement_index'].sum()
print(top_products.sort_values(ascending=False).head(10))

Over half of the top products are from Google. Next we are going to focus on these top products and explore their trend of engagement over the time

In [None]:
# The following analysis focuses only on the top products and plots their
# engagement trend across months.
Top_Prods = ['Google Docs', 'Google Classroom', 'Canvas', 'YouTube', 'Meet', 'Kahoot!', 'Google Forms', 'Google Drive', 'Schoology', 'Google Sheets']

plt.figure(figsize = (12,6))
for prod in Top_Prods:
    prod_eng = eng_products[eng_products['Product Name'] == prod]
    # Sum only the engagement column for this product, per month.
    top_prod_plot = prod_eng.groupby('month')['engagement_index'].sum()
    # Label each line with its product so the legend cannot drift out of step
    # with the plotted series.
    top_prod_plot.plot(label=prod)

plt.ylabel('total engagement index')
plt.legend()
plt.show()
    
                        

The pandemic started in March 2020. Since then, teachers and students have used Google Docs and Google Classroom much more. Interestingly, the usage of Kahoot! is decreasing. The assumption is that Google Classroom is a better tool for teachers than Kahoot! when students study from home.

Two other products that came into use after the pandemic began are Meet and YouTube. People started using Meet in March and its usage trended up in the Fall. YouTube is very new here: people started using it in June.

For all other products, the usage doesn’t change too much before and after pandemic started.


**Districts & Government Expenditure**

In [None]:
# Give the expenditure columns SQL-friendly names (no spaces). Renaming by name
# instead of assigning a full positional list cannot mislabel other columns and
# does not fail if the frame gains or loses a column; it is also safe to re-run.
eng_dist = eng_dist.rename(columns={
    'expenditure min': 'expenditure_min',
    'expenditure max': 'expenditure_max',
})

In [None]:

# In this section we explore the relationship between government expenditure
# and digital product engagement.
#
# Starting from the joined monthly engagement + district data, aggregate to one
# row per district: number of distinct products used and total engagement.
# This is done with a native pandas group-by rather than a SQL round trip, which
# avoids copying the whole frame into SQLite and an extra dependency.
district_explore = (
    eng_dist
    .groupby(['district_id', 'expenditure_max'], as_index=False)
    .agg(
        product_cnts=('lp_id', 'nunique'),
        engagements=('engagement_index', 'sum'),
    )
)

print(district_explore.head())

In [None]:
# Next, bucket districts by expenditure level and take the median district in
# each bucket. The numeric measures are selected explicitly because the frame
# also holds the string `district_id`, which has no median and makes a bare
# .median() fail on recent pandas versions.
district_plot = (
    district_explore
    .groupby('expenditure_max')[['product_cnts', 'engagements']]
    .median()
)


In [None]:
# Display the per-expenditure-level medians.
district_plot

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Scatter of expenditure level (index) against median engagement.
expenditure_levels = district_plot.index
median_engagement = district_plot.engagements
plt.scatter(expenditure_levels, median_engagement)
plt.show()

# Pearson correlation between the two; the p-value is not used.
corr, _ = pearsonr(expenditure_levels, median_engagement)
print('')
print('the overall correlation between the government expentiture and engagement is '+ str(corr))

Overall correlation is 0.65. The rightmost point is a district from New York which received $34,000 federal expenditure in 2020. However, the engagement is not that great due to limited product usage. Please see chart below:


In [None]:
# Scatter of expenditure level (index) against the median number of products.
median_product_counts = district_plot.product_cnts
plt.scatter(district_plot.index, median_product_counts)
plt.show()

"""
although that district received $34,000 from the government, the number of products used is very 
limited which drag down the overall digital product usage.
"""

In [None]:
# Exclude the 34000 expenditure level discussed above and recompute the
# correlation on the remaining levels.
keep_level = district_plot.index != 34000
district_plot2 = district_plot[keep_level]

print('after removing the district')

corr2, _ = pearsonr(district_plot2.index, district_plot2.engagements)
print('the correlation between government expenditure and engagement is ' + str(corr2))

**In Conclusion, there is a positive correlation between the government expenditure and digital product engagement**