## INTRODUCTION

In [None]:
! apt install --yes libgraphviz-dev

In [None]:
! pip install pygraphviz

In [None]:
! pip install causalnex

# LearnPlatform COVID-19 Impact on Digital Learning
In this notebook we try to analyze and study the different factors that affect digital learning and the effect of COVID-19 on Learning and Engagement. We have encorporated different analysis of these factors with engagement.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 14})

import re

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings("ignore")
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import matplotlib.pyplot as plt
%matplotlib inline


from IPython.display import Image
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from causalnex.structure.notears import from_pandas, from_pandas_lasso
import pygraphviz


from sklearn import preprocessing

## Load data
#### Load the csv files of district and product data and concatenate all the engagement csv files to a single file to do our analysis

In [None]:

districts_info = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
products_info = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")

In [None]:
districts_info.head()

In [None]:
products_info.head()

### We will use the district_id to extract the files in the engagement folder

In [None]:
PATH = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data' 

temp = []

for district in districts_info.district_id.unique():
    df = pd.read_csv(f'{PATH}/{district}.csv', index_col=None, header=0)
    df["district_id"] = district
    temp.append(df)
    
    
engagement_info = pd.concat(temp)
engagement_info = engagement_info.reset_index(drop=True)

In [None]:
engagement_info.head()

## Preprocess the data
#### Handle Missing values, convert to the correct type and remove unwanted cols 

### District Data
#### Drop null values

In [None]:
districts_info.info()

In [None]:
districts_info.dropna(inplace=True)

#### Edit the range values from '[0,0.2[' to [0,0.2]

In [None]:
districts_info_edited = districts_info.copy()

districts_info_edited['pct_black/hispanic'] = districts_info['pct_black/hispanic'].apply(lambda x: (x.replace('[', '')).split(','))
districts_info_edited['pct_free/reduced'] = districts_info['pct_free/reduced'].apply(lambda x: (x.replace('[', '')).split(','))
districts_info_edited['pp_total_raw'] = districts_info['pp_total_raw'].apply(lambda x: (x.replace('[', '')).split(','))
districts_info_edited.drop(columns=['county_connections_ratio'],inplace=True)
districts_info_edited


#### Taking the mean for the range values

In [None]:
for i in ['pct_black/hispanic','pct_free/reduced','pp_total_raw']:
    districts_info_edited[i] = districts_info_edited[i].apply(lambda x: (float(x[0])+float(x[1]))/2)
    

## Engagement Data

#### Converting time column to datetime type

In [None]:
engagement_info["time"] = pd.to_datetime(engagement_info["time"])

## PRE Exploratory Data Analysis on the features

In [None]:
class plot_aggregated:
    def __init__(self,df,groupedby,wanted_col,agg_metric,title,x=20,y=10,order=False):
        self.df = df
        self.groupedby = groupedby
        self.wanted_col = wanted_col
        self.agg_metric = agg_metric
        self.col_name = "aggregate "+self.agg_metric+" "+self.wanted_col
        self.order = order
#         self.top = top
        self.title = title
        self.x = x
        self.y = y
        self.calculate_aggregate()

        
    def calculate_aggregate(self): 
        agg_series = self.df.groupby(self.groupedby)[self.wanted_col].agg(self.agg_metric)
        self.agg_df = pd.DataFrame({self.groupedby: agg_series.keys(),self.col_name: agg_series})

    
    def plot_pie(self):
        fig = px.pie(self.agg_df, values=self.col_name, names=self.groupedby, title=self.title)
        fig.show()
    
    def plot_bar(self):
        fig = px.bar(self.agg_df,x =self.groupedby, y=self.col_name, title=self.title)
        fig.show()
        
    def plot_line(self):
        fig = px.line(self.agg_df,x =self.groupedby, y=self.col_name, title=self.title)
        fig.show()

In [None]:
plot_aggregated(df=districts_info_edited,groupedby="locale",wanted_col="locale",agg_metric="count",title="Distribution of Districts in the data from each locale").plot_pie()

In [None]:
plot_aggregated(df=districts_info_edited,groupedby="state",wanted_col="state",agg_metric="count",title="Distribution of Districts in the data from each state").plot_pie()


#### The above plots show, number of districts from the suburbs and Utah are higher in number than the others.

In [None]:
plot_aggregated(df=districts_info_edited,groupedby="state",wanted_col="pct_black/hispanic",agg_metric="sum",title="Percentage sum of Black/Hispanic Students per state").plot_bar()


In [None]:
plot_aggregated(df=districts_info_edited,groupedby="state",wanted_col="pct_free/reduced",agg_metric="sum",title="Percentage sum of Students eligible for free or reduced-price lunch per state").plot_bar()


In [None]:
plot_aggregated(df=districts_info_edited,groupedby="state",wanted_col="pp_total_raw",agg_metric="sum",title="Sum of total expenditure (sum of local and federal expenditure) per state").plot_bar()

#### From the above analysis we have found that the largest percentage of hispanic/black students as well as students elligible for free/reduced priced lunch are found in the states Utah and Illinois. We can also see these two states contain the highest sum of local and federal expenditure.

## Data Analysis


### Merge all the data we have to do a deeper Analysis

In [None]:
df_engagement_district = pd.merge(engagement_info, districts_info_edited, on='district_id')
df_merged = pd.merge(df_engagement_district, products_info, left_on='lp_id',right_on="LP ID")
df_merged.drop(columns=['LP ID'],inplace=True)
df_merged.head()


In [None]:
plot_aggregated(df=df_merged,groupedby="time",wanted_col="engagement_index",agg_metric="mean",title="Mean Engagement Index grouped per date").plot_line()

#### We can see the engagement drops around July 2020

#### Mean Engagement Index per each state

In [None]:
plot_aggregated(df=df_merged,groupedby="state",wanted_col="engagement_index",agg_metric="mean",title="Mean Engagement Index per each state").plot_bar()

In [None]:
plot_aggregated(df=df_merged,groupedby="locale",wanted_col="engagement_index",agg_metric="mean",title="Mean Engagement Index per each state").plot_bar()

#### The data shows the mean engagement index From Rural areas is the highest where as from the states found in our data New York shows the highest engagement index.

PRODUCTS DATA

In [None]:
products_info.shape

In [None]:
products_info.info()

The data has 372 records, some columns have missing value.
The following data is missing

In [None]:
products_info[products_info.isnull().any(axis=1)]

Most values are missing from the sector and essential function column, a column that lacks the sector also lacks the essential function.
Each company is producing unique products, a company can produce products for different sectors and functions. This means we cannot accurately use other products to infer the sector and functions for the missing values. These columns will be dropped.

In [None]:
products_info = products_info.dropna().reset_index(drop=True)

In [None]:
products_info.info()

In [None]:
# code adapted from https://www.kaggle.com/iamleonie/how-to-approach-analytics-challenges
#split the Primary Essential Function column

products_info['primary_function_main'] = products_info['Primary Essential Function'].apply(lambda x: x.split(' - ')[0] if x == x else x)
products_info['primary_function_sub'] = products_info['Primary Essential Function'].apply(lambda x: x.split(' - ')[1] if x == x else x)

# Synchronize similar values
products_info['primary_function_sub'] = products_info['primary_function_sub'].replace({'Sites, Resources & References' : 'Sites, Resources & Reference'})
products_info.drop("Primary Essential Function", axis=1, inplace=True)
products_info.head()

## Overview of digital connectivity and engagement in 2020
Most states were already engaging with digital learning material before the pandemic began. Insights from research by the World Economic Forum(WEF), https://www.weforum.org/agenda/2020/04/coronavirus-education-global-covid19-online-digital-learning/, show that digital learning was already a trend before the pandemic but there has been a significant surge after the pandemic.

The summer holidays present a significant turning point for most of the factors influencing engagement. It seems to have provided people with a chance to decide on the infrastructure to use and put resources in place to enable the use of the chosen platforms.

Using pct_access as a variable to indicate digital connectivity, the time series plot shows that most states had digital connectivity before the pandemic, the level of access reduces after the pandemic is declared and increases after the summer holidays. 

In [None]:
#adapted from https://www.kaggle.com/mhslearner/covid-19-impact-on-digital-learning
tp=df_merged.groupby(["time"])["pct_access"].sum()
fig = px.line(tp, x=tp.index, y="pct_access")

fig.update_layout(plot_bgcolor = 'white', title = 'Total pct_access across all states', 
                  title_font_family = 'monospace', title_font_color = '#221f1f', title_font_size = 20, title_x = 0.5)
fig.update_xaxes(showline = True, linecolor = '#f5f2f2', linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)
fig.update_yaxes(showline = True, linecolor = '#f5f2f2', 
                 showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2',
                 linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)

fig.add_vline(x = '2020-03-11', line_width = 3, line_color="red")

fig.add_annotation(
        x='2020-03-11',
        y=11150,
        text="WHO has declared Covid-19 a pandemic",
        showarrow=True,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            ),
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax= 130,
        ay=1
        )


fig.add_vrect(x0="2020-06-01", x1="2020-08-31", fillcolor="yellow", opacity=0.25, line_width=0)

fig.add_annotation(
        x='2020-07-15',
        y=900,
        text="Summer holidays",
        showarrow=False,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            )
        )

fig.update_traces(line_width=1)

fig.show()

In [None]:
df_engagement_district = pd.merge(engagement_info, districts_info_edited, on='district_id')
df_merged = pd.merge(df_engagement_district, products_info, left_on='lp_id',right_on="LP ID")
df_merged.drop(columns=['LP ID'],inplace=True)
df_merged.head()

In [None]:
df_merged = df_merged.dropna().reset_index(drop=True)

In [None]:
#the code used is borrowed from https://www.kaggle.com/dmitryuarov/eda-covid-19-impact-on-digital-learning/notebook
st_acсess = df_merged.groupby(['state', 'time']).agg({'pct_access': 'mean'}).reset_index()
st_eng = df_merged.groupby(['state', 'time']).agg({'engagement_index': 'mean'}).reset_index()
loc_acсess = df_merged.groupby(['locale', 'time']).agg({'pct_access': 'mean'}).reset_index()
loc_eng = df_merged.groupby(['locale', 'time']).agg({'engagement_index': 'mean'}).reset_index()
cat_acсess = df_merged.groupby(['primary_function_main', 'time']).agg({'pct_access': 'mean'}).reset_index()
cat_eng = df_merged.groupby(['primary_function_main', 'time']).agg({'engagement_index': 'mean'}).reset_index()

for i in [st_acсess, st_eng, loc_acсess, loc_eng, cat_acсess, cat_eng]:
    i['day_of_week'] = i['time'].dt.dayofweek
    
st_acсess.head(3)

In [None]:
fig = px.line(st_acсess, x="time", y="pct_access", color="state", line_group="state")

fig.update_layout(plot_bgcolor = 'white', title = 'Dynamics of pct_access of all products by states', 
                  title_font_family = 'monospace', title_font_color = '#221f1f', title_font_size = 20, title_x = 0.5)
fig.update_xaxes(showline = True, linecolor = '#f5f2f2', linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)
fig.update_yaxes(showline = True, linecolor = '#f5f2f2', 
                 showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2',
                 linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)

fig.add_vline(x = '2020-03-11', line_width = 3, line_color="red")

fig.add_annotation(
        x='2020-03-11',
        y=2.7,
        text="WHO has declared Covid-19 a pandemic",
        showarrow=True,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            ),
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax= 130,
        ay=1
        )


fig.add_vrect(x0="2020-06-01", x1="2020-08-31", fillcolor="yellow", opacity=0.25, line_width=0)

fig.add_annotation(
        x='2020-07-15',
        y=2.25,
        text="Summer holidays",
        showarrow=False,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            )
        )

fig.update_traces(line_width=1)

fig.show()


## Time series analysis of student engagement with different types of education technology over the course of the pandemic.
There are two periods that show differences in engagement, the period after the pandemic was declared and the period after the summer holidays. The summer holidays presented a significant turning point in many of the variables in relation to the engagement index.

The dataset provided is over the course of 2020. We plot a graph to show the level of engagement pre-pandemic and post pandemic.

During the summer holidays the engagement index hits its lowest point but there is still a little bit of activity going on. It seems to have presented people with the opportunity to decide on which products to use and the resources required. There are also products that began being used over this period such as Hulu and YouTube.

Most states already had access to education technology and were actively interacting with it before the pandemic. However, once the pandemic began, all the states initially experience a decline in engagement with the technologies. This can be explained by a period of adjustment as everyone responded to the pandemic. These technologies were also being used in a hybrid mode where the student attended classes physically and interacted with the technology in the classroom. Remote learning meant the resources for access needed to be made available away from the classroom. After the pandemic and adjustment period, the engagement increases at different rates for all the states.

Every state needed an adjustment period which is normal but better preparedness may have reduced the time it took to adjust and carry on with remote learning. Given that most states were already blending digital learning with daily learning, it is not immediately apparent why most states had better engagement only after the summer holidays. A possible explanation would be that the stimulus paid accelerated the engagement by enabling parents to get the required infrastructure and gave the students time to interact with the infrastructure before resumption.


In [None]:
#adapted from https://www.kaggle.com/mhslearner/covid-19-impact-on-digital-learning
te=df_merged.groupby(["time"])["engagement_index"].sum()
fig = px.line(te, x=te.index, y="engagement_index")

fig.update_layout(plot_bgcolor = 'white', title = 'Average engaement index across all states', 
                  title_font_family = 'monospace', title_font_color = '#221f1f', title_font_size = 20, title_x = 0.5)
fig.update_xaxes(showline = True, linecolor = '#f5f2f2', linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)
fig.update_yaxes(showline = True, linecolor = '#f5f2f2', 
                 showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2',
                 linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)

fig.add_vline(x = '2020-03-11', line_width = 3, line_color="red")

fig.add_annotation(
        x='2020-03-11',
        y=1150,
        text="WHO has declared Covid-19 a pandemic",
        showarrow=True,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            ),
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax= 130,
        ay=1
        )


fig.add_vrect(x0="2020-06-01", x1="2020-08-31", fillcolor="yellow", opacity=0.25, line_width=0)

fig.add_annotation(
        x='2020-07-15',
        y=900,
        text="Summer holidays",
        showarrow=False,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            )
        )

fig.update_traces(line_width=1)

fig.show()

In [None]:
fig = px.line(st_eng, x="time", y="engagement_index", color="state", line_group="state")

fig.update_layout(plot_bgcolor = 'white', title = 'Dynamics of engagement index of all products by states', 
                  title_font_family = 'monospace', title_font_color = '#221f1f', title_font_size = 20, title_x = 0.5)
fig.update_xaxes(showline = True, linecolor = '#f5f2f2', linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)
fig.update_yaxes(showline = True, linecolor = '#f5f2f2', 
                 showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2',
                 linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)

fig.add_vline(x = '2020-03-11', line_width = 3, line_color="red")

fig.add_annotation(
        x='2020-03-11',
        y=1150,
        text="WHO has declared Covid-19 a pandemic",
        showarrow=True,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            ),
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax= 130,
        ay=1
        )


fig.add_vrect(x0="2020-06-01", x1="2020-08-31", fillcolor="yellow", opacity=0.25, line_width=0)

fig.add_annotation(
        x='2020-07-15',
        y=900,
        text="Summer holidays",
        showarrow=False,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            )
        )

fig.update_traces(line_width=1)

fig.show()

Each products engagement_index changed variably over the course of the pandemic. We analyze how the top 5 popular products engagement changed over the course of 2020

In [None]:
#top 5 most popular products
df_merged.groupby('Product Name')['pct_access'].mean().sort_values(ascending=False).to_frame('average_access').head(5)

In [None]:
prod_eng = df_merged.groupby(['Product Name','time'])['engagement_index'].mean().reset_index()
fig = px.line(prod_eng, x="time", y="engagement_index", color="Product Name", line_group="Product Name")

fig.update_layout(plot_bgcolor = 'white', title = 'Dynamics of engagement index of all products', 
                  title_font_family = 'monospace', title_font_color = '#221f1f', title_font_size = 20, title_x = 0.5)
fig.update_xaxes(showline = True, linecolor = '#f5f2f2', linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)
fig.update_yaxes(showline = True, linecolor = '#f5f2f2', 
                 showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2',
                 linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)

fig.add_vline(x = '2020-03-11', line_width = 3, line_color="red")

fig.add_annotation(
        x='2020-03-11',
        y=1150,
        text="WHO has declared Covid-19 a pandemic",
        showarrow=True,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            ),
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax= 130,
        ay=1
        )


fig.add_vrect(x0="2020-06-01", x1="2020-08-31", fillcolor="yellow", opacity=0.25, line_width=0)

fig.add_annotation(
        x='2020-07-15',
        y=900,
        text="Summer holidays",
        showarrow=False,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            )
        )

fig.update_traces(line_width=1)

fig.show()

## Google Docs
The engagement index rose after the pandemic and kept on an upward trend after the summer holidays
## Google Classroom
The engagement index went up after the pandemic and declined after the summer holidays
## Canvas
Engagement increase after the pandemic and kept increasing after summer
## YouTube
The data for YouTube is available beginning in the summer holidays and has an upward trajectory after the summer holidays
## Google Drive
The engagement index rose after the pandemic but declined after the summer holidays

## Impact of Policy or Intervention on the rate of engagement.

We plot an initial causal graph usisng a subset of the data (100,000 records) to explore the factors that directly affect the engagement index. The graph becomes more stable as the data points are increased but also requires higher computation power. The factors that directly affect engagement have remained the same over the three graphs that were plotted. We fit a conditional model and check the change in the engagement index when and intervention is palced on the variable pct_free/reduced which is the percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data. We will use this variable to represent a policy.
With the variables directly affecting engagement, we fit a bayesian network and introduce an intervention on the pct_free/reduced variable. The goal is to analyse the rate of engagement when the policy is there and when the policy is not there.

Using the pct_free/reduced variable as the proxy for policy shows no change in engagement whether the policy is there or not. This can be explained by the fact that there was an increase in remote learning and most students did not have access to the free meals.

We introduce a new variable to represent policy, that directly affected people, from a different dataset, https://www.kaggle.com/steshykibika/digital-learning/edit/run/73803122. The data contains the time when government paid out EIPs, the number of people legible for the EIPs in each state and the amount paid out in each state. Soon after the EIPs were issued the trend of the level of engagement remains the same but significantly increases after the summer holidays. These EIPs may have been used to purchase necessary gadgets and the summer holidays might have given students and schools time to create new infrastructure that caused the increase in engagement after the summer holidays.

This section applies the causalnex package and the implementation steps provided in https://causalnex.readthedocs.io/en/latest/

In [None]:
df_merged.head()


In [None]:
df_merged.shape

In [None]:
causal_data = df_merged[['pct_access', 'engagement_index', 'locale', 
                         'pct_black/hispanic', 'pct_free/reduced',
                        'pp_total_raw', 'Provider/Company Name',
                        'Sector(s)', 'primary_function_main',
                        'primary_function_sub']].copy()
causal_data.head()

In [None]:
le = preprocessing.LabelEncoder()
causal_data[['locale', 'Provider/Company Name', 'Sector(s)', 'primary_function_main',
       'primary_function_sub']] = causal_data[['locale', 'Provider/Company Name', 'Sector(s)', 'primary_function_main',
       'primary_function_sub']].apply(le.fit_transform)
causal_data.head()

In [None]:
col_dict = {'pct_black/hispanic': 'pct_black_hispanic',
        'pct_free/reduced': 'pct_free_reduced',
        'Provider/Company Name': 'Provider_Company_Name',
           'Sector(s)': 'Sector'}
causal_data.rename(columns=col_dict,
          inplace=True)
causal_data.head()

In [None]:
causal_data.shape

In [None]:
sm_1000 = from_pandas(causal_data.iloc[:1000,:], w_threshold=0.8)
viz = plot_structure(
    sm_1000,
    graph_attributes={"scale": "2.0", 'size':2.5},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK)
Image(viz.draw(format='png'))

In [None]:
sm_100000 = from_pandas(causal_data.iloc[:100000,:], w_threshold=0.8)
# sm_100000.remove_edge('primary_function_sub', 'locale')
viz = plot_structure(
    sm_100000,
    graph_attributes={"scale": "2.0", 'size':2.5},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK)
Image(viz.draw(format='png'))

In [None]:
def Jaccar_score(g, h):    
    i = set(g).intersection(set(h))
    u = set(g).union(set(h))
    return len(i) / float(len(u))

In [None]:
Jaccar_score(sm_1000.edges(), sm_100000.edges())

In [None]:
# sm_500000 = from_pandas(causal_data.iloc[:500000,:], w_threshold=0.8)
# viz = plot_structure(
#     sm_500000,
#     graph_attributes={"scale": "2.0", 'size':2.5},
#     all_node_attributes=NODE_STYLE.WEAK,
#     all_edge_attributes=EDGE_STYLE.WEAK)
# Image(viz.draw(format='png'))

In [None]:
# Jaccar_score(sm_100000.edges(), sm_500000.edges())
#= 0.5588235294117647

Discretize the data for input in Bayesian model

In [None]:
locale_map = {'City': 0, 'Rural': 1, 'Suburb': 2, 'Town': 3}
Provider_map = {' A&E Television Networks, LLC': 0,
 ' Autodesk, Inc': 1,
 ' Tes Global Ltd ': 2,
 'ABC digital': 3,
 'ABCya.com, LLC': 4,
 'Achieve3000': 5,
 'Actively Learn': 6,
 'Adobe Inc': 7,
 'Adobe Inc.': 8,
 'Age of Learning, Inc ': 9,
 'Amazon.com, Inc. ': 10,
 'Amplify Education, Inc.': 11,
 'Answers': 12,
 'Association for Supervision and Curriculum Development': 13,
 'Bartleby': 14,
 'BetterLesson': 15,
 'Big Ideas Learning': 16,
 'Blackboard Inc': 17,
 'Blindside Networks': 18,
 'Blooket LLC': 19,
 'Boom Learning (a dba of Omega Labs Inc.)': 20,
 'Box': 21,
 'BrainPOP LLC': 22,
 'Brainly': 23,
 'Breakout, Inc': 24,
 'CK-12 Foundation': 25,
 'Cable News Network': 26,
 'Calculator.com': 27,
 'Calendly': 28,
 'Canva': 29,
 'Canvas Talent, Inc. ': 30,
 'Capstone': 31,
 'Cengage Learning': 32,
 'Chegg': 33,
 'Cisco': 34,
 'ClassDojo, Inc.': 35,
 'ClassLink': 36,
 'Clever': 37,
 'Code.org': 38,
 'CommonLit': 39,
 'Constructive Media': 40,
 'ContentKeeper Technologies': 41,
 'CoolMath.com LLC': 42,
 'Course Hero': 43,
 'Cult of Pedagogy': 44,
 'Curiosity Media Inc': 45,
 'Curriculum Associates': 46,
 'DeltaMath': 47,
 'Desmos': 48,
 'Dictionary.com': 49,
 'Didax Education': 50,
 'Discovery Communications': 51,
 'Discovery Education': 52,
 'Disney': 53,
 'DocuSign Inc': 54,
 'Doodle Ltd': 55,
 'DotDash': 56,
 'Dreambox Learning': 57,
 'Dropbox': 58,
 'Duolingo': 59,
 'EBSCO Industries, Inc': 60,
 'EDpuzzle Inc.': 61,
 'Edgenuity Inc.': 62,
 'Edmentum': 63,
 'Education.com': 64,
 'Educational Testing Service': 65,
 'Edulastic': 66,
 'Ellevation': 67,
 'Enchanted Learning': 68,
 'Encyclopaedia Britannica, Inc.': 69,
 'Enotes.com': 70,
 'Epic Creations, Inc.': 71,
 'EqsQuest Ltd': 72,
 'Eventbrite': 73,
 'Evite': 74,
 'ExploreLearning, LLC': 75,
 'Facebook': 76,
 'Flipgrid': 77,
 'Flippity': 78,
 'Flocabulary': 79,
 'Formative': 80,
 'Frontline Education': 81,
 'Funbrain Holdings, LLC': 82,
 'Future US Inc': 83,
 'Gale Cengage': 84,
 'Generation Genius, Inc.': 85,
 'GeoGebra': 86,
 'Gimkit': 87,
 'GitHub': 88,
 'GloWorld': 89,
 'Global Compliance Network Inc': 90,
 'GoGuardian': 91,
 'Goodreads': 92,
 'Google LLC': 93,
 'Grammarly': 94,
 'Hapara': 95,
 'HealthTeacher': 96,
 'Heinemann, a division of Greenwood Publishing Group LLC': 97,
 'Hewlett-Packard': 98,
 'Hobsons': 99,
 'Hooda Math': 100,
 'Houghton Mifflin Harcourt': 101,
 'HowStuffWorks': 102,
 'Hulu, LLC': 103,
 'IAC': 104,
 'ITHAKA': 105,
 'IXL Learning': 106,
 'Imagine Easy Solutions': 107,
 'Imagine Learning': 108,
 'Infinite Campus': 109,
 'InnerSloth': 110,
 'Instagram': 111,
 'Instructure, Inc. ': 112,
 'Issuu': 113,
 'Istation': 114,
 'Jakub Koziol': 115,
 'John Wiley and Sons Inco': 116,
 'KQED': 117,
 'Kahoot! AS': 118,
 'Kaleido AI GmbH': 119,
 'Kami Limited': 120,
 'Khan Academy': 121,
 'Lakeshore Learning Materials': 122,
 'Lazel Inc.': 123,
 'LearnZillion': 124,
 'Learning A-Z': 125,
 'Legends of Learning': 126,
 'Lexia Learning': 127,
 'Library of Congress': 128,
 'LinkedIn': 129,
 'LitCharts LLC': 130,
 'LogMeIn': 131,
 'Loom, Inc': 132,
 'Lumen Learning': 133,
 'MIND Research Institute': 134,
 'MIT Media Lab': 135,
 'MakeMusic, Inc.': 136,
 'MarketWatch': 137,
 'Massachusetts Institute of Technology': 138,
 'MasteryConnect': 139,
 'Math Playground': 140,
 'Math Worksheets 4 Kids': 141,
 'Mathsisfun.com': 142,
 'Mathway': 143,
 'McGraw-Hill PreK-12': 144,
 'Measurement Incorporated ': 145,
 'Merriam-Webster': 146,
 'Michael Dayah': 147,
 'Microsoft': 148,
 'Microsoft Education': 149,
 'MobyMax': 150,
 'Multiplication.com': 151,
 'Mystery Science, Inc': 152,
 'NASA': 153,
 'National Center for Families Learning': 154,
 'Nature America, Inc': 155,
 'Nearpod Inc.': 156,
 'Netflix': 157,
 'Neuron Fuel': 158,
 'New York State Education Department': 159,
 'Newsela': 160,
 'Nitrolabs Limited': 161,
 'NoRedInk': 162,
 'NoodleTools': 163,
 'Online Writing Lab': 164,
 'OverDrive': 165,
 'PBS': 166,
 'Padlet': 167,
 'Pandora Media, LLC': 168,
 'Panorama Education': 169,
 'Pear Deck': 170,
 'Performance Matters': 171,
 'Pew Research Center ': 172,
 'Physics Classroom, LLC': 173,
 'Pixlr': 174,
 'PowerSchool Group LLC': 175,
 'Prezi Inc.': 176,
 'ProQuest': 177,
 'Purch': 178,
 'Qualtrics': 179,
 'Quaver Music': 180,
 'Quill.org': 181,
 'Quizizz': 182,
 'Quizlet': 183,
 'Quora': 184,
 'Read Theory': 185,
 'ReadWorks': 186,
 'ReadWriteThink.org': 187,
 'Remind101': 188,
 'Renaissance Learning': 189,
 'Renaissance Learning, Inc.': 190,
 'RoomRecess.com': 191,
 'SAG-AFTRA Foundation': 192,
 'SMARTeacher Inc.': 193,
 'Sandbox Networks': 194,
 'Savvas Learning Company | Formerly Pearson K12 Learning': 195,
 'Sched LLC': 196,
 'Scholastic Inc': 197,
 'School Loop': 198,
 'School Specialty Inc': 199,
 'SchoolTube': 200,
 'Schoology': 201,
 'Screencast-O-Matic': 202,
 'Screencastify, LLC': 203,
 'Securly Inc ': 204,
 'Seesaw Learning Inc': 205,
 'SharpSchool': 206,
 'Shmoop University, Inc': 207,
 'Showbie Inc ': 208,
 'SignUpGenius': 209,
 'SlidesCarnival': 210,
 'Snap Inc.': 211,
 'SoundCloud': 212,
 'Southern Poverty Law Center': 213,
 'SparkNotes': 214,
 'Spotify Ltd': 215,
 'Spotify USA Inc': 216,
 'Starfall Education': 217,
 'Studies Weekly': 218,
 'Study.com': 219,
 'StudyPad Inc.': 220,
 'SurveyMonkey': 221,
 'TEACHERSPAYTEACHERS': 222,
 'TED Conferences': 223,
 'Teach TCI': 224,
 'TeachMe': 225,
 'Teaching.com': 226,
 'Technological Solutions, Inc. (TSI)': 227,
 'Texthelp, Inc.': 228,
 'The College Board': 229,
 'The Common Application, Inc.': 230,
 'The Internet Archive': 231,
 'The Math Learning Center': 232,
 'The New York Times': 233,
 'The Pennsylvania State Universtity': 234,
 'The University of Utah ': 235,
 'The Wikimedia Foundation': 236,
 'ThingLink': 237,
 'Time USA, LLC ': 238,
 'Tools for Schools, Inc. (Book Creator)': 239,
 'Toy Theater': 240,
 'TumbleBooks': 241,
 'Tumblr ': 242,
 'Turnitin': 243,
 'TurtleDiary LLC': 244,
 'TypingClub': 245,
 'US Geological Survey': 246,
 'US Holocaust Museum': 247,
 'United States National Archives': 248,
 'University of Colorado': 249,
 'Utah Education Network': 250,
 'Vector Solutions': 251,
 'Vespr': 252,
 'ViewPure': 253,
 'Vimeo': 254,
 'Vitzo Ltd': 255,
 'VocabularySpellingCity': 256,
 'Vooks, Inc.': 257,
 'Wakelet': 258,
 'Washington Post': 259,
 'WeAreTeachers': 260,
 'WeVideo, Inc.': 261,
 'Weebly': 262,
 'West Corporation': 263,
 'Whiteboard.fi': 264,
 'Wistia': 265,
 'Wix.com, Inc': 266,
 'WordPress': 267,
 'WordReference.com': 268,
 'World Book, Inc': 269,
 'World Wildlife Fund': 270,
 'WyzAnt': 271,
 'XtraMath': 272,
 'Yegros Educational LLC DBA Conjuguemos': 273,
 'ZOOM VIDEO COMMUNICATIONS, INC.': 274,
 'Zearn': 275,
 'Zendesk': 276,
 'iCivics Inc': 277,
 'iHeartRadio': 278,
 'iStockphoto LP': 279,
 'mrdonn.org': 280,
 'musictheory.net': 281,
 'online-stopwatch.com': 282}
sector_map = {'Corporate': 0,
 'Higher Ed; Corporate': 1,
 'PreK-12': 2,
 'PreK-12; Higher Ed': 3,
 'PreK-12; Higher Ed; Corporate': 4}
primary_function_main_map = {'CM': 0, 'LC': 1, 'LC/CM/SDO': 2, 'SDO': 3}
primary_function_sub_map = {'Admissions, Enrollment & Rostering': 0,
                         'Career Planning & Job Search': 1,
                         'Classroom Engagement & Instruction': 2,
                         'Content Creation & Curation': 3,
                         'Courseware & Textbooks': 4,
                         'Data, Analytics & Reporting': 5,
                         'Digital Learning Platforms': 6,
                         'Environmental, Health & Safety (EHS) Compliance': 7,
                         'Human Resources': 8,
                         'Large-Scale & Standardized Testing': 9,
                         'Learning Management Systems (LMS)': 10,
                         'Online Course Providers & Technical Skills Development': 11,
                         'Other': 12,
                         'School Management Software': 13,
                         'Sites, Resources & Reference': 14,
                         'Study Tools': 15,
                         'Teacher Resources': 16,
                         'Virtual Classroom': 17}

In [None]:
discretised_data = causal_data.iloc[:100000,:].copy()

In [None]:
discretised_data["locale"] = discretised_data["locale"].map({y:x for x,y in locale_map.items()})
discretised_data["Provider_Company_Name"] = discretised_data["Provider_Company_Name"].map({y:x for x,y in Provider_map.items()})
discretised_data["Sector"] = discretised_data["Sector"].map({y:x for x,y in sector_map.items()})
discretised_data["primary_function_main"] = discretised_data["primary_function_main"].map({y:x for x,y in primary_function_main_map.items()})
discretised_data["primary_function_sub"] = discretised_data["primary_function_sub"].map({y:x for x,y in primary_function_sub_map.items()})

In [None]:
data_vals = {col: causal_data[col].unique() for col in causal_data.columns}
for i in discretised_data.columns[[0,1,3,4,5]]:
  map  = {v: 'low' if v <= (discretised_data[str(i)].max()-discretised_data[str(i)].min())/2
            else 'high' for v in data_vals[str(i)]}
  discretised_data[str(i)] = discretised_data[str(i)].map(map)

In [None]:
discretised_data.head()

Train Bayesian Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from causalnex.inference import InferenceEngine
from causalnex.evaluation import roc_auc
from causalnex.evaluation import classification_report
from causalnex.network import BayesianNetwork

In [None]:
bn = BayesianNetwork(sm_100000)

In [None]:
bn = bn.fit_node_states(discretised_data)

In [None]:
bn = bn.fit_cpds(discretised_data, method="BayesianEstimator", bayes_prior="K2")

In [None]:
ie = InferenceEngine(bn)

In [None]:
print("distribution before do", ie.query()["pct_free_reduced"])
ie.do_intervention("pct_free_reduced",
                   {'low': 1.0,
                    'high': 0.0})
print("distribution after do", ie.query()["pct_free_reduced"])

In the original distribution, 55% of the data had low rollout of free food programs. We include an intervention to make this 100% low rollout of free food programs.

In [None]:
#What if there was low free or reduced fee on meals all through?

print("marginal engagement", ie.query()["engagement_index"])
ie.do_intervention("pct_free_reduced",
                   {'low': 1.0,
                    'high': 0.0})
print("updated marginal engagement", ie.query()["engagement_index"])

If the rate at which free meals were provided was 100% low as opposed to 55% from the original distribution,
the level of engagement would remain the same. This can be explained by the fact that in 2020 most students were learning remotely and had no access to the free meals provided in schools. Looking at the impact on engagement after the stimulus payment from the time series analysis, there was the summer holiday and the engagement started to increase after the summer holidays.

We will add another variable from a different dataset to test the impact of policy on engagement. For this second case we will analyse the trends in engagement after a the EIPs were paid. The data on EIPs is collected from https://www.imf.org/en/Topics/imf-and-covid19/Policy-Responses-to-COVID-19 

## Time series analysis of engagement after an intervention
The summer holidays had an impact on the trends seen in engagement with most states recording higher engagement rates after the holidays. EIPs were paid before the holidays began and may have been one of the confounding variables leading to increase in engagement. This is because it may have given families that had their income affected by the pandemic a chance to purchase gadgets for remote learning.

In [None]:
fig = px.line(st_eng, x="time", y="engagement_index", color="state", line_group="state")

fig.update_layout(plot_bgcolor = 'white', title = 'Dynamics of engagement index of all products by states', 
                  title_font_family = 'monospace', title_font_color = '#221f1f', title_font_size = 20, title_x = 0.5)
fig.update_xaxes(showline = True, linecolor = '#f5f2f2', linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)
fig.update_yaxes(showline = True, linecolor = '#f5f2f2', 
                 showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2',
                 linewidth = 2, tickfont_family = 'monospace', tickfont_color = '#221f1f', tickfont_size = 12)


fig.add_vline(x = '2020-05-22', line_width = 3, line_color="green")

fig.add_annotation(
        x='2020-05-22',
        y=1.5,
        text="Government pays EIPs",
        showarrow=True,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            ),
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax= 130,
        ay=1
        )

fig.add_vrect(x0="2020-06-01", x1="2020-08-31", fillcolor="yellow", opacity=0.25, line_width=0)

fig.add_annotation(
        x='2020-07-15',
        y=900,
        text="Summer holidays",
        showarrow=False,
        font=dict(
            family="monospace",
            size=11,
            color="black"
            )
        )

fig.update_traces(line_width=1)

fig.show()