In [None]:
import glob
import math
import warnings

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot

warnings.filterwarnings('ignore')

## Load Data

In [None]:
# Product catalogue and district table each ship as a single CSV file;
# read both in one pass and unpack them in (product, district) order.
product, district = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv",
        "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv",
    )
)

In [None]:
# Each engagement CSV is named "<district_id>.csv"; the id is not stored
# inside the file, so it is recovered from the file name and attached as a
# column before all files are stacked into one frame.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# glob returns matches in arbitrary order; sorting keeps the row order of
# `engagement` reproducible between runs.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Fail early with a readable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(f"No engagement CSV files found in {path!r}")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Use the file's base name instead of a fixed path depth: the previous
    # `split("/")[4]` silently picks the wrong component as soon as `path`
    # gains or loses a directory level (e.g. the absolute /kaggle/input form).
    district_id = filename.replace("\\", "/").rsplit("/", 1)[-1].split(".")[0]
    df["district_id"] = district_id
    li.append(df)

# ignore_index=True renumbers the rows 0..n-1 while concatenating.
engagement = pd.concat(li, ignore_index=True)

In [None]:
# Report (rows, columns) for each of the three tables.
for _table_name, _table in (('engagement', engagement),
                            ('product', product),
                            ('district', district)):
    print(f'{_table_name} shape : ', _table.shape)

In [None]:
# Preview the first rows of the stacked engagement table (including the added district_id column).
engagement.head()

In [None]:
# Preview the first rows of the product table.
product.head()

In [None]:
# Preview the first rows of the district table.
district.head()

## Check Missing Values

In [None]:
# Per-column NaN counts for each table; the ANSI codes render the caption in bold.
for _table_name, _table in (('engagement', engagement),
                            ('product', product),
                            ('district', district)):
    print(f'\033[1m{_table_name} missing values \n\033[0m', _table.isna().sum())

In [None]:
# Nullity matrix of the district table: shows where values are missing, row by row.
msno.matrix(district,figsize=(8,5))

In [None]:
# Nullity matrix of the product table.
msno.matrix(product,figsize=(8,5))

In [None]:
# Per-column completeness bars for the engagement table (bar chart instead of the row-level matrix used above).
msno.bar(engagement,figsize=(8,5))

## EDA

### district data

In [None]:
# Number of distinct states present in the district table (NaN is not counted by nunique).
district['state'].nunique()

In [None]:
# Number of districts per state, most frequent state first.
# The figure size must be configured BEFORE the plot is created: rcParams are
# read when the figure is built, so setting them afterwards (as before) only
# affected the next cell's figure and left this one at the default size.
sns.set(rc={'figure.figsize':(12,12)})
sns.countplot(y='state',data=district,order=district['state'].value_counts().index)
plt.show()

In [None]:
# Share of districts per locale.
# The slice sizes previously came from district['state'] while the labels were
# a hard-coded locale list, so state counts were shown under locale names.
# Both values and labels now come from the same value_counts() result, which
# also guarantees that each label is paired with its own count.
locale_counts = district['locale'].value_counts()
loc = list(locale_counts.index)
data = go.Pie(values = locale_counts,
             labels = loc,
             textinfo='label+value+percent')
fig = go.Figure(data=data)
fig.show()

In [None]:
# count of pct_black/hispanic
# Theme and figure size are applied first so that they affect THIS plot;
# when set after sns.countplot they only influenced later figures.
sns.set(rc={'figure.figsize':(12,10)})
sns.set_style("white")
sns.countplot(y='pct_black/hispanic',data=district,
             order=district['pct_black/hispanic'].value_counts().index)
plt.show()

In [None]:
# Pie chart of how many districts carry each pct_black/hispanic value.
# Count once and reuse the result for both the slice sizes and their labels.
black_hispanic_counts = district['pct_black/hispanic'].value_counts()
stat = black_hispanic_counts.index
data = go.Pie(
    values=black_hispanic_counts,
    labels=stat,
    textinfo='label+value+percent',
)
fig = go.Figure(data=data)
fig.show()

In [None]:
# count of pct_free/reduced
# Theme and figure size are applied first so that they affect THIS plot;
# when set after sns.countplot they only influenced later figures.
sns.set(rc={'figure.figsize':(12,12)})
sns.set_style("white")
sns.countplot(y='pct_free/reduced',data=district,
             order=district['pct_free/reduced'].value_counts().index)
plt.show()

In [None]:
# Pie chart of how many districts carry each pct_free/reduced value.
# Copy-paste fix: the slice sizes used to be taken from 'pct_black/hispanic'
# while the labels came from 'pct_free/reduced', so the chart paired one
# column's counts with the other column's labels.
free_reduced_counts = district['pct_free/reduced'].value_counts()
stat = free_reduced_counts.index
data = go.Pie(values=free_reduced_counts,
             labels=stat,
             textinfo='label+value+percent')
fig = go.Figure(data=data)
fig.show()

In [None]:
# count of pp_total_raw
# Theme and figure size are applied first so that they affect THIS plot;
# when set after sns.countplot they only influenced later figures.
sns.set(rc={'figure.figsize':(15,10)})
sns.set_style("white")
sns.countplot(y='pp_total_raw',data=district,
             order=district['pp_total_raw'].value_counts().index)
plt.show()

In [None]:
def state_dist(df,col1,col2):
    """Draw grouped count bars of ``col2`` within every category of ``col1``.

    Parameters
    ----------
    df : table holding both columns (indexed by column name).
    col1 : column placed on the x axis; its categories are ordered by
        descending frequency.
    col2 : column used as the bar colour (hue).

    The figure is drawn on a new 15x4 matplotlib figure; nothing is returned.
    """
    plt.subplots(figsize=(15, 4))
    plt.xticks(rotation=90)
    plt.suptitle(f'{col2} distribution in each {col1}')

    # Most frequent col1 categories come first on the x axis.
    category_order = df[col1].value_counts().index
    countplot_options = {
        'data': df,
        'x': col1,
        'hue': col2,
        'order': category_order,
        'palette': 'pastel',
        'orient': 'v',
    }
    sns.countplot(**countplot_options)

    # Anchor the legend's lower-left corner at the axes' lower-right corner.
    plt.legend(bbox_to_anchor=(1, 0), loc='lower left')

In [None]:
# Counts of each pct_black/hispanic value within every state.
state_dist(district,'state','pct_black/hispanic')

In [None]:
# Counts of each pct_black/hispanic value within every locale.
state_dist(district,'locale','pct_black/hispanic')

In [None]:
# Counts of each pct_free/reduced value within every state.
state_dist(district,'state','pct_free/reduced')

In [None]:
# Counts of each pct_free/reduced value within every locale.
state_dist(district,'locale','pct_free/reduced')

In [None]:
# Counts of each pp_total_raw value within every state.
state_dist(district,'state','pp_total_raw')

In [None]:
# Counts of each pp_total_raw value within every locale.
state_dist(district,'locale','pp_total_raw')

### Engagement

In [None]:
# Mean pct_access and engagement_index for every distinct `time` value.
# The two columns are selected with a list: indexing a GroupBy with a bare
# tuple (['pct_access','engagement_index'] without the inner brackets) was
# deprecated and raises in pandas >= 2.0.
status = engagement.groupby('time')[['pct_access','engagement_index']].mean()
# NOTE(review): the title says "Monthly", but the mean is computed per `time`
# value as stored in the data — confirm the intended granularity.
status.plot(figsize=(13,7),title='Monthly engagement mean',subplots=True)

### Product

In [None]:
# top10 of provider/company name
# The figure has to exist before plotting: calling plt.figure() after
# sns.countplot (as before) opened a second, empty figure and left the
# actual chart at the default size.
plt.figure(figsize=(8,10))
sns.countplot(y='Provider/Company Name',data=product,
             order=product['Provider/Company Name'].value_counts().index[:10])
plt.title('Top10 of Provider/Company Name',size=25)
plt.show()

In [None]:
# Number of products per sector combination, most frequent first.
# The font scale is applied before drawing so that the tick and axis labels of
# this chart actually use it; set after sns.countplot it had no effect here.
sns.set(font_scale=1.5)
sns.countplot(y='Sector(s)',data=product,
             order=product['Sector(s)'].value_counts().index)
plt.title('Sector(s)',fontsize=25)
plt.show()

In [None]:
# Inspect the raw 'Primary Essential Function' values before they are split into a main and a sub part below.
product['Primary Essential Function']

In [None]:
#reference https://www.kaggle.com/ruchi798/covid-19-impact-on-digital-learning-eda-w-b#Reading-data-files-%F0%9F%91%93
# count of LC, CM, SDO
def _split_primary_essential(value):
    """Split a 'Primary Essential Function' value at its first '-'.

    Returns a ``(main, sub)`` pair with surrounding whitespace removed.
    A missing value yields ``(nan, nan)``; a value without any '-' yields
    ``(value, nan)`` instead of raising IndexError as the former
    ``split('-', 1)[1]`` did.
    """
    if pd.isnull(value):
        return np.nan, np.nan
    main, separator, sub = value.partition('-')
    return main.strip(), (sub.strip() if separator else np.nan)


_primary_essential_pairs = [
    _split_primary_essential(s) for s in product['Primary Essential Function']
]
primary_essential_main = [main for main, _ in _primary_essential_pairs]
primary_essential_sub = [sub for _, sub in _primary_essential_pairs]

product['P_E_main']= primary_essential_main
product['P_E_sub']=primary_essential_sub

In [None]:
# How often each main category code occurs in P_E_main (missing values skipped).
# str.count is a substring count, so a value naming several codes contributes
# to each of them.
_main_codes = [s for s in product['P_E_main'] if not pd.isnull(s)]
c1 = sum(s.count('CM') for s in _main_codes)
c2 = sum(s.count('LC') for s in _main_codes)
c3 = sum(s.count('SDO') for s in _main_codes)
label=['CM','LC','SDO']
count=[c1,c2,c3]

# Create the axes first and put both the title and the pie on it. Previously
# plt.title() ran before fig.add_subplot(), so the title was attached to an
# implicitly created axes and the pie was drawn on a second, separate one.
fig, ax = plt.subplots(figsize=(5,5))
ax.set_title('Primary_Essential_Func')
pie = ax.pie(count,autopct=lambda x: '{:.2f}'.format(x),labels=label)
plt.show()


In [None]:
# Number of products per sub-function, most frequent first.
sub_function_order = product['P_E_sub'].value_counts().index
sns.countplot(
    data=product,
    y='P_E_sub',
    order=sub_function_order,
)
plt.title('Primary_Essential_Sub')
plt.show()

## Merge Datasets

In [None]:
# Parse the `time` column into datetimes before the tables are merged.
engagement['time'] = pd.to_datetime(engagement['time'])
# Compare how many distinct product ids each side of the upcoming merge has.
for _product_ids in (product['LP ID'], engagement['lp_id']):
    print(_product_ids.nunique())

In [None]:
# Inner join (pandas default): keeps only engagement rows whose lp_id
# matches an 'LP ID' in the product table.
eng_pro = product.merge(
    engagement,
    left_on='LP ID',
    right_on='lp_id',
)
eng_pro.head()

In [None]:
# Distinct district ids on each side of the next merge.
for _district_ids in (district['district_id'], eng_pro['district_id']):
    print(_district_ids.nunique())

In [None]:
# The ids taken from the file names are text; cast them to int before joining
# (presumably so the dtype matches district['district_id'] — verify).
eng_pro['district_id'] = eng_pro['district_id'].astype(str).astype(int)
# Inner join on the key both tables share.
eng_all = district.merge(eng_pro, on='district_id')
eng_all.head()

In [None]:
# Drop the name of the intermediate merge result; eng_all is used from here on.
del eng_pro

In [None]:
# Per-column count of missing values in the fully merged table.
eng_all.isnull().sum()

In [None]:
# Nullity correlation heatmap: which columns of the merged table tend to be missing together.
msno.heatmap(eng_all)

### drop the row if row doesn't have 'state'

In [None]:
# Keep only the rows whose 'state' is present, then re-check the remaining gaps.
eng_all = eng_all.dropna(subset=['state'])
eng_all.isna().sum()

## How does 'student engagement with online learning platforms' relate to Demographic context?

In [None]:
# Mean engagement_index for every (pct_black/hispanic value, time) pair,
# flattened back into ordinary columns.
black_hispanic = (
    eng_all
    .groupby(['pct_black/hispanic', 'time'])['engagement_index']
    .mean()
    .reset_index()
)

In [None]:
# One line per pct_black/hispanic interval: 7-row rolling mean of the
# group's mean engagement_index over time.
# (The stray mid-notebook `import plotly.express as px` now lives in the
# import cell at the top of the notebook.)
layout= go.Layout(title='engagement of each group')
fig=go.Figure(layout=layout)
fig.update_xaxes(showgrid=True,showline=True)
eng_01 = black_hispanic[black_hispanic['pct_black/hispanic']=='[0, 0.2[']
eng_03 = black_hispanic[black_hispanic['pct_black/hispanic']=='[0.2, 0.4[']
eng_05 = black_hispanic[black_hispanic['pct_black/hispanic']=='[0.4, 0.6[']
eng_07 = black_hispanic[black_hispanic['pct_black/hispanic']=='[0.6, 0.8[']
eng_09 = black_hispanic[black_hispanic['pct_black/hispanic']=='[0.8, 1[']

# (subset, legend name) pairs replace five copy-pasted add_trace calls.
black_hispanic_traces = [
    (eng_01, 'black/hispanic_10%'),
    (eng_03, '30%'),
    (eng_05, '50%'),
    (eng_07, '70%'),
    (eng_09, '90%'),
]
for group_frame, trace_name in black_hispanic_traces:
    fig.add_trace(go.Scatter(
        x=group_frame['time'],
        y=group_frame['engagement_index'].rolling(window=7).mean(),
        mode="lines",line=go.scatter.Line(),name=trace_name))

# The cell no longer ends in an expression that evaluates to the figure,
# so display it explicitly.
fig.show()

#### There are no conspicuous patterns related to the black/hispanic share. The difference in engagement between the groups with a high and a low share of black/hispanic students is not large.

### Replace the interval labels in 'pct_free/reduced' and 'pct_black/hispanic' with their midpoint values

In [None]:
# Swap the lowest interval label for its midpoint label, in place and across
# every column of eng_all.
# NOTE(review): '0,1' is a string written with a comma; if a numeric midpoint
# (0.1) is intended, all replacement values need changing together — confirm.
eng_all.replace(to_replace='[0, 0.2[', value='0,1', inplace=True)

In [None]:
# Swap the remaining interval labels for their midpoint labels, one pair at a
# time, in place and across every column of eng_all.
for _interval_label, _midpoint_label in (
    ('[0.2, 0.4[', '0,3'),
    ('[0.4, 0.6[', '0,5'),
    ('[0.6, 0.8[', '0,7'),
    ('[0.8, 1[', '0,9'),
):
    eng_all.replace(_interval_label, _midpoint_label, inplace=True)

In [None]:
# Check which values remain in pct_free/reduced after the replacements above.
eng_all['pct_free/reduced'].unique()

In [None]:
# Check which values remain in pct_black/hispanic after the replacements above.
eng_all['pct_black/hispanic'].unique()

## 'pct_free/reduced' vs engagement

In [None]:
# Mean engagement_index for every (pct_free/reduced value, time) pair,
# flattened back into ordinary columns.
fr = (
    eng_all
    .groupby(['pct_free/reduced', 'time'])['engagement_index']
    .mean()
    .reset_index()
)

## TODO: could not find out why this does not work.

## How different types of education technology changed over the course of the pandemic

In [None]:
# Mean engagement_index for every (sector combination, time) pair,
# flattened back into ordinary columns.
edutech = (
    eng_all
    .groupby(['Sector(s)', 'time'])['engagement_index']
    .mean()
    .reset_index()
)

In [None]:
# One line per sector combination: 7-row rolling mean of the sector's mean
# engagement_index over time.
layout= go.Layout(title='change of the sector over the course of pandemic')
fig=go.Figure(layout=layout)
fig.update_xaxes(showgrid=True,showline=True)

Prek = edutech[edutech['Sector(s)']=='PreK-12']
Prek_H_C = edutech[edutech['Sector(s)']=='PreK-12; Higher Ed; Corporate']
Prek_H = edutech[edutech['Sector(s)']=='PreK-12; Higher Ed']
Cor = edutech[edutech['Sector(s)']=='Corporate']
H_C = edutech[edutech['Sector(s)']=='Higher Ed; Corporate']

# (subset, legend name) pairs replace five copy-pasted add_trace calls.
sector_traces = [
    (Prek, 'Prek-12'),
    (Prek_H_C, 'Prek12_HigherEd_Corporate'),
    (Prek_H, 'Prek12_HigherEd'),
    (Cor, 'Corporate'),
    (H_C, 'HigherEd_Corporate'),
]
for sector_frame, trace_name in sector_traces:
    fig.add_trace(go.Scatter(
        x=sector_frame['time'],
        y=sector_frame['engagement_index'].rolling(window=7).mean(),
        mode="lines",line=go.scatter.Line(),name=trace_name))

# The cell no longer ends in an expression that evaluates to the figure,
# so display it explicitly.
fig.show()

In [None]:
# Mean engagement_index for every (main essential-function code, time) pair,
# flattened back into ordinary columns.
PEF = (
    eng_all
    .groupby(['P_E_main', 'time'])['engagement_index']
    .mean()
    .reset_index()
)

In [None]:
# One line per main essential-function code: 7-row rolling mean of the
# group's mean engagement_index over time.
layout= go.Layout(title='change of the Primary Essential Function over the course of pandemic')
fig=go.Figure(layout=layout)
fig.update_xaxes(showgrid=True,showline=True)

LC = PEF[PEF['P_E_main']=='LC']
CM = PEF[PEF['P_E_main']=='CM']
SDO = PEF[PEF['P_E_main']=='SDO']

# (subset, legend name) pairs replace three copy-pasted add_trace calls.
function_traces = [
    (LC, 'Learning&Curriculum'),
    (CM, 'Classroom Management'),
    (SDO, 'School&District Operations'),
]
for function_frame, trace_name in function_traces:
    fig.add_trace(go.Scatter(
        x=function_frame['time'],
        y=function_frame['engagement_index'].rolling(window=7).mean(),
        mode="lines",line=go.scatter.Line(),name=trace_name))

# The cell no longer ends in an expression that evaluates to the figure,
# so display it explicitly.
fig.show()
