In [None]:
%config Completer.use_jedi = False

Data Preparation

In [None]:
# Import Dataset
import pandas as pd
import numpy as np
import os
from glob import glob

from datetime import datetime

from matplotlib import pyplot as plt
import seaborn as sns
import missingno as msno
from wordcloud import WordCloud

engagement_path = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/"

In [None]:
# Load the district and product metadata tables (CSV files) into DataFrames
data_district = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
data_product = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")

In [None]:
# Collect every entry of the engagement directory, ordered by os.path.getctime
glob_pattern = os.path.join(engagement_path, '*')
list_engagement = glob(glob_pattern)
list_engagement.sort(key=os.path.getctime)
list_engagement[:5]

In [None]:
# Read every engagement file and stack the results into a single DataFrame.
# The file name without its extension is stored in a `district_id` column.
engagement_frames = []
for filename in list_engagement:
    # basename/splitext instead of splitting on "/" and ".": this also works when the
    # paths returned by glob use the operating system's own separator
    district_id = os.path.splitext(os.path.basename(filename))[0]
    frame = pd.read_csv(filename, index_col=None, header=0)
    frame['district_id'] = district_id
    engagement_frames.append(frame)

data_engagements = pd.concat(engagement_frames, axis=0, ignore_index=True)

# District Data Processing

Data Definition


In [None]:
# Show the first 5 rows of the district data
data_district.head()

In [None]:
# Show District Data Columns Name
data_district.columns

In [None]:
# Show District Data Info
data_district.info()

**District Dataset Information**

* Terdapat 233 baris data dan 7 Kolom pada dataset distrik
* Terdapat 1 kolom numerical, sisanya adalah object atau kategorical
* Terdapat *missing values* pada dataset

In [None]:
# Show summary statistics for every district column (numeric and non-numeric)
data_district.describe(include='all')

**District Data Describe**

* District id memiliki 233 nilai
* Terdapat 23 *state* pada dataset
* Connecticut merupakan *state* yang sering muncul pada data
* Terdapat 4 *locale* pada dataset
* *Suburb* merupakan *locale* yang sering muncul pada data
* pct_black/hispanic memiliki 5 ratio yang berbeda
* Ratio 0,0.2 memiliki frekuensi kemunculan paling tinggi
* county_connections_ratio memiliki 2 ratio yang berbeda
* Ratio 0.18,1 memiliki frekuensi kemunculan paling tinggi
* pp_total_raw memiliki 11 ratio yang berbeda
* Ratio 8000,10000 memiliki frekuensi kemunculan paling tinggi

In [None]:
#Visualize Missing Value
msno.bar(data_district,color='#4895ef', sort="ascending", figsize=(10,5), fontsize=12)
plt.show()

**Missing Values Info**

* District id tidak memiliki *missing values*
* pp_total_raw memiliki *missing values* terbanyak

In [None]:
#Show Null Index Based On pp_total_raw Null Value
data_district[data_district['pp_total_raw'].isnull()]

In [None]:
# Keep only rows that have at least 6 non-null values (thresh counts NON-missing cells);
# rows with fewer populated columns are removed from data_district in place.
data_district.dropna(thresh=6, inplace=True)

In [None]:
data_district.isnull().sum()

In [None]:
data_district['locale'].value_counts()

In [None]:
# Create Pie Chart For Locale Columns
locale_counts = data_district['locale'].value_counts()
locale_counts.plot(kind='pie',
                   # one offset per slice: a fixed 4-element list raises as soon as the
                   # number of distinct locales is not exactly 4
                   explode=[0.05] * len(locale_counts),
                   fontsize=12,
                   autopct='%3.1f%%',
                   figsize=(7,7),
                   startangle=135,
                   cmap='Wistia')
plt.ylabel("Locale Data")

* Pie chart menunjukkan *Suburb* mempunyai persentase yang lebih besar dibandingkan *Rural*, *City*, dan *Town* pada kolom **locale**

In [None]:
data_district['state'].value_counts()

In [None]:
sns.catplot(data=data_district, y='state', kind='count', height=12, aspect=1)
plt.title("Frequent State")

* Pada gambar di atas, Utah merupakan *state* dengan frekuensi kemunculan terbanyak

In [None]:
sns.displot(data=data_district, y='state', hue='locale', height=10, aspect=0.8)
plt.title("Distribution Between State and Locale")

In [None]:
sns.displot(data=data_district, y='state', hue='locale', col='locale', height=10, aspect=0.8)

* Mayoritas **suburb** berada pada **state** Utah, Illinois, Connecticut, dan Ohio
* Mayoritas **rural** berada pada **state** Connecticut, New York, dan Illinois
* Mayoritas **city** berada pada **state** California, dan Utah
* Mayoritas **town** berada pada **state** Utah

In [None]:
sns.countplot(data=data_district, x='pct_black/hispanic',
             order = data_district['pct_black/hispanic'].value_counts().index,
             hue='locale')
plt.legend(loc="upper right")

In [None]:
# pct_black/hispanic extraction
# Split the range string at the first comma and strip the '[' characters from both
# halves so that each half can be parsed as a number.
# NOTE(review): `pct_black` / `hispanic` appear to hold the two ends of one range rather
# than two separate shares — confirm against the data dictionary.
pct_black_hispanic = data_district['pct_black/hispanic'].str.split(',', n=1, expand=True)
# regex=False: '[' has to be matched literally; on its own it is not a valid regular
# expression, and newer pandas no longer treats one-character patterns as literals.
data_district['pct_black'] = pd.to_numeric(pct_black_hispanic[0].str.replace('[', '', regex=False))
data_district['hispanic'] = pd.to_numeric(pct_black_hispanic[1].str.replace('[', '', regex=False))

# Midpoint of the two parsed numbers
data_district['pct_black/hispanic_mean'] = (data_district['pct_black'] + data_district['hispanic'])/2

Lakukan ekstraksi pct_black/hispanic menjadi tipe numerik dengan menghitung rata-rata dari kedua angka rasio.

In [None]:

data_district.head()

In [None]:
# Distribution plot for pct_black, pct_hispanic and pct_black/hispanic mean
sns.displot(data=data_district, x='pct_black', hue='locale', kind='kde')
sns.displot(data=data_district, x='hispanic', hue='locale', kind='kde')
sns.displot(data=data_district, x='pct_black/hispanic_mean', hue='locale', kind='kde')

* Gambar di atas menunjukkan pct_black/hispanic mayoritas berada pada **suburb** dan **rural**

In [None]:
# Distribution plot based on state
sns.displot(data=data_district, x='pct_black/hispanic_mean', hue='state', height=8, aspect=.8)

* Gambar di atas menunjukkan mayoritas **state** memiliki persentase pct_black/hispanic yang rendah

In [None]:
sns.countplot(data=data_district, x='pct_free/reduced',
             order = data_district['pct_free/reduced'].value_counts().index,
             hue='locale')
plt.legend(loc="upper right")

In [None]:
# pct_free/reduced extraction
# Split the range string at the first comma and strip the '[' characters from both
# halves so that each half can be parsed as a number.
pct_free_reduced = data_district['pct_free/reduced'].str.split(',', n=1, expand=True)
# regex=False: '[' has to be matched literally; on its own it is not a valid regular
# expression, and newer pandas no longer treats one-character patterns as literals.
data_district['free'] = pd.to_numeric(pct_free_reduced[0].str.replace('[', '', regex=False))
data_district['reduced'] = pd.to_numeric(pct_free_reduced[1].str.replace('[', '', regex=False))

# Fill missing values with the column mean. The result is assigned back to the column:
# fillna(inplace=True) on a selected column is not guaranteed to update the DataFrame.
data_district['free'] = data_district['free'].fillna(data_district['free'].mean())
data_district['reduced'] = data_district['reduced'].fillna(data_district['reduced'].mean())

# Midpoint of the two parsed numbers
data_district['pct_free/reduced_mean'] = (data_district['free'] + data_district['reduced'])/2

In [None]:
data_district.head()

In [None]:
sns.displot(data=data_district, x='free', hue='locale', kind='kde')
sns.displot(data=data_district, x='reduced', hue='locale', kind='kde')
sns.displot(data=data_district, x='pct_free/reduced_mean', hue='locale', kind='kde')

* Gambar di atas menunjukkan pct_free/reduced mayoritas berada pada **suburb**

In [None]:
sns.displot(data=data_district, x='pct_free/reduced_mean', hue='state', height=8, aspect=.8)

* Gambar di atas menunjukkan banyak **state** memiliki persentase rata-rata pct_free/reduced

In [None]:
sns.countplot(data=data_district, x='county_connections_ratio',
             order = data_district['county_connections_ratio'].value_counts().index,
             hue='locale')
plt.legend(loc="upper right")

* Plot di atas menunjukkan mayoritas county_connections_ratio berada pada rasio 0.18,1
* **Suburb** memiliki county_connections_ratio yang banyak

In [None]:
fig, ax = plt.subplots(figsize=(20,10))
sns.countplot(ax=ax, data=data_district, x='county_connections_ratio',
             order = data_district['county_connections_ratio'].value_counts().index,
             hue='state')
plt.legend(loc="upper right")

* Mayoritas **state** memiliki persentase yang rendah pada pendistribusian data county_connections_ratio

In [None]:
sns.countplot(data=data_district, y='pp_total_raw')
plt.show()

In [None]:
# pp_total_raw extraction
# Split the range string at the first comma and strip the '[' characters from both
# halves so that each half can be parsed as a number.

pp_total_raw = data_district['pp_total_raw'].str.split(",",n=1,expand=True)

# regex=False: '[' has to be matched literally; on its own it is not a valid regular
# expression, and newer pandas no longer treats one-character patterns as literals.
data_district['total'] = pd.to_numeric(pp_total_raw[0].str.replace('[', '', regex=False))
data_district['raw'] = pd.to_numeric(pp_total_raw[1].str.replace('[', '', regex=False))

# Fill missing values with the column mean. The result is assigned back to the column:
# fillna(inplace=True) on a selected column is not guaranteed to update the DataFrame.
data_district['total'] = data_district['total'].fillna(data_district['total'].mean())
data_district['raw'] = data_district['raw'].fillna(data_district['raw'].mean())

# Midpoint of the two parsed numbers
data_district['pp_total_raw_mean'] = (data_district['total'] + data_district['raw'])/2

In [None]:
sns.displot(data=data_district, x="pp_total_raw_mean", hue='state', height=8, aspect=.8)
plt.show()

* Gambar di atas menunjukkan mayoritas **state** terdistribusi pp_total_raw

# Product Data Processing

In [None]:
data_product.head()

In [None]:
data_product.info()

In [None]:
data_product.describe(include='all')

* Terdapat 372 jenis **URL** dan **Product Name**
* Terdapat 290 jenis **Provider/Company Name** 
* Terdapat 5 jenis **Sector(s)** 
* Terdapat 35 jenis **Primary Essential Function**
* **URL** yang paling sering muncul adalah *https://www.splashmath.com*
* **Product Name** yang memiliki frekuensi kemunculan yang tinggi adalah SplashLearn
* **Provider/Company Name** yang banyak digunakan adalah Google LLC
* **Sector(s)** yang banyak muncul adalah Prek-12
* **Primary Essential Function** yang banyak digunakan adalah LC-Digital Learning Platforms

In [None]:
msno.bar(data_product,color='#4895ef', sort="ascending", figsize=(10,5), fontsize=12)
plt.show()

* Missing Value pada data relatif sedikit
* **Sector(s)** dan **Primary Essential Function** mempunyai missing value yang lebih banyak diantara yang lain

In [None]:
plt.figure(figsize=(12, 8))
sns.countplot(y='Provider/Company Name', data=data_product, order=data_product["Provider/Company Name"].value_counts().index[:10])
plt.title("Top 10 Provider/Company Names",font="Serif", size=20)
plt.show()

* Dari gambar di atas terlihat Google LLC adalah provider yang sering digunakan
* Selisih Google LLC dibandingkan provider yang lainnya relatif sangat jauh

In [None]:
# Count how often each of the three sector labels occurs. A product may list several
# sectors in one cell, separated by ";", so each cell is split and flattened first.
labels = ['PreK-12','Higher Ed','Corporate']
sector_counts = (
    data_product["Sector(s)"]
    .dropna()
    .str.split(";")
    .explode()
    .str.strip()
    .value_counts()
    .reindex(labels, fill_value=0)
)
sizes = [int(sector_counts[label]) for label in labels]
d1, d2, d3 = sizes
explode = (0.05, 0.05, 0.05)

# Donut chart: a pie with a white circle drawn over its centre
fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('Sector Distribution', size = 20, font="Serif")
ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.2f%%', pctdistance=0.7, colors=["#ff228a","#20b1fd","#ffb703"])
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()


* **Prek-12** memiliki persentase yang lebih tinggi pada distribusi **Sector(s)**

In [None]:
# Split "Primary Essential Function" at the first "-" only: the part before it goes to
# `primary`, everything after it to `sub`. No whitespace is stripped from either part.
primary_sub = data_product['Primary Essential Function'].str.split("-",n=1,expand=True)
data_product['primary'] = primary_sub[0]
data_product['sub'] = primary_sub[1]

In [None]:
# Pie chart of how many products fall under each `primary` category
primary_counts = data_product['primary'].value_counts()
primary_counts.plot(kind='pie',
                    # one offset per slice: a fixed 4-element list raises as soon as the
                    # number of distinct categories is not exactly 4
                    explode=[0.05] * len(primary_counts),
                    fontsize=12,
                    autopct='%3.1f%%',
                    figsize=(7,7),
                    startangle=135,
                    cmap='Wistia')
plt.ylabel("Primary")

* Mayoritas **Primary Essential Function** adalah LC

In [None]:
sns.catplot(data=data_product, y='sub', kind='count', height=12, aspect=1)

* Gambar di atas menunjukkan **Primary Essential Function** yang sering digunakan adalah Digital Learning Platform

In [None]:
sns.displot(data=data_product, y='Primary Essential Function', hue='Sector(s)', height = 12, aspect=2)
plt.show()

* Plot di atas menunjukkan mayoritas Prek-12 **Sector(s)** banyak terdistribusi dengan LC-Digital Learning

In [None]:
# Word cloud built from the distinct product names.
# dropna(): a missing name would be a float NaN, which str.join cannot concatenate.
products_name = data_product['Product Name'].dropna().unique()
products_name_str = ",".join(products_name)

wordist = WordCloud(width=1000, height=500, random_state=1, background_color='white', collocations=False).generate(products_name_str)
plt.figure(figsize=(15,15))
plt.imshow(wordist)
plt.axis("off")
plt.show()

In [None]:
# providers_name = data_product['Provider/Company Name'].unique()
# providers_name_str = ",".join(providers_name)

# wordist = WordCloud(width=1000, height=500, random_state=1, background_color='white', collocations=False).generate(providers_name_str)
# plt.figure(figsize=(15,15))
# plt.imshow(wordist)
# plt.axis("off")
# plt.show()

# Engagement Data Processing

In [None]:
data_engagements.head()

In [None]:
# Drop engagement rows that have no product id or no district id.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells do not act on a slice of the original (SettingWithCopyWarning).
data_engagements = data_engagements.dropna(subset=['lp_id', 'district_id']).copy()

In [None]:
# Parse the timestamp column and derive calendar features with the vectorised .dt
# accessors instead of calling a Python lambda once per row.
data_engagements['time'] = pd.to_datetime(data_engagements['time'])
data_engagements['quarter'] = data_engagements['time'].dt.quarter

# isocalendar().week is the ISO week number (the same value as Timestamp.weekofyear) but
# comes back as a nullable UInt32 column; convert it to a plain NumPy dtype. float64 is
# used only when missing timestamps are present, because int64 cannot hold NaN.
iso_week = data_engagements['time'].dt.isocalendar().week
week_dtype = 'float64' if iso_week.isna().any() else 'int64'
data_engagements['week'] = iso_week.astype(week_dtype)

In [None]:
# Store both identifiers as strings; lp_id goes through an integer cast first so that
# its text form carries no decimal part.
data_engagements['lp_id'] = data_engagements['lp_id'].astype('int64').astype(str)
data_engagements['district_id'] = data_engagements['district_id'].astype(str)

In [None]:
data_engagements.isnull().sum()

In [None]:
data_engagements.dropna(inplace=True)

In [None]:
# Summary statistics for every column.
# `datetime_is_numeric` exists only in pandas 1.1-1.5; pandas 2.0 removed the keyword
# (datetime columns are always summarised numerically there), so fall back to the plain
# call when the keyword is rejected.
try:
    engagement_summary = data_engagements.describe(include='all', datetime_is_numeric=True)
except TypeError:
    engagement_summary = data_engagements.describe(include='all')
engagement_summary

* Timestamp waktu dimulai sejak tanggal 1 Januari 2020

In [None]:
data_engagements.head()

In [None]:
def lineplot(df, agr_col, target_col, title):
    """Draw ``target_col`` against ``agr_col`` as a line plot with fixed 2020 date markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the columns ``agr_col`` and ``target_col``.
    agr_col : str
        Column used for the x axis. The markers below are datetimes, so the column
        should hold dates.
    target_col : str
        Column used for the y axis; its maximum sets the height of the markers.
    title : str
        Title of the figure.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Draw on an explicit Axes instead of relying on pyplot's "current axes" state
    fig, ax = plt.subplots(figsize=[15, 5])
    y_top = df[target_col].max()

    # Vertical blue line on 2020-02-10
    marker_date = datetime.strptime('2020-02-10', '%Y-%m-%d')
    ax.vlines(x=marker_date, ymin=0, ymax=y_top, color='blue', lw=4)

    # Shaded band from 2020-05-15 to 2020-09-15
    # NOTE(review): presumably the summer break — confirm
    band_start = datetime.strptime('2020-05-15', '%Y-%m-%d')
    band_end = datetime.strptime('2020-09-15', '%Y-%m-%d')
    ax.fill_between([band_start, band_end], 0, y_top, alpha=0.15, color='purple')

    sns.lineplot(data=df, x=agr_col, y=target_col, color='green', ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
# Sum both engagement metrics per day and plot the daily engagement_index total
temp_df = data_engagements.groupby('time', as_index=False)[['engagement_index', 'pct_access']].sum()
lineplot(temp_df, 'time', 'engagement_index', 'Total Engagement per day')

* Dari grafik di atas, terlihat terdapat kenaikan **engagement index** dari bulan Februari menuju Maret
* Terjadi penurunan **engagement index** antara bulan Mei 2020 dan September 2020

In [None]:
def weekly_barplot(df, agr_col, target_col, title):
    """Draw ``target_col`` per ``agr_col`` as a bar plot with two vertical marker lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the columns ``agr_col`` and ``target_col``.
    agr_col : str
        Column used for the x axis (one bar per value).
    target_col : str
        Column used for the bar heights; its maximum sets the height of the markers.
    title : str
        Title of the figure.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax1 = plt.subplots(figsize = [15,5])
    ymin = 0
    ymax = df[target_col].max()
    # Plot the frame that was passed in; this used to read a notebook-level variable
    # named `data`, which only worked when the caller happened to use that name.
    sns.barplot(data = df, x = agr_col, y = target_col, color = 'dodgerblue', ax = ax1)
    if ax1.patches:  # nothing drawn for an empty frame; avoids a division by zero
        ax1.set_box_aspect(10/len(ax1.patches)) #change 10 to modify the y/x axis ratio
    # NOTE(review): a bar plot's x axis is categorical, so 5 and 31 are presumably bar
    # positions rather than values of `agr_col` — confirm the intended weeks.
    ax1.vlines(x=5, ymin=ymin, ymax=ymax, color = 'red', lw = 4)
    ax1.vlines(x=31, ymin=ymin, ymax=ymax, color = 'black', lw = 4)
    ax1.set_title(title)
    plt.show()

In [None]:
# Per-week aggregation: total engagement_index and mean pct_access
temp_df = data_engagements.groupby('week').agg({'engagement_index':'sum', 'pct_access':'mean'}).reset_index(drop = False)
# Leave out weeks 1, 53 and 20-38 before plotting, then order by week.
# NOTE(review): presumably partial weeks and the summer break — confirm.
data = temp_df[~temp_df.week.isin([1, 53] + list(range(20, 39)))].sort_values('week')
weekly_barplot(data, 'week', 'engagement_index', 'Weekly Engagement')

In [None]:
# Daily mean of the two engagement metrics, indexed by date.
# Only the metric columns are averaged: lp_id and district_id hold strings at this point,
# and taking their mean is meaningless (newer pandas raises on non-numeric columns
# instead of silently dropping them).
temp_df = data_engagements.groupby(by = 'time', as_index = True)[['pct_access', 'engagement_index']].mean()

plt.rcParams['figure.figsize']= [15, 8]
# Window shown in this plot and reused by the pct_access plot in the next cell
start_date = datetime(2020,5,1)
end_date = datetime(2020,9,30)
temp_df[(start_date<=temp_df.index) & (temp_df.index<=end_date)].plot(y='engagement_index',grid=True)
plt.show()

In [None]:
plt.rcParams['figure.figsize']= [15, 8]
temp_df[(start_date<=temp_df.index) & (temp_df.index<=end_date)].plot(y='pct_access',grid=True)
plt.show()

# Engagement Data in District Dataset

In [None]:
# Convert district_id to int64,so the data can be merged with engagement data
data_engagements['district_id'] = data_engagements['district_id'].astype(np.int64)

In [None]:
# Rename the "LP ID" column to lp_id so it has the same name as the key column in the engagement data
data_product = data_product.rename({'LP ID': 'lp_id'}, axis=1)

In [None]:
data_product['lp_id'] = data_product['lp_id'].astype(np.int64)

In [None]:
# Merge data_engagements with data_district
engagement_combine = pd.merge(data_engagements,data_district,on=['district_id'])

In [None]:
engagement_combine['lp_id'] = engagement_combine['lp_id'].astype(np.int64)

In [None]:
# Merge with data_product
engagement_combine = pd.merge(engagement_combine,data_product,on=['lp_id'])

In [None]:
# Summary statistics for every column of the merged frame.
# `datetime_is_numeric` exists only in pandas 1.1-1.5; pandas 2.0 removed the keyword
# (datetime columns are always summarised numerically there), so fall back to the plain
# call when the keyword is rejected.
try:
    combined_summary = engagement_combine.describe(include='all', datetime_is_numeric=True)
except TypeError:
    combined_summary = engagement_combine.describe(include='all')
combined_summary

In [None]:
engagement_combine.groupby('Primary Essential Function')[['engagement_index']].median().plot(kind='bar', figsize=(15, 7), color=['blue'])
plt.show()

In [None]:
# Plot the median engagement index per sector
engagement_combine.groupby('Sector(s)')[['engagement_index']].median().plot(kind='bar', figsize=(15, 7), color=['blue'])
plt.show()

* Sector Corporate memiliki engagement index yang paling tinggi

In [None]:
# Plot the median engagement index per locale
# NOTE(review): aggregates `engagement_index` to match the stated purpose of this plot
# and the neighbouring plots; switch back to `pct_access` if that metric was intended,
# and re-check the written findings against the re-run figure.
engagement_combine.groupby(['locale'])[['engagement_index']].median().plot(kind='bar', figsize=(15, 7),color=['green'])
plt.show()

* Rural memiliki engagement index yang lebih tinggi

In [None]:
engagement_combine.groupby(['state'])[['engagement_index']].median().plot(kind='bar', figsize=(15, 7),color=['green'])
plt.show()

# Conclusion

* What is the picture of digital connectivity and engagement in 2020?
    
    Dari grafik yang terlihat, terjadi perubahan yang signifikan ketika COVID-19 terjadi.

* What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?

    Dari data yang ada, pengaruh COVID-19 berakibat pada berubahnya pola belajar secara *offline* menjadi *online*, *Digital Learning Platforms* akan menjadi sarana baru sebagai tempat belajar pada pelajar saat pandemi dan setelah pandemi.
    
* How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?

    Berdasarkan data, engagement index tertinggi terdapat pada **locale Rural** dan **Sector Corporate** dan **State North Dakota**
    

    
    