In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns 
import matplotlib as mpl
from matplotlib.colors import LinearSegmentedColormap
from IPython.core.display import display, HTML
from folium.plugins import HeatMap, MarkerCluster, FastMarkerCluster

mpl.style.use(['ggplot'])

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Intro

The COVID-19 pandemic has disrupted learning for more than 56 million students in the United States. In the spring of 2020, most states and local governments across the U.S. closed educational institutions to stop the spread of the virus. In response, schools and teachers have attempted to reach students remotely through distance learning tools and digital platforms. To this day, concerns about the exacerbating digital divide and long-term learning loss among America’s most vulnerable learners continue to grow. In this notebook, I will explore the effect of the COVID-19 pandemic in 2020 on online platform usage, especially for education services.

# The Data

We include three basic sets of files to help you get started. The engagement data are based on LearnPlatform’s Student Chrome Extension. The extension collects page load events of over 10K education technology products in our product library, including websites, apps, web apps, software programs, extensions, ebooks, hardwares, and services used in educational institutions. The engagement data have been aggregated at school district level, and each file represents data from one school district. The product file includes information about the characteristics of the top 372 products with most users in 2020. The district file includes information about the characteristics of school districts, including data from National Center for Education Statistics (NCES), The Federal Communications Commission (FCC), and Edunomics Lab. In addition to the files provided, we encourage you to use other public data sources such as examples listed below.

### District Data

The district file districts_info.csv includes information about the characteristics of school districts, including data from

NCES (2018-19), FCC (Dec 2018), and Edunomics Lab. Steps taken to preserve Privacy.

Identifiable information about the school districts has been removed. An open source tool ARX (Prasser et al. 2020) was used to transform several data fields and reduce the risks of re-identification. For data generalization purposes some data points are released with a range where the actual value falls under. Additionally, there are many missing data marked as 'NaN' indicating that the data was suppressed to maximize anonymization of the dataset.

In [None]:
# Load the district characteristics table and display it.
districts_info = pd.read_csv(
    "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
)
districts_info

From the district data, we have 233 rows and 7 columns. Each row contains:

- district_id: The unique identifier of the school district
- state: The state where the district resides in
- locale: NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See Locale Boundaries User's Manual for more information.
- pct_black/hispanic: Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data
- pct_free/reduced: Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data
- countyconnectionsratio: ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC From 477 (December 2018 version). See FCC data for more information.
- pptotalraw: Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district.

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric columns.
districts_info.describe(include='all')

In [None]:
# Horizontal bar chart: number of districts per state, most represented state first.
state_order = districts_info.state.value_counts().index

fig, ax = plt.subplots(figsize=(16, 10))
sns.countplot(
    data=districts_info,
    y="state",
    order=state_order,
    palette="pastel",
    linewidth=3,
    ax=ax,
)
ax.set_title("State Distribution", size=18)

sns.despine()
plt.show()

From the chart above, we can see that Connecticut has the most districts represented, followed by Utah.

In [None]:
# Donut chart showing how the districts are split across locale types.
fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('Locale Type Distribution', size = 20)

# Count once so that labels and sizes are guaranteed to line up.
locale_counts = districts_info.locale.value_counts()
labels = list(locale_counts.index)
sizes = locale_counts.values

# Pull out only the last slice (value_counts sorts descending, so this is the
# least frequent locale). The list is sized from the data because ax.pie
# requires len(explode) == len(sizes); a hard-coded 4-tuple fails otherwise.
explode = [0.1 if i == len(sizes) - 1 else 0 for i in range(len(sizes))]

ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.0f%%', pctdistance=0.7, colors=["#FFFF33","#ff9100","#eaaa00","#6d6875"])
# White circle in the middle turns the pie into a donut.
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()

Suburb is the most represented locale, while Town is the least represented.

In [None]:
# Grouped bar chart: number of districts per state, split by locale type.
plt.figure(figsize=(18,10))

ax=sns.countplot(data=districts_info,x='state',palette='pastel', hue='locale')

plt.xticks(rotation=45)
plt.title("State and its Locality")
plt.legend(loc='upper right')

# Annotate each bar with its count.
for p in ax.patches:
    height = p.get_height()
    # Skip patches with no real bar (NaN or zero height); otherwise the chart
    # gets stray "nan"/"0" labels for state/locale pairs that do not occur.
    if not np.isfinite(height) or height <= 0:
        continue
    ax.text(
        p.get_x() + p.get_width() / 2,
        height + 0.25,
        f"{int(height)}",   # counts are whole numbers; avoid printing "3.0"
        ha='center',        # centre the label over the bar
        fontsize=11,
    )

plt.show()

From the data, we know that:
- there are 23 states and 4 locale types
- Connecticut has the highest number of districts
- Suburb is the most frequent locale type, while Town is the least frequent

#### Product Data

The product file products_info.csv includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by our team. Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

In [None]:
# Load the product characteristics table and display it.
products_info = pd.read_csv(
    "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
)
products_info

From the products data, we have 372 rows and 6 columns. Each row contains:

- LP ID: The unique identifier of the product
- URL: Web Link to the specific product
- Product Name: Name of the specific product
- Provider/Company Name: Name of the product provider
- Sector(s): Sector of education where the product is used
- Primary Essential Function: The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories have multiple sub-categories with which the products were labeled

In [None]:
# Horizontal bar chart of the ten providers that have the most products in the table.
top_providers = products_info['Provider/Company Name'].value_counts().index[:10]

fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(
    data=products_info,
    y="Provider/Company Name",
    order=top_providers,
    palette='pastel',
    ax=ax,
)
ax.set_title('Top 10 Provider Platform with the Most Product in 2020', size=18)
sns.despine()
plt.show()

Google LLC is the provider with the most products (30), followed by Microsoft and Houghton Mifflin Harcourt, tied at 6 products each. No provider other than Google has more than 10 products.

In [None]:
# Horizontal bar chart of product counts per primary essential function, most common first.
function_order = products_info['Primary Essential Function'].value_counts().index

fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(
    data=products_info,
    y="Primary Essential Function",
    order=function_order,
    palette='pastel',
    ax=ax,
)
ax.set_title('Distribution of Primary Essential Function of Platform in 2020', size=18)
sns.despine()
plt.show()

From the chart above, LC (Learning & Curriculum) functions are the most represented in this data, along with Classroom Management.

In [None]:
# Number of products per sector, largest first.
# Counting the 'Sector(s)' column directly (instead of counting 'LP ID' per group)
# keeps this cell runnable even after 'LP ID' is renamed to 'lp_id' further down.
# The count column keeps the name 'LP ID' so the resulting frame has the same shape as before.
data = (
    products_info['Sector(s)']
    .value_counts()
    .rename_axis('Sector(s)')
    .reset_index(name='LP ID')
)

plt.figure(figsize = (16,8))

sns.barplot(data=data, x="Sector(s)", y="LP ID")

plt.title('Products Sector Distribution',size=18)
# The x axis shows sectors, not companies; the y axis is a product count.
plt.xlabel('Sector(s)',size=14)
plt.ylabel('Number of Products',size=14)
plt.xticks(rotation=45)
sns.despine()
plt.show()

From the chart above, PreK-12 is the most frequent platform sector in this dataset. PreK-12 covers pre-kindergarten through 12th grade.<br>
It is followed by sectors that also include Higher Education.

#### Engagement Data

The engagement data are aggregated at school district level, and each file in the folder engagement_data represents data from one school district. The 4-digit file name represents district_id, which can be used to link to district information in districts_info.csv. The lp_id can be used to link to product information in products_info.csv.

In [None]:
PATH = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data' 

# Read one engagement CSV per district listed in districts_info and tag every
# row with the district it came from (the id is only present in the file name).
temp = []
for district in districts_info.district_id.unique():
    df = pd.read_csv(f'{PATH}/{district}.csv', index_col=None, header=0).assign(
        district_id=district
    )
    temp.append(df)

# Stack all districts into a single frame with a fresh 0..n-1 index.
engagement = pd.concat(temp, ignore_index=True)

In [None]:
# Display the combined engagement table.
engagement

We have 22,324,190 rows and 5 columns: the `district_id` added while loading each file, plus:

- time: date in "YYYY-MM-DD"
- lp_id: The unique identifier of the product
- pct_access: Percentage of students in the district have at least one page-load event of a given product and on a given day
- engagement_index: Total page-load events per one thousand students of a given product and on a given day

#### Is There any missing value in the data ?

In [None]:
# Number of missing values in each column of the district table.
districts_info.isnull().sum()

In [None]:
# Number of missing values in each column of the product table.
products_info.isnull().sum()

In [None]:
# Number of missing values in each column of the engagement table.
engagement.isnull().sum()

Additionally, there are many missing data marked as 'NaN' indicating that the data was suppressed to maximize anonymization of the dataset.

# Merging the Data

So now we need to merge the district, product, and engagement data. After merging, trends and other information in the data are easier to see. <br>
We can merge the engagement and district data on district_id, and then merge the result with the products data on lp_id.

In [None]:
# Treat a missing engagement_index as zero engagement.
engagement['engagement_index'] = engagement['engagement_index'].fillna(0)
# Align the product key name with the engagement table so the two can be joined on it.
products_info = products_info.rename(columns={'LP ID': 'lp_id'})

In [None]:
# Lookup tables keyed by their identifiers.
districts_by_id = districts_info.set_index('district_id')
products_by_id = products_info.set_index('lp_id')

# df1: engagement rows enriched with district attributes.
df1 = engagement.join(districts_by_id, on='district_id')
# df2: df1 enriched with product attributes; rows with no Product Name
# (no matching product in products_info, or a missing name) are dropped.
df2 = df1.join(products_by_id, on='lp_id').dropna(subset=['Product Name'])
df2.shape

In [None]:
# Preview the engagement + district join.
df1.head()

In [None]:
# Preview the fully merged table (engagement + district + product).
df2.head()

In [None]:
# Column dtypes and memory footprint of the merged table.
df2.info()

In [None]:
# Number of missing values in each column of the merged table.
df2.isnull().sum()

# Analysis

#### **Question :** how's the online platform engagement trend in 2020 ?

In [None]:
# Parse the date strings once (unparseable values become NaT because of
# errors='coerce'), then derive the calendar month number from the parsed values.
parsed_time = pd.to_datetime(df2['time'], errors='coerce')
df2['time'] = parsed_time
df2['month'] = parsed_time.dt.month

In [None]:
# Mean engagement index for each calendar month, ordered from the first month to the last.
engagement_per_month = (
    df2.groupby(['month'], as_index=False)['engagement_index']
    .mean()
    .sort_values(by=['month'], ascending=True)
)

In [None]:
# Line chart of the monthly mean engagement index across all districts.
fig, ax = plt.subplots(figsize=(16, 8))

sns.lineplot(data=engagement_per_month, x="month", y="engagement_index", color='b', ax=ax)
ax.set_title('Monthly Average Engagement in 2020 (All District)', size=18)
ax.set_xlabel('Month', size=14)

sns.despine()
plt.show()

The chart above shows the monthly average engagement index. There is a significant drop from April to July, followed by a rebound from July to August. <br>
Since March, when the WHO declared COVID-19 a pandemic, there has been a slight increase in the engagement index. I think the summer holiday explains why the engagement index drops.

#### **Question :** Which Platform is most Used ?

In [None]:
# Mean engagement index per product, highest first.
# 'Primary Essential Function' is carried along only as a descriptive column.
# dropna=False keeps products whose function label is missing; by default a
# groupby silently drops every row with NaN in any grouping key, which would
# remove unlabelled products from the ranking. (Requires pandas >= 1.1.)
top_product = (
    df2.groupby(['Product Name', 'Primary Essential Function'], as_index=False, dropna=False)['engagement_index']
    .mean()
    .sort_values(by=['engagement_index'], ascending=False)
)

In [None]:
# Display the per-product ranking.
top_product

In [None]:
# Bar chart of the ten products with the highest mean engagement index.
plt.figure(figsize = (16,8))

sns.barplot(data=top_product.head(10), x="Product Name", y= "engagement_index")

plt.title('Top 10 Product with the Most Average Daily Engagement in 2020 (All District)',size=18)
# The x axis shows product names (not companies), so label it accordingly.
plt.xlabel('Product Name',size=14)
plt.ylabel('Average Engagement Index',size=14)
plt.xticks(rotation=45)
sns.despine()
plt.show()

Google Docs is the most used platform in the United States in 2020. Google Docs is an essential platform for creating documents and collaborating with other people. It is followed by Google Classroom.

#### **Question :** Which Platform Category with the most engagement index ?

In [None]:
# Mean engagement index per primary essential function, highest first.
top_category_platform = (
    df2.groupby(['Primary Essential Function'], as_index=False)['engagement_index']
    .mean()
    .sort_values(by=['engagement_index'], ascending=False)
)

In [None]:
# Preview the highest-ranked categories.
top_category_platform.head()

In [None]:
plt.figure(figsize = (16,8))

sns.barplot(data=top_category_platform[:10], y="Primary Essential Function", x= "engagement_index")
plt.title('Top 10 Category Platform with the Most Average Daily Engagement in 2020 (All District)', size=18)
sns.despine()
plt.show()

As we can see from the chart above, SDO - Learning Management Systems (LMS) is the most used category in 2020, followed by the LC categories.

#### **Question :** how's the trend of engagement index from Learning Management Systems ?

In [None]:
# Monthly mean engagement index, restricted to Learning Management Systems products.
lms_engage = (
    df2.loc[df2['Primary Essential Function'] == 'SDO - Learning Management Systems (LMS)']
    .groupby(['month'], as_index=False)['engagement_index']
    .mean()
    .sort_values(by=['month'], ascending=True)
)

In [None]:
# Line chart of the monthly mean engagement index for LMS products.
fig, ax = plt.subplots(figsize=(16, 8))

sns.lineplot(data=lms_engage, x="month", y="engagement_index", color='b', ax=ax)
ax.set_title('Monthly Average Engagement of LMS in 2020 (All District)', size=18)
ax.set_xlabel('Month', size=14)

sns.despine()
plt.show()

As noted before, since March, when the WHO declared COVID-19 a pandemic, there has been a slight increase in the engagement index. The summer holiday likely explains why the engagement index of Learning Management Systems drops and then increases once the holiday is over.

#### **Question :** which states has the most visit to Learning Management Systems Platform in 2020 ?

In [None]:
# Mean LMS engagement index per state, highest first.
state_most_visit_lms = (
    df2.loc[df2['Primary Essential Function'] == 'SDO - Learning Management Systems (LMS)']
    .groupby(['state'], as_index=False)['engagement_index']
    .mean()
    .sort_values(by=['engagement_index'], ascending=False)
)

In [None]:
# Bar chart of the five states with the highest mean LMS engagement index.
fig, ax = plt.subplots(figsize=(12, 6))

sns.barplot(data=state_most_visit_lms.head(5), x="state", y="engagement_index", ax=ax)

ax.set_title('Top 5 State that Often Visited Learning Management Systems in 2020', size=18)
ax.set_xlabel('State', size=14)

# Tilt the state names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
sns.despine()
plt.show()

New Hampshire is the state with the most visits to Learning Management Systems in 2020, closely followed by Arizona and New York.

#### **Question :** which states has the least visit to Learning Management Systems Platform in 2020 ?

In [None]:
# Bar chart of the five states with the lowest mean LMS engagement index.
fig, ax = plt.subplots(figsize=(12, 6))

sns.barplot(data=state_most_visit_lms.tail(5), x="state", y="engagement_index", ax=ax)

ax.set_title('Top 5 State that the Least Often Visited Learning Management Systems in 2020', size=18)
ax.set_xlabel('State', size=14)

# Tilt the state names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
sns.despine()
# The frame is sorted descending, so flip the axis to put the lowest state on the left.
ax.invert_xaxis()
plt.show()

As we can see, North Carolina is the state with the fewest visits to Learning Management Systems in 2020, followed by Missouri and Virginia.

#### Visualizing the Engagement Index with Folium and heatmap

In [None]:
from geopy.geocoders import Nominatim

In [None]:
from geopy.extra.rate_limiter import RateLimiter

# One row per known state. Missing states are dropped first so that NaN is
# never sent to the geocoder as a query.
locations = pd.DataFrame({"state": districts_info['state'].dropna().unique()})

# NOTE(review): Nominatim's usage policy asks for a user agent that identifies
# the application; "app" is kept from the original — consider a more specific value.
geolocator = Nominatim(user_agent="app")
# Space requests at least one second apart, per the public Nominatim usage policy.
# RateLimiter also returns None when a request keeps failing, which the loop
# below already treats as "not found".
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Look up the latitude and longitude of every state.
lat = []
lon = []
for state_name in locations['state']:
    # A structured query pins the lookup to a US state; a bare name such as
    # "Georgia" or "New York" can resolve to a different place.
    match = geocode({"state": state_name, "country": "United States"})
    if match is None:
        lat.append(np.nan)
        lon.append(np.nan)
    else:
        lat.append(match.latitude)
        lon.append(match.longitude)

locations['lat'] = lat
locations['lon'] = lon

In [None]:
# Mean engagement index per state.
state_engagement = df2.groupby(['state'], as_index=False)['engagement_index'].mean()

# Attach each state's coordinates, then drop rows with any missing value
# (for example, states that could not be geocoded).
final_loc = pd.merge(state_engagement, locations, on='state', how="left").dropna()
final_loc

In [None]:
import folium
from folium import plugins

# Base map.
# NOTE(review): newer folium releases may no longer bundle the 'Stamen Terrain'
# tiles — confirm against the installed version.
us_map = folium.Map(location=[38,-97], zoom_start=5, tiles='Stamen Terrain')

# Heat layer built from (lat, lon, engagement_index) rows.
# NOTE(review): `zoom` does not appear to be a documented HeatMap option
# (`max_zoom` may have been intended) — confirm.
heat_points = final_loc[['lat','lon','engagement_index']]
HeatMap(heat_points, zoom=20, radius=20).add_to(us_map)

# One clustered marker per state; the popup shows that state's mean engagement index.
average_engagement = plugins.MarkerCluster().add_to(us_map)
for row in final_loc.itertuples(index=False):
    marker = folium.Marker(
        location=[row.lat, row.lon],
        icon=None,
        popup=row.engagement_index,
    )
    marker.add_to(average_engagement)

us_map