In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (walk order preserved), then print one path per line.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Hex colour palettes used by the figures in this notebook.
# Each list runs from the darkest to the lightest shade, except the mixed
# (categorical) and diverging palettes.
colors_blue = ["#132C33", "#264D58", "#17869E", "#51C4D3", "#B4DBE9"]
colors_dark = ["#1F1F1F", "#313131", "#636363", "#AEAEAE", "#DADADA"]
colors_red = ["#331313", "#582626", "#9E1717", "#D35151", "#E9B4B4"]
colors_mix = [
    "#17869E", "#264D58", "#179E66", "#D35151",
    "#E9DAB4", "#E9B4B4", "#D3B651", "#6351D3",
]
colors_div = ["#132C33", "#17869E", "#DADADA", "#D35151", "#331313"]

# Render one swatch strip per palette, in the order they are defined above.
for palette in (colors_blue, colors_dark, colors_red, colors_mix, colors_div):
    sns.palplot(palette)


# 1 - INTRODUCTION


Nelson Mandela believed education was the most powerful weapon to change the world. But not every student has equal opportunities to learn. Effective policies and plans need to be enacted in order to make education more equitable—and perhaps your innovative data analysis will help reveal the solution.

Current research shows educational outcomes are far from equitable. The imbalance was exacerbated by the COVID-19 pandemic. There's an urgent need to better understand and measure the scope and impact of the pandemic on these inequities.

Education technology company LearnPlatform was founded in 2014 with a mission to expand equitable access to education technology for all students and teachers. LearnPlatform’s comprehensive edtech effectiveness system is used by districts and states to continuously improve the safety, equity, and effectiveness of their educational technology. LearnPlatform does so by generating an evidence basis for what’s working and enacting it to benefit students, teachers, and budgets.

This analytics competition expects to uncover trends in digital learning. Accomplish this with data analysis about how engagement with digital learning relates to factors like district demographics, broadband access, and state/national level policies and events.

The submissions will inform policies and practices that close the digital divide. With a better understanding of digital learning trends, you may help reverse the long-term learning loss among America’s most vulnerable, making education more equitable.

# PROBLEM STATEMENT


The COVID-19 pandemic has disrupted learning for more than 56 million students in the United States. In the spring of 2020, most states and local governments across the U.S. closed educational institutions to stop the spread of the virus. In response, schools and teachers have attempted to reach students remotely through distance learning tools and digital platforms. To this day, concerns about the widening digital divide and long-term learning loss among America’s most vulnerable learners continue to grow.

# 2 - DATA PREPROCESSING
**PREPARATIONS**

We are preparing packages and source data that will be used in the analysis process. Python packages that will be used in the analysis mainly are for data manipulation (numpy and pandas) and data visualization (matplotlib and seaborn). 

In [None]:
import numpy as np

import pandas as pd

from sklearn import datasets
import squarify

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import ticker

import seaborn as sns

# 3 - DATA SET OVERVIEW

This overview gives a feel for the structure of the data. It also includes a quick look at missing values, basic statistics and data manipulation. There are three datasets: engagement, districts and products.

# 3.1 - ENGAGEMENT VIEW
        
The engagement data are aggregated at the school-district level, and each file represents data from one school district. The 4-digit file name is the `district_id`, which can be used to link to district information in `district_info`. The `lp_id` can be used to link to product information in `product_info`.

This dataset consists of below information:

*time* : date in "YYYY-MM-DD"

*lp_id* : The unique identifier of the product

*pct_access* : Percentage of students in the district who have at least one page-load event of a given product on a given day

*engagement_index* : Total page-load events per one thousand students of a given product and on a given day

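Observations:

- These figures appear to describe the combined engagement data across all district files, whereas the code cells in this section load a single district file (`5802.csv`), so the numbers they print will be smaller.
- There are 22,324,190 rows with 5 columns: the four listed above plus, presumably, a `district_id` column taken from each file name.
- The dataset contains 5,392,397 missing values in total: 541 in lp_id, 13,447 in pct_access and 5,378,409 in engagement_index. The gap in engagement_index is large: about 24.09% of all rows (all missing values together amount to about 24.15% of the row count).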

In [None]:
# Read the engagement log stored in file 5802.csv (one file per district)
# and show the resulting frame as the cell output.
engagement_csv_path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/5802.csv'
engagement_data = pd.read_csv(engagement_csv_path)
engagement_data

In [None]:
# Preview the first rows of the engagement frame (head() defaults to five rows).
engagement_data.head()

In [None]:
# Print the column names, non-null counts and dtypes of the engagement frame.
engagement_data.info()

In [None]:
# One-line size summary: row count, column count and the total number of
# missing cells (per-column NaN counts added together).
n_rows, n_cols = engagement_data.shape
n_missing = engagement_data.isna().sum().sum()
print(f'Number of rows: {n_rows};  Number of columns: {n_cols}; No of missing values: {n_missing}')

In [None]:
# Header line followed by the NaN count of every engagement column.
print('Number of missing Values in every column:', engagement_data.isna().sum(), sep='\n')

# 3.1.1 ENGAGEMENT BASIC STATISTICS

Below are the basic statistics for each variable: count, mean, standard deviation, minimum, 1st quartile, median, 3rd quartile and maximum.


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments pandas reports numeric columns only when the frame has any.
engagement_data.describe()

In [None]:
# Stacked bar chart of missing vs. present values per engagement column.
#
# Fixes relative to the previous version of this cell:
#   * the header texts were copied from the district chart and described the
#     wrong dataset; they now refer to the engagement data;
#   * the "present" series is aligned to the "missing" series explicitly via
#     reindex instead of relying on two independent sorts lining up;
#   * each label is centred using the bar's own width instead of a fixed 0.4.

# Percentage of missing values per column, most-missing column first.
train_null = (engagement_data.isnull().sum() / len(engagement_data) * 100).sort_values(ascending=False)
# Percentage of present values per column, in exactly the same column order,
# so that the positional `bottom=` below always pairs matching columns.
train_notnull = (engagement_data.notnull().sum() / len(engagement_data) * 100).reindex(train_null.index)

fig, ax = plt.subplots(figsize=(14, 8))

# Missing share is drawn from zero; the present share is stacked on top of it.
bars1 = ax.bar(x=train_null.index, height=train_null.values, color=colors_red[0])
bars2 = ax.bar(
    x=train_notnull.index,
    height=train_notnull.values,
    bottom=train_null.values,
    alpha=0.3,
    color=colors_dark[-1],
)

# Write the missing percentage slightly above each missing-value bar.
for bar in bars1:
    height = bar.get_height()
    ax.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=height + 2.5,
        ha='center',
        s="{:.2f}%".format(height),
        fontsize=12,
        color=colors_dark[0],
    )

ax.legend(["Missing values (%)"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)  # keep the grid lines behind the bars
ax.tick_params(labelsize=12)
ax.set_xlabel("Attribute", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("Percentage %", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

# The two header lines are placed in data coordinates above the plotting area,
# so the current axis limits are needed to position them.
xmin, xmax = ax.get_xlim()
ymin, ymax = ax.get_ylim()

plt.text(s="About The Data | Engagement", ha='left', x=xmin, y=ymax*1.19, fontsize=24, color=colors_dark[0])
plt.text(s="Missing values on engagement data", ha='left', x=xmin, y=ymax*1.12, fontsize=24, fontweight='bold', color=colors_dark[0])
plt.title("Share of missing values per column in the engagement data\nbut since this notebook is only for analysis and not modelling, we will not attend to the missing values", loc='left', fontsize=13, color=colors_dark[2])
plt.tight_layout()
plt.show()

# 3.2 DISTRICTS VIEW
The district file includes information about the characteristics of school districts, including data from NCES (2018-19), FCC (Dec 2018), and Edunomics Lab. In this data set, LearnPlatform removed the identifiable information about the school districts. LearnPlatform also used an open source tool ARX (Prasser et al. 2020) to transform several data fields and reduce the risks of re-identification. For data generalization purposes some data points are released with a range where the actual value falls under. Additionally, there are many missing data marked as 'NaN' indicating that the data was suppressed to maximize anonymization of the dataset.

This dataset consists of below information:

*district_id* : The unique identifier of the school district

*state* : The state where the district resides in

*locale* : NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural.

*pct_black/hispanic* : Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data.

*pct_free/reduced* : Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data

*county_connections_ratio* : Ratio of residential fixed high-speed connections (over 200 kbps in at least one direction) to households, based on county-level data from FCC Form 477 (December 2018 version).

*pp_total_raw* : Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD\$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district.

Observations:

There are 223 rows with 7 columns as mentioned above.
This dataset contains 442 missing values, which mainly come from pp_total_raw (115), pct_free/reduced (85) and county_connections_ratio (71).


In [None]:
# Read the district characteristics table and show it as the cell output.
districts_csv_path = '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
data_districts = pd.read_csv(districts_csv_path)
data_districts

In [None]:
# Preview the first rows of the district frame (head() defaults to five rows).
data_districts.head()

In [None]:
# Print the column names, non-null counts and dtypes of the district frame.
data_districts.info()

In [None]:
# One-line size summary: row count, column count and the total number of
# missing cells (per-column NaN counts added together).
n_rows, n_cols = data_districts.shape
n_missing = data_districts.isna().sum().sum()
print(f'Number of rows: {n_rows};  Number of columns: {n_cols}; No of missing values: {n_missing}')

In [None]:
# Header line followed by the NaN count of every district column.
print('Number of missing Values in every column:', data_districts.isna().sum(), sep='\n')

# 3.2.1 - DISTRICTS BASIC STATISTICS

Below are the basic statistics for each variable: count, mean, standard deviation, minimum, 1st quartile, median, 3rd quartile and maximum.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments pandas reports numeric columns only when the frame has any.
data_districts.describe()

In [None]:
# Stacked bar chart: per-column share of missing (dark red) and present
# (light grey) values in the district table.
n_district_rows = len(data_districts)
train_null = (data_districts.isnull().sum() / n_district_rows * 100).sort_values(ascending=False)
train_notnull = (data_districts.notnull().sum() / n_district_rows * 100).sort_values()

fig, ax = plt.subplots(figsize=(14, 8))

# The missing share starts at zero; the present share is stacked above it.
# `bottom=` is matched by position, so this relies on the descending "missing"
# order and the ascending "present" order listing the columns identically.
bars1 = ax.bar(x=train_null.index, height=train_null.values, color=colors_red[0])
bars2 = ax.bar(
    x=train_notnull.index,
    height=train_notnull.values,
    bottom=train_null.values,
    alpha=0.3,
    color=colors_dark[-1],
)

# Annotate every missing-value bar with its percentage, just above the bar.
for missing_bar in bars1:
    missing_pct = missing_bar.get_height()
    ax.text(
        x=missing_bar.get_x() + 0.4,
        y=missing_pct + 2.5,
        s=f"{missing_pct:.2f}%",
        ha='center',
        fontsize=12,
        color=colors_dark[0],
    )

ax.legend(
    ["Missing values (%)"],
    loc='upper center',
    bbox_to_anchor=(0.5, -0.1),
    ncol=5,
    borderpad=1,
    frameon=False,
    fontsize=12,
)
ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)  # grid lines go behind the bars
ax.tick_params(labelsize=12)

# Both axis labels share the same text styling.
axis_label_style = dict(fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_xlabel("Attribute", **axis_label_style)
ax.set_ylabel("Percentage %", **axis_label_style)

# Header lines are positioned in data coordinates above the plot area.
xmin, xmax = ax.get_xlim()
ymin, ymax = ax.get_ylim()

header_style = dict(ha='left', x=xmin, fontsize=24, color=colors_dark[0])
plt.text(s="About The Data | District", y=ymax*1.19, **header_style)
plt.text(s="Missing values on district data", y=ymax*1.12, fontweight='bold', **header_style)
plt.title(
    "All column besides district_id have a missing values\nbut since this notebook is only for analysis and not modelling, we will not attend to the missing values",
    loc='left',
    fontsize=13,
    color=colors_dark[2],
)
plt.tight_layout()
plt.show()

# 3.3 PRODUCTS VIEW
The product file includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by LearnPlatform team. Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

This dataset consists of below information:

*LP ID* : The unique identifier of the product

*URL* : Web Link to the specific product

*Product Name* : Name of the specific product

*Provider/Company Name* : Name of the product provider

*Sector(s)* : Sector of education where the product is used

*Category* : The basic function of the product. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations.

*Subcategory* : Each of these categories has multiple sub-categories with which the products were labeled.

Observations:

There are 372 rows with 7 columns as mentioned above.
This dataset contains 61 missing values: 20 each in Sector(s), Category and Subcategory, and 1 in Provider/Company Name.

In [None]:
# Read the product information table and show it as the cell output.
products_csv_path = '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
data_products = pd.read_csv(products_csv_path)
data_products

In [None]:
# Preview the first rows of the product frame (head() defaults to five rows).
data_products.head()

In [None]:
# Print the column names, non-null counts and dtypes of the product frame.
data_products.info()

In [None]:
# One-line size summary: row count, column count and the total number of
# missing cells (per-column NaN counts added together).
n_rows, n_cols = data_products.shape
n_missing = data_products.isna().sum().sum()
print(f'Number of rows: {n_rows};  Number of columns: {n_cols}; No of missing values: {n_missing}')

In [None]:
# Print a header, then show the NaN count of every product column as the
# cell's output value (isna is the same check as isnull).
print('Number of missing Values in every column:')
data_products.isna().sum()

# 3.3.1 PRODUCTS BASIC STATISTICS
Below are the basic statistics for each variable: count, mean, standard deviation, minimum, 1st quartile, median, 3rd quartile and maximum.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments pandas reports numeric columns only when the frame has any.
data_products.describe()

In [None]:
# Stacked bar chart: per-column share of missing (dark red) and present
# (light grey) values in the product table.
n_product_rows = len(data_products)
train_null = (data_products.isnull().sum() / n_product_rows * 100).sort_values(ascending=False)
train_notnull = (data_products.notnull().sum() / n_product_rows * 100).sort_values()

fig, ax = plt.subplots(figsize=(14, 8))

# The missing share starts at zero; the present share is stacked above it.
# `bottom=` is matched by position, so this relies on the descending "missing"
# order and the ascending "present" order listing the columns identically.
bars1 = ax.bar(x=train_null.index, height=train_null.values, color=colors_red[0])
bars2 = ax.bar(
    x=train_notnull.index,
    height=train_notnull.values,
    bottom=train_null.values,
    alpha=0.3,
    color=colors_dark[-1],
)

# Annotate every missing-value bar with its percentage, just above the bar.
for missing_bar in bars1:
    missing_pct = missing_bar.get_height()
    ax.text(
        x=missing_bar.get_x() + 0.4,
        y=missing_pct + 2.5,
        s=f"{missing_pct:.2f}%",
        ha='center',
        fontsize=12,
        color=colors_dark[0],
    )

ax.legend(
    ["Missing values (%)"],
    loc='upper center',
    bbox_to_anchor=(0.5, -0.1),
    ncol=5,
    borderpad=1,
    frameon=False,
    fontsize=12,
)
ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)  # grid lines go behind the bars
ax.tick_params(labelsize=12)

# Both axis labels share the same text styling.
axis_label_style = dict(fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_xlabel("Attribute", **axis_label_style)
ax.set_ylabel("Percentage %", **axis_label_style)

# Header lines are positioned in data coordinates above the plot area.
xmin, xmax = ax.get_xlim()
ymin, ymax = ax.get_ylim()

header_style = dict(ha='left', x=xmin, fontsize=24, color=colors_dark[0])
plt.text(s="About The Data | Products", y=ymax*1.19, **header_style)
plt.text(s="Missing values on products data", y=ymax*1.12, fontweight='bold', **header_style)
plt.title(
    "There are +/- 5% missing row on column Sector(s) and Primary Essentials Function\nbut since this notebook is only for analysis and not modelling, we will not attend to the missing values",
    loc='left',
    fontsize=13,
    color=colors_dark[2],
)
plt.tight_layout()
plt.show()