# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import gc
import glob

from tqdm.notebook import tqdm
tqdm.pandas()

import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True) 
import plotly.graph_objs as go

from plotnine import *

%matplotlib inline

# Loading Datasets

In [None]:
# Root folder of the competition files; the loading cells join file names onto this path.
input_dir = "../input/learnplatform-covid19-impact-on-digital-learning"

In [None]:
# IPython shell escape: list what is inside the input folder.
!ls $input_dir

### Districts Information Data

The district file `districts_info.csv` includes information about the characteristics of school districts, including data from [NCES](https://nces.ed.gov/) (2018-19), [FCC](https://www.fcc.gov/) (Dec 2018), and [Edunomics Lab](https://edunomicslab.org/). In this data set, we removed the identifiable information about the school districts. We also used an open source tool [ARX](https://arx.deidentifier.org/) [(Prasser et al. 2020)](https://onlinelibrary.wiley.com/doi/full/10.1002/spe.2812) to transform several data fields and reduce the risks of re-identification. For data generalization purposes some data points are released with a range where the actual value falls under. Additionally, there are many missing data marked as 'NaN' indicating that the data was suppressed to maximize anonymization of the dataset. 

| Name | Description |
| :--- | :----------- |
| district_id | The unique identifier of the school district |
| state | The state where the district resides in |
| locale | NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See [Locale Boundaries User's Manual](https://eric.ed.gov/?id=ED577162) for more information. |
| pct_black/hispanic | Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data |
| pct_free/reduced | Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data |
| county_connections_ratio | `ratio` (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version). See [FCC data](https://www.fcc.gov/form-477-county-data-internet-access-services) for more information. |
| pp_total_raw | Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district. |

source: README.md

In [None]:
# Read the district characteristics table, then display its (rows, columns).
districts_csv = os.path.join(input_dir, "districts_info.csv")
district_df = pd.read_csv(districts_csv)
district_df.shape

In [None]:
# Preview the first rows of the district table.
district_df.head()

In [None]:
# Number of districts per state, drawn as a bar chart.
state_stat = district_df['state'].value_counts()

plt.figure(figsize=(20, 8))
bar_plot = sns.barplot(x=state_stat.index, y=state_stat)
bar_plot.set(
    xlabel='State',
    ylabel='Counts',
    title="State Freq Chart in District Information data",
)
# Keep the return value in `t` so the notebook does not echo the tick objects.
t = plt.xticks(rotation=90)

In [None]:
# Share of districts per locale; dropna=False keeps districts with a missing locale as their own slice.
locale_stats = district_df['locale'].value_counts(dropna=False)

locale_pie = go.Pie(
    labels=locale_stats.index,
    values=locale_stats,
    title="Locale Distribution in District Data",
)
fig = go.Figure(data=[locale_pie])
fig.update_layout(font=dict(family="Courier New", size=18))
fig.show()

The majority of the districts in the data belong to the suburban locale.

In [None]:
def get_tuples_from_limits(x):
    """Turn a textual range such as "[0.2, 0.4[" into a ``(lower, upper)`` float tuple.

    The first character of the part before the first comma and the last
    character of the part after it are treated as brackets and dropped.
    Anything that is not a string (e.g. NaN) is returned unchanged.
    """
    if not isinstance(x, str):
        return x
    pieces = x.split(',')
    lower = float(pieces[0][1:])
    upper = float(pieces[1][:-1])
    return (lower, upper)
    
def get_avg_in_limit(x):
    """Return the midpoint of a range given as text ("[a, b[") or as a tuple.

    For a tuple, the first two items are used as the bounds. Any other value
    (e.g. NaN) is passed through untouched.
    """
    if isinstance(x, tuple):
        lower, upper = x[0], x[1]
    elif isinstance(x, str):
        pieces = x.split(',')
        lower = float(pieces[0][1:])    # drop the opening bracket
        upper = float(pieces[1][:-1])   # drop the closing bracket
    else:
        return x
    return (lower + upper)*0.5

In [None]:
# Columns whose values are parsed from range text into (lower, upper) tuples.
interval_cols = [
    'pct_black/hispanic',
    'pct_free/reduced',
    'county_connections_ratio',
    'pp_total_raw',
]

for col in interval_cols:
    district_df[col] = district_df[col].map(get_tuples_from_limits)

# Midpoint of each expenditure range, used as a single numeric value per district.
district_df['pp_total_raw_mean'] = district_df['pp_total_raw'].map(get_avg_in_limit)

In [None]:
# Median and mean of the expenditure midpoints within each locale, plotted side by side.
pp_by_locale = district_df.groupby(['locale'])['pp_total_raw_mean']
local_pp_total_median = pp_by_locale.median()
local_pp_total_mean = pp_by_locale.mean()

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
fig.suptitle('pp_total_raw average for different locale')

# Left panel: median per locale.
sns.barplot(x=local_pp_total_median.index, y=local_pp_total_median, ax=ax1)
ax1.set_ylabel('local_pp_total_median')

# Right panel: mean per locale.
sns.barplot(x=local_pp_total_mean.index, y=local_pp_total_mean, ax=ax2)
ax2.set_ylabel('local_pp_total_mean')

### Product information data

The product file `products_info.csv` includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by our team. Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

| Name | Description |
| :--- | :----------- |
| LP ID| The unique identifier of the product |
| URL | Web Link to the specific product |
| Product Name | Name of the specific product |
| Provider/Company Name | Name of the product provider |
| Sector(s) | Sector of education where the product is used |
| Primary Essential Function | The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories have multiple sub-categories with which the products were labeled |

Source: README.md

In [None]:
# Read the product characteristics table, then display its (rows, columns).
products_csv = os.path.join(input_dir, "products_info.csv")
product_df = pd.read_csv(products_csv)
product_df.shape

In [None]:
# Preview the first rows of the product table.
product_df.head()

In [None]:
def get_product_fn_cat(x):
    """Return the text before the first '-' of a function label, whitespace-trimmed.

    Non-string values (e.g. NaN for unlabeled products) are returned as-is.
    """
    if not isinstance(x, str):
        return x
    category, _, _ = x.partition('-')
    return category.strip()

def get_product_fn_subcat(x):
    """Return the second '-'-separated segment of a function label, trimmed.

    Args:
        x: A label such as "LC - Digital Learning Platforms", or a non-string
            placeholder (e.g. NaN) for a product without a label.

    Returns:
        The stripped second segment for a string containing at least one '-';
        ``None`` for a string with no '-' (it has no sub-category, and
        indexing the split result would otherwise raise ``IndexError``);
        the input itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    segments = x.split('-')
    if len(segments) < 2:
        # Category-only label: report the sub-category as missing.
        return None
    return segments[1].strip()

# Split the combined function label into separate category and sub-category columns.
primary_fn = product_df['Primary Essential Function']
product_df['product_fn_category'] = primary_fn.map(get_product_fn_cat)
product_df['product_fn_sub_category'] = primary_fn.map(get_product_fn_subcat)

In [None]:
# Preview the product table again, now including the derived category columns.
product_df.head()

In [None]:
# Number of products per function category, shown as a pie chart.
product_fn_cat_stat = product_df['product_fn_category'].value_counts()

category_pie = go.Pie(
    labels=product_fn_cat_stat.index,
    values=product_fn_cat_stat,
    title="Product function category distribution in products data",
)
fig = go.Figure(data=[category_pie])
fig.update_layout(font=dict(family="Courier New", size=18))
fig.show()

Predominantly `Learning & Curriculum` based products

In [None]:
# Number of products per function sub-category, drawn as horizontal bars.
product_fn_sub_cat_stat = product_df['product_fn_sub_category'].value_counts()

plt.figure(figsize=(15, 8))
bar_plot = sns.barplot(
    x=product_fn_sub_cat_stat,
    y=product_fn_sub_cat_stat.index,
    orient='h',
)
bar_plot.set(
    xlabel='Counts',
    ylabel='Subcategory of Product Primary Function',
    title="Subcategory of Product Primary Function Distribution",
)
# Keep the return value in `t` so the notebook does not echo the tick objects.
t = plt.xticks(rotation=90)

In [None]:
# Number of products per education sector, shown as a pie chart.
product_sector_stat = product_df['Sector(s)'].value_counts()

sector_pie = go.Pie(
    labels=product_sector_stat.index,
    values=product_sector_stat,
    title="Product sector distribution in products data",
)
fig = go.Figure(data=[sector_pie])
fig.update_layout(font=dict(family="Courier New", size=18))
fig.show()

##### Top 20 companies (about the top 5%) by number of products provided

In [None]:
# How many providers to show.
N = 20

# Deduplicate (provider, product) pairs first so a product listed twice counts once,
# then keep the N providers with the most products.
provider_products = product_df[['Provider/Company Name', 'Product Name']].drop_duplicates()
products_per_provider = provider_products['Provider/Company Name'].value_counts()
company_stat = products_per_provider[:N]

plt.figure(figsize=(15, 8))
bar_plot = sns.barplot(x=company_stat, y=company_stat.index, orient='h')
bar_plot.set(
    xlabel='Number of Products',
    ylabel='Company/Provider Name',
    title="Number of Products provided by a company/provider",
)
# Keep the return value in `t` so the notebook does not echo the tick objects.
t = plt.xticks(rotation=90)

### Engagement data
The engagement data are aggregated at school district level, and each file in the folder `engagement_data` represents data from one school district. The 4-digit file name represents `district_id` which can be used to link to district information in `districts_info.csv`. The `lp_id` can be used to link to product information in `products_info.csv`.

| Name | Description |
| :--- | :----------- |
| time | date in "YYYY-MM-DD" |
| lp_id | The unique identifier of the product |
| pct_access | Percentage of students in the district have at least one page-load event of a given product and on a given day |
| engagement_index | Total page-load events per one thousand students of a given product and on a given day |

source: README.md

In [None]:
# Load a single engagement file (1000.csv) as a sample and preview it.
example_csv = os.path.join(input_dir, "engagement_data", "1000.csv")
engagement_df_example = pd.read_csv(example_csv)
engagement_df_example.head()

In [None]:
# Smallest and largest `time` value in the sample file.
example_times = engagement_df_example['time']
example_times.min(), example_times.max()

About one year of engagement data seems to be present for a typical district (each engagement file covers a single district).

## Joining Data

Code reference from [here](https://www.kaggle.com/ruchi798/learnplatform-covid-19-impact-on-digital-learning)

In [None]:
# Stack every per-district engagement CSV into one frame, tagging each row
# with the district id taken from its file name.
engagement_data_path = os.path.join(input_dir, "engagement_data")
all_files = glob.glob(os.path.join(engagement_data_path, "*.csv"))

li = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # Take the id from the file's base name ("<id>.csv") instead of a fixed
    # position in the "/"-split path, which only worked for one specific
    # depth of `input_dir` and one path separator.
    district_id = os.path.splitext(os.path.basename(filename))[0]
    df["district_id"] = district_id
    li.append(df)

engagement_df = pd.concat(li)
# Each file brings its own 0..n index; replace it with one continuous index.
engagement_df = engagement_df.reset_index(drop=True)
engagement_df.shape

In [None]:
# Preview the first rows of the combined engagement table.
engagement_df.head()

In [None]:
# Attach product attributes to each engagement row (inner join: product `LP ID` == engagement `lp_id`).
products_engagement_data = pd.merge(
    left=product_df,
    right=engagement_df,
    how='inner',
    left_on='LP ID',
    right_on='lp_id',
)
products_engagement_data.head()

In [None]:
# The ids taken from the file names are text; make them integers before joining.
district_ids_as_int = engagement_df["district_id"].astype(str).astype(int)
engagement_df["district_id"] = district_ids_as_int

# Attach district attributes to each engagement row (inner join on `district_id`).
districts_engagement_data = pd.merge(
    left=district_df,
    right=engagement_df,
    how='inner',
    left_on='district_id',
    right_on='district_id',
)
districts_engagement_data.head()

In [None]:
# Show the (rows, columns) of both joined tables side by side.
products_engagement_data.shape, districts_engagement_data.shape

In [None]:
# Convert the `time` column of both joined tables to datetimes, in place.
for joined_df in (products_engagement_data, districts_engagement_data):
    joined_df['time'] = pd.to_datetime(joined_df['time'])

In [None]:
# Add a `month` column taken from `time` to both joined tables, in place.
for joined_df in (products_engagement_data, districts_engagement_data):
    joined_df['month'] = joined_df['time'].dt.month

Engagement trends of a product, averaged across all districts

Getting the top 20 products by mean daily `pct_access`

In [None]:
# Rank products by their mean `pct_access` over all rows and keep the 20 highest names.
product_access = products_engagement_data[['Product Name', 'pct_access']]
mean_access_by_product = product_access.groupby(['Product Name'])['pct_access'].mean()
ranked_products = mean_access_by_product.sort_values(ascending=False)
top_20_products = ranked_products.index[:20].tolist()
top_20_products

In [None]:
# Keep only rows of the top-20 products, and only the columns the plot needs.
is_top_product = products_engagement_data['Product Name'].isin(top_20_products)
top_product_rows = products_engagement_data[is_top_product].reset_index(drop=True)
plot_df = top_product_rows[['Product Name', 'time', 'pct_access']]

plot_df.head()

In [None]:
# One column per product, one row per date; pivot_table aggregates duplicates
# with its default function. Each product gets its own facet, two per row.
time_series_df = plot_df.pivot_table(
    values='pct_access',
    index='time',
    columns='Product Name',
)
fig = px.area(
    time_series_df,
    facet_col="Product Name",
    facet_col_wrap=2,
    width=1600,
    height=2000,
)
fig.show()

### Engagement trends on state level

Let's see the engagement for top 10 states

In [None]:
# Rank states by their mean `pct_access` over all rows and keep the 10 highest names.
state_access = districts_engagement_data[['state', 'pct_access']]
mean_access_by_state = state_access.groupby(['state'])['pct_access'].mean()
ranked_states = mean_access_by_state.sort_values(ascending=False)
top_10_states = ranked_states.index[:10].tolist()
top_10_states

In [None]:
# Keep only rows of the top-10 states, and only the columns the plot needs.
is_top_state = districts_engagement_data['state'].isin(top_10_states)
top_state_rows = districts_engagement_data[is_top_state].reset_index(drop=True)
plot_df = top_state_rows[['state', 'time', 'pct_access']]

plot_df.head()

In [None]:
# One column per state, one row per date; pivot_table aggregates duplicates
# with its default function. Each state gets its own facet, two per row.
time_series_df = plot_df.pivot_table(
    values='pct_access',
    index='time',
    columns='state',
)
fig = px.area(
    time_series_df,
    facet_col="state",
    facet_col_wrap=2,
    width=1600,
    height=1000,
)
fig.show()

# Observations from trends

Common Observations in trends

* There is a clear `weekday`/`weekend` pattern: engagement dips every weekend (as one would expect)

* Most engagement flattens out from June to August, probably because of the summer vacation

* The state of Arizona has an appreciably high average engagement (`pct_access`)

* Some products of Google LLC show predominant engagement compared to products from other providers

# Writing the joined data

In [None]:
# districts_engagement_data.to_csv("districts_engagement_data.csv", index=False)
# products_engagement_data.to_csv("products_engagement_data.csv", index=False)