# Load libraries

In [None]:
import pandas as pd
import datatable as dt
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style

In [None]:
# Apply seaborn's default theme first, then layer the matplotlib font/title overrides on top
sns.set()
plt.rcParams.update({
    "font.size": 12,
    "axes.titlesize": 20,
    "axes.titlepad": 20,
})

# Load data

In [None]:
# Both metadata tables are read from the same competition input folder
INPUT_DIR = "../input/learnplatform-covid19-impact-on-digital-learning"

products_data = pd.read_csv(INPUT_DIR + "/products_info.csv")
districts_data = pd.read_csv(INPUT_DIR + "/districts_info.csv")

In [None]:
# Path template for the per-district engagement files; the district id fills the placeholder
engagement_csv = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/{}.csv"

engage_data = dt.Frame()

for count, dist in enumerate(districts_data["district_id"].unique(), start=1):
    # Read one district's file and tag every row with the district it belongs to
    df = dt.fread(engagement_csv.format(dist))
    df["district"] = dist
    engage_data.rbind(df)

    # Progress report every 50 files
    if count % 50 == 0:
        print(count, "datasets loaded")

# Continue in pandas for the rest of the analysis
engage_data = engage_data.to_pandas()
engage_data.head()

# Products data

In [None]:
# Print a summary of the products table (columns, non-null counts, dtypes)
products_data.info()

In [None]:
# Give the product id column the same name it has in the engagement data
products_data = products_data.rename(columns={"LP ID": "lp_id"})

## Univariate analysis

In [None]:
# Product attribute columns used for the univariate analysis of the products table
prod_cols = ["Sector(s)", "Primary Essential Function", "Provider/Company Name"]

### Number of unique values for each feature

In [None]:
# Number of distinct (non-null) values for each product attribute
unique = [products_data[col].nunique() for col in prod_cols]

fig, ax = plt.subplots(figsize=(15, 6))
bar = sns.barplot(x=unique, y=prod_cols, palette="Blues", ax=ax)
# Write the count next to each bar
ax.bar_label(bar.containers[0], padding=5)
plt.show()

In [None]:
# One horizontal bar chart per product attribute, showing its 10 most frequent values.
# The number of panels follows prod_cols instead of being hard-coded, and the figure
# size is set once on the figure rather than on every plot call.
fig, ax = plt.subplots(nrows=len(prod_cols), ncols=1, figsize=(10, 30), squeeze=False)
ax = ax.ravel()  # always a flat array of Axes, even when prod_cols has a single entry

for idx, c in enumerate(prod_cols):
    # Count rows per value, most frequent first, and keep the first 10
    top_values = products_data.groupby(c)[c].count().sort_values(ascending=False)[:10]
    top_values.plot.barh(ax=ax[idx])
    # Title each panel so it can be read on its own
    ax[idx].set_title("Top 10 values of {}".format(c))

# Engagement data in relation with Products data

In this step, we try to understand the digital connectivity trend across the top 10 products.

In [None]:
# Divide the engagement index by 1000 and round to 2 decimals.
# NOTE: the column is overwritten in place, so re-running this cell rescales it again.
engage_data["engagement_index"] = engage_data["engagement_index"].div(1000).round(2)

# Keep only engagement rows whose product id appears in the products table
known_product = engage_data["lp_id"].isin(products_data["lp_id"].unique())
engage_data_cop = engage_data[known_product].copy()

In [None]:
# Step 1: per product and day, average the engagement index over all rows for that day
# (mean of page-load events across districts on that day)
daily_mean = (
    engage_data_cop
    .groupby(["lp_id", "time"])["engagement_index"]
    .mean()
    .sort_values()
    .reset_index()
)

# Step 2: per product, average the daily means over days and keep the 10 largest
top10_mean = (
    daily_mean
    .groupby(["lp_id"])["engagement_index"]
    .mean()
    .round(2)
    .sort_values()
    .tail(10)
    .reset_index()
)

# Step 3: join the products table on lp_id to bring in the product name
d = pd.merge(left=top10_mean, right=products_data, left_on="lp_id", right_on="lp_id")

# Step 4: horizontal bar chart of the top 10 products
plt.figure(figsize=(20, 10))
sns.barplot(data=d, x="engagement_index", y="Product Name", palette="Blues")
plt.title("Mean daily page-load events of Top 10 tools")
plt.xlabel("Mean daily page-load events per one student")
plt.show()

In [None]:
# Attach every engagement row of the top-10 products to their product metadata
engage_top_prods = pd.merge(left=d.drop("engagement_index", axis=1),
                            right=engage_data_cop,
                            left_on="lp_id",
                            right_on="lp_id").drop("lp_id", axis=1)

# Add month column for later aggregation by month.
# pd.to_datetime makes this work whether "time" arrived as datetimes or as date strings;
# the "time" column itself is left untouched.
engage_top_prods["month"] = pd.to_datetime(engage_top_prods["time"]).dt.month

# Mean page-load events on a day = Mean of page-load events across districts
mean_engage_each_day = (
    engage_top_prods
    .groupby(["Product Name", "month", "time"])["engagement_index"]
    .mean()
    .reset_index()
)

# Mean daily page-load events of a month = Mean of page-load events across days of that month
mean_engage_each_month = (
    mean_engage_each_day
    .groupby(["Product Name", "month"])["engagement_index"]
    .mean()
    .round(2)
    .reset_index()
)

# Plot one line per product
plt.figure(figsize=(20, 10))
sns.lineplot(data=mean_engage_each_month,
             x="month", y="engagement_index", hue="Product Name")
plt.title("Mean daily page-load events across months of Top 10 tools")
plt.legend(bbox_to_anchor=(1.05, 1), title="Product")
plt.ylabel("Mean daily page-load events per one student")
plt.xlabel("Month")
plt.show()

For the top 10 products, the mean number of daily page-load events peaks in March-April and September-October. It drops for all of them during the summer.

# Districts

In [None]:
# Print a summary of the districts table (columns, non-null counts, dtypes)
districts_data.info()

### Univariate analysis

In [None]:
cols = ["locale", "pct_black/hispanic", "pct_free/reduced", "county_connections_ratio", "pp_total_raw"]

# One panel per district feature, stacked vertically
fig, ax = plt.subplots(nrows=len(cols), ncols=1, figsize=(15, 25))

for axis, col in zip(ax, cols):
    # Number of districts per category, least common category first
    counts = districts_data[col].value_counts().sort_values(ascending=True)
    sns.barplot(x=counts.values, y=counts.index, palette="Blues", ax=axis)
    axis.set_title(col)

fig.tight_layout(pad=3)

As most of the districts fall into one category of county_connections_ratio, this feature will not affect the digital connectivity trend found in this dataset. We may safely disregard this feature.

We also notice significant imbalance in the proportion of locale and percentage of black/hispanic. This leads to a prediction that these two factors may not affect the digital connectivity trend as much as the percentage of free/reduced and the per-pupil expenditure.

Looking into the pp_total_raw feature, we may re-categorize the categories into 3 groups of low, middle and high per-pupil expenditure. We will use KMeans Clustering for this task.

### Data encoding & grouping

In [None]:
from sklearn.cluster import KMeans

# Drop county_connections_ratio, then keep only districts with no missing values
districts_copy = districts_data.drop(["county_connections_ratio"], axis=1).dropna()
districts_encode = districts_copy.copy()

# ---------

# NOTE(review): this converts the id on districts_data only; districts_encode was
# copied above and keeps its original dtype. Kept as in the original -- confirm intent.
districts_data["district_id"] = districts_data["district_id"].astype("string")
# Integer code per locale category
districts_encode["locale"] = districts_encode["locale"].astype("category").cat.codes

# ---------

encode_cols = ["pp_total_raw"]

for c in encode_cols:
    # Keep the text between the opening "[" and the first "," of each bin label
    # (its lower bound) and make it numeric so clustering works on real numbers
    districts_encode[c] = (
        districts_copy[c]
        .apply(lambda x: x.split(",")[0].split("[")[1])
        .astype(float)
    )

# ---------

for c in encode_cols:
    # n_init is pinned so the result does not depend on the installed scikit-learn default
    kmeans = KMeans(n_clusters=3, random_state=53, n_init=10).fit(districts_encode[[c]])

    # KMeans label ids are arbitrary. Renumber them by ascending cluster centre so that
    # 0 = low, 1 = middle, 2 = high and the codes can be used as an ordinal scale
    # (e.g. in correlations).
    rank_of_label = kmeans.cluster_centers_.ravel().argsort().argsort()
    districts_encode[c] = rank_of_label[kmeans.labels_]

# ---------

# Ordinal code for each percentage bin
pct_encode_map = {
    "[0, 0.2[": 0,
    "[0.2, 0.4[": 1,
    "[0.4, 0.6[": 2,
    "[0.6, 0.8[": 3,
    "[0.8, 1[": 4
}

# Map the two percentage columns explicitly so they end up with a numeric dtype
# (a label missing from pct_encode_map becomes NaN)
pct_cols = ["pct_black/hispanic", "pct_free/reduced"]

for c in pct_cols:
    districts_encode[c] = districts_encode[c].map(pct_encode_map)

districts_encode.head()

### Correlation between per-pupil total expenditure and other features

Let's examine whether there is any connection between the demographic features of a district and its per-pupil expenditure.

In [None]:
# Correlate only the numeric encoded features: the district id is dropped by name and
# non-numeric columns (e.g. "state") are excluded, because DataFrame.corr() raises on
# string columns in recent pandas versions.
numeric_features = districts_encode.drop(columns="district_id").select_dtypes("number")
districts_corr = numeric_features.corr()["pp_total_raw"].round(2)

plt.figure(figsize=(15, 8))
bar = sns.barplot(x=districts_corr.index, y=districts_corr.values, palette="Blues")
# Print the correlation value on top of each bar
plt.bar_label(bar.containers[0], padding=5)
plt.title("Correlation between Per-pupil total expenditure and other features")
plt.show()

The correlation between per-pupil expenditure and each of other background features of the districts, including type of locale, percentage of black/hispanic, and percentage of students qualified for free/reduced lunch, is low.

# Engagement data in relation with district data

In this step, we try to understand whether demographic features affect digital engagement in 2020.

## Compile data

In [None]:
# Join each engagement row with the encoded features of its district.
# The default inner join keeps only districts present in districts_encode.
engage_dist_data = pd.merge(left=engage_data,
                            right=districts_encode,
                            left_on="district",
                            right_on="district_id").drop("district_id", axis=1)

# Month number for monthly aggregation. pd.to_datetime makes this work whether
# "time" arrived as datetimes or as date strings; "time" itself is left untouched.
engage_dist_data["month"] = pd.to_datetime(engage_dist_data["time"]).dt.month

# Rows without an engagement index are excluded from the analysis
engage_dist_cop = engage_dist_data.dropna(subset=["engagement_index"])

engage_dist_cop.head()

## Engagement Index and Percentage of Access by State

In [None]:
# One entry per metric: which column to plot, how to combine products within a
# district-day ("method"), and the texts for the figure
cols = [
    {
        "col_name": "engagement_index",
        "title": "Mean daily page-load events across states",
        "ylabel": "Daily page-load events per one student",
        "method": "mean",
    },
    {
        "col_name": "pct_access",
        "title": "Mean daily percentage of students have at least one-page load event across states",
        # pct_access is a percentage, not a head count, so the axis says so
        "ylabel": "Daily percentage of students with at least one page-load event",
        "method": "max",
    }
]

for c in cols:
    col = c["col_name"]

    # Combine products within each district and day:
    # mean for engagement_index, max for pct_access
    engage_state = (
        engage_dist_cop
        .groupby(["state", "month", "time", "district"])[col]
        .agg(c["method"])
        .reset_index()
    )

    # Take mean across districts
    engage_state = engage_state.groupby(["state", "month", "time"])[col].mean().reset_index()

    # Take mean across dates in a month
    engage_state = engage_state.groupby(["state", "month"])[col].mean().round(2).reset_index()

    # Plot one line per state
    plt.figure(figsize=(15, 6))
    sns.lineplot(data=engage_state, x="month", y=col, hue="state")
    plt.title(c["title"])
    plt.legend(bbox_to_anchor=(1.05, 1))
    plt.ylabel(c["ylabel"], fontdict={"fontsize": 12})
    plt.show()

We continue to see that digital engagement drops during the summer.

This time it is difficult to see when mean daily page-load events peak. Its "peak" ranges include Feb-May, and Sept-Oct.

This is much clearer for the average percentage of students that have at least one page-load event. The percentage peaks in April and October.

## Correlation between demographic features and digital activity

In [None]:
# Collapse to one value per district: mean over products per day, then over days
# per month, then over months
engage_idx_across_prod = engage_dist_cop.groupby(["district", "month", "time"])[["engagement_index"]].mean().reset_index()
engage_idx_across_prod = engage_idx_across_prod.groupby(["district", "month"])[["engagement_index"]].mean().reset_index()
engage_idx_across_prod = engage_idx_across_prod.groupby(["district"])[["engagement_index"]].mean().reset_index()

# Attach the encoded district features
engage_idx_annual_full = pd.merge(left=engage_idx_across_prod,
                                  right=districts_encode,
                                  left_on="district",
                                  right_on="district_id")

# Correlate numeric features only. The two district identifiers are dropped by name
# (an id is not a feature) and non-numeric columns such as "state" are excluded,
# because DataFrame.corr() raises on string columns in recent pandas versions.
engage_features = (
    engage_idx_annual_full
    .drop(columns=["district", "district_id"])
    .select_dtypes("number")
)
engage_corr = engage_features.corr()["engagement_index"].round(2)

plt.figure(figsize=(15, 8))
bar = sns.barplot(x=engage_corr.index, y=engage_corr.values)
plt.title("Engagement index correlation with other features")
# Print the correlation value on top of each bar
plt.bar_label(bar.containers[0], padding=5)
plt.show()

In [None]:
# Collapse to one value per district: max over products per day, then mean over days
# per month, then mean over months
pct_access_across_prod = engage_dist_cop.groupby(["district", "month", "time"])[["pct_access"]].max().reset_index()
pct_access_across_prod = pct_access_across_prod.groupby(["district", "month"])[["pct_access"]].mean().reset_index()
pct_access_across_prod = pct_access_across_prod.groupby(["district"])[["pct_access"]].mean().reset_index()

# Attach the encoded district features
pct_access_annual_full = pd.merge(left=pct_access_across_prod,
                                  right=districts_encode,
                                  left_on="district",
                                  right_on="district_id")

# Correlate numeric features only. The two district identifiers are dropped by name
# (an id is not a feature) and non-numeric columns such as "state" are excluded,
# because DataFrame.corr() raises on string columns in recent pandas versions.
pct_access_features = (
    pct_access_annual_full
    .drop(columns=["district", "district_id"])
    .select_dtypes("number")
)
pct_access_corr = pct_access_features.corr()["pct_access"].round(2)

plt.figure(figsize=(15, 8))
bar = sns.barplot(x=pct_access_corr.index, y=pct_access_corr.values)
plt.title("Percentage of access correlation with other features")
# Print the correlation value on top of each bar
plt.bar_label(bar.containers[0], padding=5)
plt.show()

Demographic features of districts, including type of locale, percentage of black/hispanic students, and percentage of students eligible for free/reduced lunch, have almost nothing to do with digital engagement in the districts. We might say that the per-pupil expenditure has a low correlation with digital engagement.

# Further research

1. If the factors mentioned above do not affect digital engagement that much, then which factor(s) do?
2. While each of these factors **alone** does not correlate with digital engagement indicators such as engagement index and percentage of access, is it possible that the interaction between these features does?