# Covid Digital Learning Data Exploration
_By Nick Brooks, September 2021_

In [None]:
!pip install watermark
!pip install nicaviz

In [None]:
# (Re)load IPython's autoreload extension and switch it to mode 2, so that
# imported modules are reloaded before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import glob
import itertools
import math
import re
import time
from pathlib import Path

import matplotlib.pyplot as plt
import nicaviz
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Apply seaborn's "whitegrid" style to the plots produced below.
sns.set_style("whitegrid")

# Load the watermark extension and print its default report
# (presumably timestamp / interpreter / machine details -- see watermark docs).
%load_ext watermark
%watermark 

# Suppress DeprecationWarning and FutureWarning messages so they do not
# clutter the cell outputs.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter(action='ignore', category=FutureWarning)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Wall-clock start time; the last cell subtracts it from the current time to
# report the total notebook runtime.
notebookstart = time.time()
# Report the versions of the modules imported in this session.
%watermark --iversions

In [None]:
# Helpers
# Pattern for a numeric token: optional sign, digits with optional
# ",ddd" thousands groups, optional decimal part and optional exponent.
regex = r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?"
# Compiled once here rather than on every call of extract_fractions.
_NUMBER_PATTERN = re.compile(regex)


def extract_fractions(value):
    """Return every numeric token found in ``value``.

    Parameters
    ----------
    value : object
        Cell value to parse. Only ``str`` inputs are scanned.

    Returns
    -------
    list
        For a string: the matched tokens, in order of appearance, as strings
        (possibly empty when nothing matches). For any non-string input
        (e.g. ``NaN`` or ``None``): a fresh ``[0, 0]`` placeholder.
    """
    if not isinstance(value, str):
        return [0, 0]
    # ``findall`` on a str with a pre-compiled pattern cannot raise, so the
    # former catch-all ``except Exception`` fallback was unreachable.
    return _NUMBER_PATTERN.findall(value)

In [None]:
# Load the district metadata table.
districts_info = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")

# Columns whose raw values are run through extract_fractions below.
ratio_cols = [
    'pct_black/hispanic',
    'pct_free/reduced',
    'county_connections_ratio',
    'pp_total_raw',
]

for ratio_col in ratio_cols:
    # Per-row list of the numeric tokens found in the raw value.
    parsed = districts_info[ratio_col].apply(extract_fractions)
    # Spread each list over two new columns named "<col>_1" and "<col>_2".
    expanded = pd.DataFrame(
        parsed.tolist(),
        columns=[f'{ratio_col}_{i}' for i in (1, 2)],
    )
    districts_info = pd.concat([districts_info, expanded], axis=1)
    # The source column keeps the string form of the parsed list.
    districts_info[ratio_col] = parsed.astype(str)

display(districts_info.head(), districts_info.nica.categorical_describe())

In [None]:
# District columns passed to the count plots below.
categoricals = [
    'state', 'locale',
    'pct_black/hispanic', 'pct_free/reduced',
    'county_connections_ratio', 'pp_total_raw',
]
continuous = []

# nicaviz "countplot" for every listed column, with columns=2.
districts_info.nica.mass_plot(plt_set=categoricals, columns=2, plottype="countplot")

In [None]:
# Load the product metadata and rename its 'LP ID' column to "lp_id",
# the key used when merging with the engagement records later on.
products_info = (
    pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")
    .rename(columns={'LP ID': "lp_id"})
)

display(products_info.head(), products_info.nica.categorical_describe())

In [None]:
# Product columns passed to the count plots below.
categoricals = ['Provider/Company Name', 'Sector(s)', 'Primary Essential Function']
continuous = []

# nicaviz "countplot" for every listed column, with columns=1 and a 15x12 figsize.
products_info.nica.mass_plot(
    plt_set=categoricals,
    columns=1,
    figsize=[15, 12],
    plottype="countplot",
)

In [None]:
# Adapted from https://www.kaggle.com/ruchi798/covid-19-impact-on-digital-learning-eda-w-b
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# glob.glob returns matches in arbitrary order; sorting makes the row order of
# the combined frame identical on every run.
all_files = sorted(glob.glob(path + "/*.csv"))

# Read every CSV and tag its rows with the district id taken from the file
# name (the name without its extension, e.g. ".../1000.csv" -> 1000).
# Path(...).stem is used instead of a positional split on "/" so the id is
# still extracted correctly if `path` is changed to a different depth.
# int() raises ValueError for a non-numeric file name.
engagement_df = pd.concat(
    [
        pd.read_csv(filename, index_col=None, header=0)
        .assign(district_id=int(Path(filename).stem))
        for filename in all_files
    ],
    ignore_index=True,  # fresh 0..n-1 index across all files
)

In [None]:
# Preview the first rows of the combined engagement table.
engagement_df.head()

In [None]:
with nicaviz.timer("Join"):
    # Left joins: every engagement row is kept even when no matching
    # district or product metadata exists.
    df = (
        engagement_df
        .merge(districts_info, on='district_id', how='left')
        .merge(products_info, on="lp_id", how="left")
    )
    # Convert the 'time' column to pandas datetimes.
    df['time'] = pd.to_datetime(df['time'])

# with nicaviz.timer("Reduce Memory"):
#     df, na_list = nicaviz.reduce_mem_usage(df, nan_fill=0)
    
# del engagement_df; del districts_info

In [None]:
# Show the first rows of the merged frame, then nicaviz's categorical summary.
display(df.head(), df.nica.categorical_describe())

In [None]:
# Column groups of the merged frame used by the plotting cells that follow.
categoricals = [
    'Provider/Company Name', 'Sector(s)', "locale", 'Primary Essential Function',
    "pct_black/hispanic", "pct_free/reduced", "county_connections_ratio", "pp_total_raw",
]
continuous = ['pct_access', 'engagement_index']
timevars = ['time']

In [None]:
# Clamp both engagement metrics to the range [0, 95th percentile] in place.
for col in ('pct_access', 'engagement_index'):
    upper = df[col].quantile(.95)
    df[col] = df[col].clip(lower=0, upper=upper)

## Time Series

In [None]:
# nicaviz "ts_resample" plot of the 'time' column with a "1W" resample interval.
with nicaviz.timer("Timeserie Resample Plot"):
    df.nica.mass_plot(plt_set=["time"], plottype="ts_resample", resample=True, resample_interval="1W")

In [None]:
with nicaviz.timer("Timeserie Rolling Plot"):
    # Plot from a 50,000-row random sample of the merged frame. The fixed
    # random_state makes the sample -- and therefore the figure -- identical
    # on every run; without it each execution drew a different sample.
    df.sample(50000, random_state=42).nica.mass_plot(
        plt_set= continuous,
        plottype = "ts_rolling",
        x_var="time",
        rolling=True,
        r=30
    )

## Continuous Variables

In [None]:
# nicaviz "boxplot" for each column listed in `continuous`.
with nicaviz.timer("Boxplots"):
    df.nica.mass_plot(plt_set=continuous, plottype="boxplot")

In [None]:
# Grouping columns used, one at a time, as the `hue` of the boxplots below.
categoricals = [
    'Sector(s)', "locale",
    "pct_black/hispanic", "pct_free/reduced",
    "county_connections_ratio", "pp_total_raw",
]

with nicaviz.timer("Small Hue Boxplots"):
    # One mass_plot call (boxplots of the continuous columns) per hue column.
    for hue_col in categoricals:
        df.nica.mass_plot(plt_set=continuous, hue=hue_col, columns=2, plottype="boxplot")

In [None]:
# Boxplots of the continuous columns with hue "Primary Essential Function",
# in a single-column layout with a 14x18 figsize.
with nicaviz.timer("Primary Essential Function Hue Boxplots"):
    df.nica.mass_plot(
        plt_set=continuous,
        hue="Primary Essential Function",
        columns=1,
        figsize=[14, 18],
        plottype="boxplot",
    )

In [None]:
# Boxplots of the continuous columns with hue "Provider/Company Name",
# in a single-column layout with a 14x80 figsize.
with nicaviz.timer("Provider/Company Name Hue Boxplots"):
    df.nica.mass_plot(
        plt_set=continuous,
        hue="Provider/Company Name",
        columns=1,
        figsize=[14, 80],
        plottype="boxplot",
    )

In [None]:
# Categorical columns handed to pivot_plots for each continuous metric.
categoricals = [
    'Sector(s)', "locale",
    "pct_black/hispanic", "pct_free/reduced",
    "county_connections_ratio", "pp_total_raw",
]

with nicaviz.timer("Pivot Heatmap Plot"):
    # One pivot_plots call per continuous column, aggregating with the mean.
    for cont_col in continuous:
        df.nica.pivot_plots(
            categoricalcols=categoricals,
            valuecol=cont_col,
            aggfunc=np.mean,
            figsize=[15, 35],
            columns=2,
        )

In [None]:
# Minutes elapsed since `notebookstart` was recorded in the setup cell.
elapsed_minutes = (time.time() - notebookstart) / 60
print("Notebook Runtime: %0.2f Minutes" % elapsed_minutes)