In [None]:
!pip install pdpipe

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt
import pdpipe as pdp
from typing import Tuple, List, Dict
import glob

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

import category_encoders as ce

In [None]:
# Environment switch passed to get_data_file_path(): True selects the Kaggle
# competition input paths ('../input/...'), False selects the local 'data/' paths.
in_kaggle = True


def get_data_file_path(is_in_kaggle: bool) -> Tuple[str, str, str]:
    '''
    Returns the locations of the three data sources used by this notebook.

    Parameters:
        is_in_kaggle: True to use the Kaggle competition input paths,
            False to use the paths of a local run (relative 'data/' folder).

    Returns:
        A tuple (districts_info_path, products_info_path, engagements_path).
        The first two are CSV file paths; the third is the directory holding
        the engagement CSV files. Note that the Kaggle variant of the directory
        path ends with a slash while the local variant does not.
    '''
    if is_in_kaggle:
        # running in Kaggle, inside the competition
        districts_info_path = '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
        products_info_path = '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
        engagements_path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'
    else:
        # running locally
        districts_info_path = 'data/districts_info.csv'
        products_info_path = 'data/products_info.csv'
        engagements_path = 'data/engagement_data'

    return districts_info_path, products_info_path, engagements_path

# set the size of the geo bubble
def set_size(value):
    '''
    Takes the numeric value of a parameter to visualize on a map (Plotly Geo-Scatter plot)
    Returns a number to indicate the size of the bubble for the location whose numeric
    attribute value was supplied as an input

    The size grows logarithmically with the value: log(1 + value/100).
    Any negative input is clamped to a minimal size of 0.001. Clamping on the input
    (rather than on the logarithm) also covers values below -100, for which the
    logarithm is undefined and would otherwise come back as NaN.
    A NaN input is passed through as NaN.
    '''
    if value < 0:
        return 0.001
    # log1p(x) == log(1 + x), but stays accurate for small x
    return np.log1p(value / 100)

# Reading and Pre-Processing the Data

In [None]:
# Resolve the three data locations for the environment selected by `in_kaggle`.
districts_info_path, products_info_path, engagements_path = get_data_file_path(in_kaggle)

We are going to read the districts data first. For the purpose of the current analysis, we are not going to preprocess this data file any further.

In [None]:
# Load the district information file as-is (no further preprocessing is applied here).
districts_df = pd.read_csv(districts_info_path)
# Preview the first rows of the loaded frame.
districts_df.head()

Now we are going to read the data about e-learning software products. As a part of it, we will have to rename one of the columns in this data set (from *'LP ID'* to *'lp_id'*) for ease of merging with other datasets needed in this analysis, down the road.

For the purpose of the current analysis, we are not going to preprocess this data file any further.

In [None]:
# Load the product information and, in the same expression, rename its key
# column from 'LP ID' to 'lp_id' so it can later be merged on 'lp_id'.
products_df = pd.read_csv(products_info_path).rename(columns={'LP ID': 'lp_id'})
products_df.head()

Finally, we embark on loading the engagement data. We will combine the individual school district files into a single dataframe on the fly and add a new feature (*district_id*), taken from each file name, to the combined engagement dataframe.

In [None]:
# read engagement data files (one CSV file per school district)
# - trailing separators are stripped so the pattern never contains '//'
# - glob.glob() returns matches in arbitrary order, so the list is sorted to
#   make the row order of the combined dataframe reproducible between runs
all_engagement_files = sorted(glob.glob(engagements_path.rstrip("/\\") + "/*.csv"))

if not all_engagement_files:
    # fail early with a clear message; pd.concat() on an empty list would
    # otherwise raise an unhelpful "No objects to concatenate" error
    raise FileNotFoundError(f"No engagement CSV files found under {engagements_path!r}")

li = []

for filename in all_engagement_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # add district_id from the data file name: take the last path component
    # (backslashes are normalised first) and keep the part before the first dot.
    # The id is stored as a string at this point.
    df["district_id"] = filename.replace("\\", "/").split("/")[-1].split(".")[0]
    li.append(df)

# stack all per-district frames into one frame with a fresh 0..n-1 index
engagements_df = pd.concat(li, axis=0, ignore_index=True)

engagements_df.head()

In [None]:
# missing data: engagements_df

# Build the null mask once and derive both statistics from it.
null_mask = engagements_df.isnull()
null_counts = null_mask.sum()

# 'Total' is the number of missing values per column; 'Percent' is that count
# divided by the number of rows (a fraction between 0 and 1, not multiplied by 100).
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

As we can see, a tiny fraction of the engagement records have no *lp_id* recorded. We will have to drop such observations since there is no way to map them to any software product listed in products_df.

For *pct_access* and *engagement_index*, we can interpret 'NaN' values as 0.00, based on the definition of the respective attributes.

**Note:** For convenience, below is a refresher on how *pct_access* and *engagement_index* are defined across this project

- *pct_access* - Percentage of students in the district who have at least one page-load event of a given product and on a given day
- *engagement_index* - Total page-load events per one thousand students of a given product and on a given day


In [None]:
# Keep only the rows that carry a product id, then replace every remaining
# missing value in the frame with 0.0.
has_product_id = engagements_df['lp_id'].notnull()
engagements_df = engagements_df.loc[has_product_id].fillna(0.0)

After handling missing data, we will cast a couple of columns in the unified engagements dataframe (*lp_id*, *district_id*) to int. It will be helpful down the road as we are going to merge engagements data with district and product information.

**Note:** We do not convert *time* attribute to *datetime* yet as its string representation will be useful in building the animated geoscatter plots (see below).

In [None]:
# cast lp_id and district_id to int, to enable merging with the products and districts info down the road
for key_column in ("lp_id", "district_id"):
    engagements_df[key_column] = engagements_df[key_column].astype(int)

# 'time' is deliberately not converted with pd.to_datetime at this point.
engagements_df.tail()

As a final step, we are ready to combine engagement, district, and product information into a single dataframe. Such a dataframe will then be used as a foundation for further analytical and EDA activities.

In [None]:
# merge districts and products
# Two successive merges with pandas' default join type: district information
# is attached via 'district_id', then product information via 'lp_id'.
result_df = (
    engagements_df
    .merge(districts_df, on="district_id")
    .merge(products_df, on="lp_id")
)

In [None]:
# Preview the first rows of the merged engagement/district/product frame.
result_df.head()

The important notes about the data are listed below

- the observations for engagement are provided for the period of time from Jan 1, 2020 through Dec 31, 2021 inclusive
- only a fraction of the US states is represented in the dataset provided for this project (this is assumed to be a conscious decision of the contest organizers)

# Digital Learning Platform Patterns

We are going to focus on how Covid-19 and the related lockdown measures impacted the use of *Digital Learning Platforms* across the selected school districts represented in the datasets for this project.

## Let's Engage Despite the Covid-19!

As a first step, we will create a separate dataframe where we filter the data for *Digital Learning Platforms* only. After that, we will aggregate the data by observation date (*time* attribute) and US state (*state* attribute).

In [None]:
# Keep only the records of products whose primary essential function is
# 'LC - Digital Learning Platforms'.
agg_digi_learn_df = result_df[result_df["Primary Essential Function"] == 'LC - Digital Learning Platforms']

# Sum the engagement index per (state, time) pair. With as_index=False the
# grouping keys are already returned as regular columns, so no reset_index()
# is applied afterwards - it would only add a meaningless 'index' column.
agg_engagement_data = agg_digi_learn_df.groupby(["state", "time"], as_index=False)["engagement_index"].sum()
agg_engagement_data.head(10)

In [None]:
# Lookup table from full names of US states, the District of Columbia and US
# territories to their two-letter abbreviations. It is used below to replace
# the values of the 'state' column.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# Two-stage pdpipe pipeline:
#   1. apply set_size to 'engagement_index' and store the result in a new
#      'size' column (drop=False keeps the source column)
#   2. map the values of 'state' through the us_state_abbrev lookup table
pipeline = pdp.PdPipeline([
    pdp.ApplyByCols('engagement_index', set_size, 'size', drop=False),
    pdp.MapColVals('state', us_state_abbrev),
])

# Run the pipeline, replace any remaining missing values with 0 and order the
# rows by ascending 'time'.
agg_engagement_data = (
    pipeline.apply(agg_engagement_data)
    .fillna(0)
    .sort_values(by='time', ascending=True)
)
agg_engagement_data.tail()

Now we are ready to unlock the power of visualization with the animated geo scatter plot

In [None]:
# Animated bubble map of the aggregated engagement index: one bubble per state,
# one animation frame per distinct 'time' value.
fig = px.scatter_geo(
    agg_engagement_data,
    title='Engagement Index: LC - Digital Learning Platforms',
    # where the bubbles go
    locations="state",
    locationmode='USA-states',
    scope="usa",
    projection="albers usa",
    # how the bubbles look
    size='size',
    color="engagement_index",
    color_continuous_scale="portland",
    range_color=[0, 100000],
    hover_name="state",
    # what drives the animation
    animation_frame="time",
)

fig.show()