In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime

### Load and Review Data
Ref Code: https://www.kaggle.com/ruchi798/covid-19-impact-on-digital-learning-eda-w-b

In [None]:
# Read the product metadata table and preview its first rows.
products_csv = "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
products_df = pd.read_csv(products_csv)
products_df.head()

In [None]:
# Read the district metadata table and preview its first rows.
districts_csv = "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
districts_df = pd.read_csv(districts_csv)
districts_df.head()

In [None]:
# Directory of engagement CSVs; each file's name (without extension) is used
# as the district id of the rows it contains.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# Sorted so the concatenated row order is reproducible across runs/filesystems.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    raise FileNotFoundError(f"No engagement CSV files found under {path!r}")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Derive the id from the file name itself instead of a fixed position in a
    # "/"-split path, so it keeps working if the directory depth or the OS
    # path separator changes.
    district_id = os.path.splitext(os.path.basename(filename))[0]
    df["district_id"] = district_id
    li.append(df)

# ignore_index=True yields the same fresh 0..n-1 index as concat + reset_index(drop=True).
engagement_df = pd.concat(li, ignore_index=True)
# Spot-check: show the rows of a single product id.
engagement_df[engagement_df.lp_id==92844]

### Check Missing Values:

In [None]:
# One missing-value bar chart per raw table; only the frame, colour and title vary.
missing_value_charts = (
    (products_df, '#7209b7', 'product_df missing value'),
    (districts_df, '#f72585', 'districts_df missing value'),
    (engagement_df, '#4895ef', 'engagement_df missing value'),
)
for frame, bar_color, chart_title in missing_value_charts:
    msno.bar(frame, color=bar_color, sort="ascending", figsize=(10,5), fontsize=12)
    plt.title(chart_title)
    plt.show()

### Distribution of mean engagement index and pct_access per LP ID
From the calculation below, 98% of LP IDs have an average `pct_access` of at most 0.5, and 91% of LP IDs have an average engagement index of at most 20.

In [None]:
# Per-product (lp_id) mean of the two engagement metrics, highest pct_access first.
# Only the numeric metric columns are averaged: engagement_df also carries text
# columns (district_id is assigned from file names), and averaging those raises
# TypeError on pandas >= 2.0 instead of being silently dropped.
mean_smr = (
    engagement_df
    .groupby('lp_id')[['pct_access', 'engagement_index']]
    .mean()
    .reset_index()
    .sort_values(by='pct_access', ascending=False)
)
# Item assignment (not attribute assignment) so the column is reliably replaced.
mean_smr['lp_id'] = pd.Categorical(mean_smr['lp_id'])

# Share of products at or below each threshold (NaN means compare as False).
low_access_share = (mean_smr['pct_access'] <= 0.5).mean()
low_engagement_share = (mean_smr['engagement_index'] <= 20).mean()

print('Total LP ID:', len(mean_smr))
print('Total LP ID with pct_access <= 0.5:', "{0:.0%}".format(low_access_share))
print('Total LP ID with engagement index <= 20:', "{0:.0%}".format(low_engagement_share))
mean_smr.head(3)

In [None]:
# plotly.express is already imported as `px` in the notebook's import cell, so it
# is not re-imported here.
# Full distributions first, then the same metrics restricted to the bulk of
# products (pct_access <= 0.5, engagement_index <= 20).
histogram_specs = [
    (mean_smr, "pct_access", 'Pct Access Distribution'),
    (mean_smr, "engagement_index", 'engagement index Distribution'),
    (mean_smr[mean_smr.pct_access<=0.5], "pct_access", 'Pct Access (<=0.5) Distribution'),
    (mean_smr[mean_smr.engagement_index<=20], "engagement_index", 'engagement index (<=20) Distribution'),
]

for hist_data, metric, hist_title in histogram_specs:
    fig = px.histogram(hist_data, x=metric, title=hist_title)
    fig.show()

### Dataset Exploration

In [None]:
def aggregation_by_group(data, group_var, id_var, agg='count', top = 10):
    """
    Aggregate one column per group and return the groups ranked by that aggregate.

    :param data: input DataFrame (never modified)
    :param group_var: a single column name, or a list of column names, to group by.
        For a single column, missing group keys are replaced by the string 'NULL'
        so they form their own group; for a list, rows with a missing key are
        dropped by ``groupby`` (pandas default).
    :param id_var: name of the column to aggregate
    :param agg: aggregation passed to ``.agg`` (e.g. 'count', 'mean', 'sum');
        the aggregated column is renamed to this value
    :param top: number of leading rows to return; 0 (or None) returns every group
    :return: DataFrame with the group column(s) followed by the aggregate column,
        sorted by the aggregate in descending order, with a fresh 0..n-1 index
    """
    if isinstance(group_var, list):
        subset = data[group_var + [id_var]]
    else:
        # Copy only the two columns that are needed (not the whole frame) before
        # filling missing keys, so the caller's data is untouched and large
        # inputs are not duplicated in memory.
        subset = data[[id_var, group_var]].copy()
        subset[group_var] = subset[group_var].fillna('NULL')

    summary = (
        subset
        .groupby(group_var)
        .agg(agg)
        .reset_index()
        .sort_values(by=id_var, ascending=False)
        .rename(columns={id_var: agg})
        .reset_index(drop=True)
    )

    if not top:
        return summary
    return summary.head(top)
    
# Hex colour palettes for the charts below (two with ten colours, two with five).
pal1 = [
    "#fec5bb", "#fcd5ce", "#fae1dd", "#f8edeb", "#e8e8e4",
    "#d8e2dc", "#ece4db", "#ffe5d9", "#ffd7ba", "#fec89a",
]
pal2 = [
    "#ffcbf2", "#f3c4fb", "#ecbcfd", "#e5b3fe", "#e2afff",
    "#deaaff", "#d8bbff", "#d0d1ff", "#c8e7ff", "#c0fdff",
]
pal3 = ["#d6d2d2", "#f1e4f3", "#f4bbd3", "#f686bd", "#fe5d9f"]
pal4 = ["#a09abc", "#b6a6ca", "#d5cfe1", "#e1dee9", "#d4bebe"]

In [None]:
# Districts per state, all states kept (top=0); districts without a state are
# counted under the 'NULL' label that aggregation_by_group assigns.
state_smr = aggregation_by_group(
    districts_df,
    group_var='state',
    id_var='district_id',
    agg='count',
    top=0,
)
plt.figure(figsize=(16, 10))
ax = sns.barplot(data=state_smr, x="count", y="state", palette='YlOrBr')

In [None]:
# Districts per locale type (all locales kept), shown as shares of a pie.
locale_smr = aggregation_by_group(
    districts_df,
    group_var='locale',
    id_var='district_id',
    agg='count',
    top=0,
)
fig = px.pie(locale_smr, names='locale', values='count', title='Population of locale')
fig.show()

In [None]:
# Ten providers with the most products listed in products_df.
provider_smr = aggregation_by_group(
    products_df,
    group_var='Provider/Company Name',
    id_var='LP ID',
    agg='count',
    top=10,
)
ax = sns.barplot(data=provider_smr, x="count", y="Provider/Company Name", palette=pal2)

In [None]:
# Product counts per sector combination (at most the ten largest), as a pie chart.
sector_smr = aggregation_by_group(
    products_df,
    group_var='Sector(s)',
    id_var='LP ID',
    agg='count',
    top=10,
)
fig = px.pie(sector_smr, names='Sector(s)', values='count', title='Population of Sectors')
fig.show()

In [None]:
# Twenty most common primary essential functions among the products.
function_smr = aggregation_by_group(
    products_df,
    group_var='Primary Essential Function',
    id_var='LP ID',
    agg='count',
    top=20,
)
plt.figure(figsize=(16, 10))
ax = sns.barplot(data=function_smr, x="count", y="Primary Essential Function", palette=pal2)

### Merge Data

In [None]:
# Distinct key counts on each side of the joins performed below:
# product ids (products vs. engagement), then district ids (engagement vs. districts).
join_key_columns = (
    products_df['LP ID'],
    engagement_df.lp_id,
    engagement_df['district_id'],
    districts_df['district_id'],
)
for key_column in join_key_columns:
    print(key_column.nunique())

In [None]:
# Attach product metadata to every engagement record. The default inner join
# keeps only records whose lp_id appears in products_df.
products_engagement_data = products_df.merge(
    engagement_df,
    left_on='LP ID',
    right_on='lp_id',
)
products_engagement_data.head()

In [None]:
# Ten products with the highest mean pct_access across all engagement records.
product_pctaccess_smr = aggregation_by_group(
    products_engagement_data,
    group_var='Product Name',
    id_var='pct_access',
    agg='mean',
    top=10,
)
ax = sns.barplot(data=product_pctaccess_smr, x="mean", y="Product Name", palette=pal2)

In [None]:
# Ten products with the highest mean engagement_index across all engagement records.
product_engage_smr = aggregation_by_group(
    products_engagement_data,
    group_var='Product Name',
    id_var='engagement_index',
    agg='mean',
    top=10,
)
ax = sns.barplot(data=product_engage_smr, x="mean", y="Product Name", palette=pal2)

In [None]:
# Lookup from full US state / territory name to its two-letter postal code.
# The codes feed the `state_code` column that is later passed to the
# USA-states choropleth as its location column.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}
# district_id in engagement_df was taken from file names and is therefore text;
# cast it to int before joining (presumably districts_df stores it as an
# integer - confirm). This updates engagement_df in place, as before.
engagement_df["district_id"] = engagement_df["district_id"].astype(str).astype(int)
districts_engagement_data = pd.merge(districts_df, engagement_df, on='district_id')

# Resolve state names to postal codes ignoring letter case and surrounding
# whitespace. An exact-match lookup silently yields NaN for any variant spelling
# (e.g. 'District Of Columbia' vs the 'District of Columbia' key), which would
# drop that state from the maps. The lookup is built once over the distinct
# state values, then applied with a plain dict map; unknown or missing states
# still become NaN.
state_code_lookup = {name.lower(): code for name, code in us_state_abbrev.items()}
state_to_code = {
    state: state_code_lookup[state.strip().lower()]
    for state in districts_engagement_data['state'].dropna().unique()
    if isinstance(state, str) and state.strip().lower() in state_code_lookup
}
districts_engagement_data['state_code'] = districts_engagement_data['state'].map(state_to_code)
districts_engagement_data.head()

In [None]:
# Row count and distinct key counts for both merged frames. The product id
# column has a different name in each frame ('LP ID' vs 'lp_id').
merged_frames = (
    ('products_engagement_data info:', products_engagement_data, 'LP ID'),
    ('districts_engagement_data info:', districts_engagement_data, 'lp_id'),
)
for position, (heading, merged, lp_column) in enumerate(merged_frames):
    if position > 0:
        # Blank line between the two reports.
        print()
    print(heading)
    print("data size: ", len(merged))
    print('len of unique lp id', merged[lp_column].nunique())
    print('len of unique district id', merged['district_id'].nunique())


In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# US federal holidays (observed dates) that fall within calendar year 2020.
cal = calendar()
dr = pd.date_range(start='2020-01-01', end='2020-12-31')
first_day, last_day = dr[0], dr[-1]
holidays = cal.holidays(start=first_day, end=last_day)
holidays

In [None]:
# Number of engagement records (non-null pct_access) per state and day, ordered
# by state then date. Despite the "avg" in the name this frame holds counts;
# the same name is reassigned further down with per-day means.
pct_accs_avg_state_smr = aggregation_by_group(
    districts_engagement_data,
    group_var=['state', 'time'],
    id_var='pct_access',
    agg='count',
    top=0,
).sort_values(by=['state', 'time'])

# Vectorised date parsing and business-day flag (replaces per-row strptime/lambda).
pct_accs_avg_state_smr['time'] = pd.to_datetime(pct_accs_avg_state_smr['time'], format='%Y-%m-%d')
# 'weekday' is True for Monday-Friday dates that are not US federal holidays.
pct_accs_avg_state_smr['weekday'] = (
    (pct_accs_avg_state_smr['time'].dt.weekday < 5)
    & ~pct_accs_avg_state_smr['time'].isin(holidays)
)

In [None]:
# Plot only the rows flagged as business days; 'weekday' is already a boolean mask.
business_days_by_state = pct_accs_avg_state_smr[pct_accs_avg_state_smr['weekday']]
fig = px.line(business_days_by_state, x="time", y="count", color='state')
fig.show()

In [None]:
# Number of engagement records (non-null pct_access) per locale and day,
# ordered by locale then date. Despite the "avg" in the name, these are counts.
pct_accs_avg_locale_smr = aggregation_by_group(
    districts_engagement_data,
    group_var=['locale', 'time'],
    id_var='pct_access',
    agg='count',
    top=0,
).sort_values(by=['locale', 'time'])

# Vectorised date parsing and business-day flag (replaces per-row strptime/lambda).
pct_accs_avg_locale_smr['time'] = pd.to_datetime(pct_accs_avg_locale_smr['time'], format='%Y-%m-%d')
# 'weekday' is True for Monday-Friday dates that are not US federal holidays.
pct_accs_avg_locale_smr['weekday'] = (
    (pct_accs_avg_locale_smr['time'].dt.weekday < 5)
    & ~pct_accs_avg_locale_smr['time'].isin(holidays)
)

# One line per locale, business days only.
fig = px.line(
    pct_accs_avg_locale_smr[pct_accs_avg_locale_smr['weekday']],
    x="time",
    y="count",
    color='locale',
)
fig.show()

### Map Plot: 
https://plotly.com/python/choropleth-maps/

In [None]:
import plotly.express as px
def dynamic_choro_map(data, iso_code, country, feature, date_var, title, animation = True):
    """
    Draw a choropleth of US states coloured by ``feature`` and display it.

    :param data: DataFrame containing the columns named by the arguments below
    :param iso_code: name of the column with the location codes; they are
        interpreted with ``locationmode="USA-states"`` (two-letter state codes)
    :param country: name of the column shown as the hover label
    :param feature: name of the numeric column that drives the colour
    :param date_var: name of the date column; rows are sorted by it and, when
        animating, it provides one frame per distinct day
    :param title: title text of the figure
    :param animation: if truthy, animate the map over the dates; otherwise draw
        a single static map
    :return: None - the figure is shown via ``fig.show()``
    """
    df = data[[iso_code, feature, date_var, country]].sort_values(date_var, ascending=True)

    # The colour scale tops out at the 95th percentile of the feature. The value
    # is kept as a float: truncating with int() turned any feature whose 95th
    # percentile is below 1 (e.g. a fractional rate) into a degenerate [0, 0]
    # range, so every state got the same colour.
    upper = df[feature].quantile(0.95)
    if pd.isna(upper) or upper <= 0:
        upper = df[feature].max()
    # Fall back to plotly's automatic range if no positive upper bound exists.
    range_color = [0, float(upper)] if pd.notna(upper) and upper > 0 else None

    # Arguments shared by the animated and the static map.
    choropleth_kwargs = dict(
        locations=iso_code,              # column with the state codes
        color=feature,                   # column mapped to colour
        hover_name=country,              # column used as hover label
        color_continuous_scale='viridis',
        range_color=range_color,
        locationmode="USA-states",
        scope="usa",
        title='<span style="font-size:36px; font-family:Times New Roman">'+title,
    )

    if animation:
        # Frames are keyed by an ISO date string; to_datetime also accepts a
        # column that is still text.
        df = df.assign(date=pd.to_datetime(df[date_var]).dt.strftime('%Y-%m-%d'))
        choropleth_kwargs['animation_frame'] = 'date'

    fig = px.choropleth(df, **choropleth_kwargs)
    fig.show()

In [None]:
# Mean pct_access per state and day (state_code is carried along as a group
# key), ordered by state then date. This reassigns a name used earlier for counts.
pct_accs_avg_state_smr = aggregation_by_group(
    districts_engagement_data,
    group_var=['state_code', 'state', 'time'],
    id_var='pct_access',
    agg='mean',
    top=0,
).sort_values(by=['state', 'time'])

# Vectorised date parsing and business-day flag (replaces per-row strptime/lambda).
pct_accs_avg_state_smr['time'] = pd.to_datetime(pct_accs_avg_state_smr['time'], format='%Y-%m-%d')
# 'weekday' is True for Monday-Friday dates that are not US federal holidays.
pct_accs_avg_state_smr['weekday'] = (
    (pct_accs_avg_state_smr['time'].dt.weekday < 5)
    & ~pct_accs_avg_state_smr['time'].isin(holidays)
)
pct_accs_avg_state_smr.head(3)

In [None]:
# Wide layout for the stack plot: one row per state, one column per date,
# each cell holding that state's mean pct_access for the day.
access_wide = pct_accs_avg_state_smr.pivot(index='time', columns='state', values='mean').T
# Only the first eight states (in pivot order) are drawn.
access_wide = access_wide.iloc[:8]
x = access_wide.columns
y = access_wide.values.tolist()
labels = access_wide.index.tolist()

# Custom palette; not passed to stackplot below, which uses matplotlib's default colours.
color = ['#FFE4E1','#EED5D2','#CDB7B5','#8B7D7B','#FFE4B5','#FFDEAD','#EECFA1','#CDB38B','#8B795E']

fig, ax = plt.subplots(figsize=(20, 14))
# baseline="sym" centres the stacked areas symmetrically around zero.
ax.stackplot(x, y, labels=labels, baseline="sym")
# Legend placed outside the axes, to the right.
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.show()

In [None]:
# Animated map of mean pct_access per state, business days only.
dynamic_choro_map(
    data=pct_accs_avg_state_smr[pct_accs_avg_state_smr['weekday']],
    iso_code='state_code',
    country='state',
    feature='mean',
    date_var='time',
    title='Average Access Rate Change over Time by States',
    animation=True,
)

In [None]:
# Mean engagement_index per state and day (state_code is carried along as a
# group key), ordered by state then date.
engage_avg_state_smr = aggregation_by_group(
    districts_engagement_data,
    group_var=['state_code', 'state', 'time'],
    id_var='engagement_index',
    agg='mean',
    top=0,
).sort_values(by=['state', 'time'])

# Vectorised date parsing and business-day flag (replaces per-row strptime/lambda).
engage_avg_state_smr['time'] = pd.to_datetime(engage_avg_state_smr['time'], format='%Y-%m-%d')
# 'weekday' is True for Monday-Friday dates that are not US federal holidays.
engage_avg_state_smr['weekday'] = (
    (engage_avg_state_smr['time'].dt.weekday < 5)
    & ~engage_avg_state_smr['time'].isin(holidays)
)
engage_avg_state_smr.head(3)

In [None]:
# Animated map of mean engagement_index per state, business days only.
dynamic_choro_map(
    data=engage_avg_state_smr[engage_avg_state_smr['weekday']],
    iso_code='state_code',
    country='state',
    feature='mean',
    date_var='time',
    title='Average Engagement Index Change over Time by States',
    animation=True,
)