# COVID-19 Impact on Digital Learning

## Imports

In [None]:
import os
import sys

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 14})

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pandas_profiling import ProfileReport 

from IPython.display import Image
from wordcloud import WordCloud, STOPWORDS
from IPython.display import Markdown, display, Image, display_html

In [None]:
class DfOverview:
    """Summarise a data frame column by column.

    For every column the overview reports the non-null count, the number and
    percentage of missing values, the number and percentage of distinct
    (non-null) values, and the dtype.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        self.df = df

    def missing_value(self) -> list:
        """Return the number of missing values per column, in column order."""
        return self.df.isna().sum().tolist()

    def unique_values(self) -> list:
        """Return the number of distinct non-null values per column."""
        return [self.df[column].value_counts().shape[0] for column in self.df]

    def percentage(self, list):
        """Format each value as a share of the row count, e.g. ``'12.5%'``.

        The parameter keeps its historical name (it shadows the builtin) so
        that existing keyword callers keep working.
        """
        total_rows = self.df.shape[0]
        if total_rows == 0:
            # An empty frame has no rows to take a share of; avoid 0 / 0.
            return ['0.0%' for _ in list]
        return [str(round((value / total_rows) * 100, 2)) + '%' for value in list]

    def getOverview(self) -> pd.DataFrame:
        """Build the overview table.

        Returns:
            A data frame indexed by column label with the columns ``count``,
            ``none_count``, ``none_percentage``, ``unique_value_count``,
            ``unique_percentage`` and ``dtype``, sorted by ``none_count``
            in ascending order.
        """
        _labels = [column for column in self.df]  # every column, any dtype
        _count = self.df.count().values
        _unique = self.unique_values()
        _missing_values = self.missing_value()

        columns = [
            'label',
            'count',
            'none_count',
            'none_percentage',
            'unique_value_count',
            'unique_percentage',
            'dtype']
        data = zip(
            _labels,
            _count,
            _missing_values,
            self.percentage(_missing_values),
            _unique,
            self.percentage(_unique),
            self.df.dtypes
        )
        new_df = pd.DataFrame(data=data, columns=columns)
        new_df.set_index('label', inplace=True)
        new_df.sort_values(by=["none_count"], inplace=True)
        return new_df

In [None]:
def view_df(df, subset=None, color='#66F582'):
    """Render *df* inline as a styled HTML table.

    The index is moved into a regular column first. Cells of the ``subset``
    columns get in-cell bars (drawn along ``axis=1``) in ``color``; values of
    the ``label`` column are upper-cased; all cells use black text on a white
    background.

    Args:
        df: Data frame to display.
        subset: Column labels that receive bars. ``None`` (the default)
            behaves like an empty list.
        color: Bar colour.
    """
    # A fresh list per call instead of a shared mutable default.
    subset = [] if subset is None else subset
    table = df.reset_index()
    styler = (
        table.style
        .set_table_attributes("style='display:inline'")
        .bar(subset=subset, axis=1, color=color)
        .format({"label": lambda x: x.upper()})
        .set_properties(**{'background-color': 'white', 'color': 'black'})
    )
    display_html(styler._repr_html_(), raw=True)

## Data

In [None]:
# Walk the Kaggle input tree. os.walk is top-down, so the values captured last
# belong to the final directory visited.
# NOTE(review): this presumes the final directory is the one holding the
# per-district engagement CSV files — confirm against the dataset layout.
engagement_dir, engagement_files = None, []
for dirname, _, filenames in os.walk('/kaggle/input'):
    engagement_dir, engagement_files = dirname, filenames

engagement_files = [name for name in engagement_files if name.endswith('.csv')]
if not engagement_files:
    raise FileNotFoundError("No engagement CSV files found under '/kaggle/input'")

files = []

for file in engagement_files:
    # Each file must be read by its own name so every district gets its own rows.
    df = pd.read_csv(os.path.join(engagement_dir, file), index_col=None, header=0)
    # The part of the file name before the first dot is used as the district id.
    district_id = file.split('.')[0]
    df['district_id'] = district_id
    files.append(df)

engagement = pd.concat(files, ignore_index=True)
engagement['time'] = pd.to_datetime(engagement['time'])

districts_info = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')
products_info = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv')

## Preprocessing

### DISTRICTS

The districts file includes information about the characteristics of school districts, including data from NCES (2018-19), FCC (Dec 2018), and Edunomics Lab:

- `district_id`
- `state`
- `locale`
- `pct_black/hispanic` - percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data.
- `pct_free/reduced` - percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data.
- `county_connections_ratio` - ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version).
- `pp_total_raw` - per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project.



In [None]:
# Build a profiling report for the districts table and embed it in the notebook.
profile = ProfileReport( districts_info, title='Pandas profiling report ' , html={'style':{'full_width':True}})
profile.to_notebook_iframe()

In [None]:
# Peek at the first five district rows.
districts_info.head(5)

In [None]:
# Per-column overview of the districts table (district_id excluded),
# rendered with bars on the count columns.
df_overview = DfOverview(districts_info.drop(columns=['district_id']))
df_ = df_overview.getOverview()
view_df(df_, ["count", "none_count", "unique_value_count"])

### Let's handle the missing values first

Dropping Districts with NaN States

In [None]:
# Keep only the districts whose state is known, and renumber the rows.
districts_info = districts_info.dropna(subset=['state']).reset_index(drop=True)

Replace the other missing values with the mode of the same locale

In [None]:
def get_mode(df, state, locale, column):
    """Return the most frequent value of *column* among rows of one locale.

    Only rows whose ``locale`` equals *locale* are considered; *state* is
    accepted but not used for filtering. Returns ``None`` when those rows
    have no mode (for example, when none of them has a value).
    """
    same_locale = df[df.locale == locale]
    modes = same_locale[column].mode()
    return modes[0] if len(modes) > 0 else None

In [None]:
def replace_with_mode(value, state, locale, column):
    """Return *value*, or the locale-level mode of *column* when it is missing.

    A value counts as missing when ``pd.isna`` says so (``NaN``, ``None``,
    ``NaT``). The fallback is looked up in the module-level ``districts_info``
    frame via ``get_mode`` and may itself be ``None``.
    """
    if not pd.isna(value):
        return value
    return get_mode(districts_info, state, locale, column)

In [None]:
# Fill the gaps of each imputed column row by row with the mode of the row's locale.
for _column in ("county_connections_ratio", "pct_free/reduced", "pp_total_raw"):
    districts_info[_column] = districts_info.apply(
        lambda row: replace_with_mode(row[_column], row['state'], row['locale'], _column),
        axis=1,
    )

In [None]:
# Recompute the per-column overview of the districts table after the imputation.
df_overview = DfOverview(districts_info.drop(columns=['district_id']))
df_ = df_overview.getOverview()
view_df(df_, ["count", "none_count", "unique_value_count"])

In [None]:
# List the distinct state values.
print(districts_info['state'].unique())

In [None]:
# List the distinct locale values.
print(districts_info['locale'].unique())

In [None]:
# List the distinct pct_black/hispanic values.
print(districts_info['pct_black/hispanic'].unique())

In [None]:
def insert_average(x):
    """Return the midpoint of an interval string such as ``"[0.2, 0.4["``.

    The surrounding square brackets are stripped and the two comma-separated
    bounds are averaged, so ``"[4000, 6000["`` gives ``5000.0``.

    Values that are not strings are handled so the conversion can be re-run
    safely: ``None`` becomes ``NaN`` and numbers (including ``NaN``) are
    returned as floats unchanged.

    Raises:
        ValueError: If a string does not contain exactly two numeric bounds.
    """
    if x is None:
        return float("nan")
    if not isinstance(x, str):
        return float(x)
    # Strip the brackets as a whole: slicing off a fixed number of characters
    # would eat the first digit of a multi-digit upper bound.
    lower, upper = (float(bound) for bound in x.strip().strip("[]").split(","))
    return (lower + upper) / 2

In [None]:
# Show the distinct pct_black/hispanic values again.
print(districts_info['pct_black/hispanic'].unique())

In [None]:
# Replace each pct_black/hispanic value by the result of insert_average.
districts_info['pct_black/hispanic'] = districts_info['pct_black/hispanic'].apply(insert_average)

In [None]:
# List the distinct county_connections_ratio values.
print(districts_info['county_connections_ratio'].unique())

In [None]:
# Replace each county_connections_ratio value by the result of insert_average.
districts_info['county_connections_ratio'] = districts_info['county_connections_ratio'].apply(insert_average)

In [None]:
# List the distinct county_connections_ratio values again.
print(districts_info['county_connections_ratio'].unique())

In [None]:
# List the distinct pct_free/reduced values.
print(districts_info['pct_free/reduced'].unique())

In [None]:
# Replace each pct_free/reduced value by the result of insert_average.
districts_info['pct_free/reduced'] = districts_info['pct_free/reduced'].apply(insert_average)

In [None]:
# List the distinct pct_free/reduced values again.
print(districts_info['pct_free/reduced'].unique())

In [None]:
# List the distinct pp_total_raw values.
print(districts_info['pp_total_raw'].unique())

In [None]:
# Replace each pp_total_raw value by the result of insert_average.
districts_info['pp_total_raw'] = districts_info['pp_total_raw'].apply(insert_average)

In [None]:
# List the distinct pp_total_raw values again.
print(districts_info['pp_total_raw'].unique())

In [None]:
# Show the first ten district rows.
districts_info.head(10)

### PRODUCTS

The product file includes information about the characteristics of the top 372 products with most users in 2020:

- `LP ID` - the unique identifier of the product.
- `URL`
- `Product Name`
- `Provider/Company Name`
- `Sector(s)` - sector of education where the product is used.
- `Primary Essential Function` - the basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories has multiple sub-categories with which the products were labeled.

In [None]:
# Build a profiling report for the products table and embed it in the notebook.
profile = ProfileReport( products_info, title='Pandas profiling report ' , html={'style':{'full_width':True}})
profile.to_notebook_iframe()

In [None]:
# Per-column overview of the products table (LP ID excluded),
# rendered with bars on the count columns.
df_overview = DfOverview(products_info.drop(columns=['LP ID']))
df_ = df_overview.getOverview()
view_df(df_, ["count", "none_count", "unique_value_count"])

In [None]:
# Keep only the products whose provider is known, and renumber the rows.
products_info = products_info.dropna(subset=['Provider/Company Name']).reset_index(drop=True)

In [None]:
def get_mode(df, company, column):
    """Return the most frequent value of *column* among one provider's rows.

    Only rows whose ``"Provider/Company Name"`` equals *company* are
    considered. Returns ``None`` when those rows have no mode (for example,
    when none of them has a value).
    """
    same_company = df[df["Provider/Company Name"] == company]
    modes = same_company[column].mode()
    return modes[0] if len(modes) > 0 else None

In [None]:
def replace_with_mode(value, company, column):
    """Return *value*, or the provider-level mode of *column* when it is missing.

    A value counts as missing when ``pd.isna`` says so (``NaN``, ``None``,
    ``NaT``). The fallback is looked up in the module-level ``products_info``
    frame via ``get_mode`` and may itself be ``None``.
    """
    if not pd.isna(value):
        return value
    return get_mode(products_info, company, column)

In [None]:
# Fill the gaps of each imputed column row by row with the mode of the row's provider.
for _column in ("Sector(s)", "Primary Essential Function"):
    products_info[_column] = products_info.apply(
        lambda row: replace_with_mode(row[_column], row['Provider/Company Name'], _column),
        axis=1,
    )

In [None]:
# Drop every product row that still has a missing value in any column.
products_info.dropna(inplace=True)

In [None]:
# List the distinct sector values.
print(products_info['Sector(s)'].unique())

In [None]:
# List the distinct primary essential function labels.
print(products_info['Primary Essential Function'].unique())

In [None]:
# Splitting up the Primary Essential Function on ' - ': the first piece is the
# main category, the second piece the sub-category. The .str accessor yields
# NaN instead of raising when a label has no second piece.
_function_parts = products_info['Primary Essential Function'].str.split(' - ')

products_info['primary_function_main'] = _function_parts.str[0]
products_info['primary_function_sub'] = _function_parts.str[1]

# Synchronize similar values
products_info['primary_function_sub'] = products_info['primary_function_sub'].replace({'Sites, Resources & References' : 'Sites, Resources & Reference'})
products_info.drop("Primary Essential Function", axis=1, inplace=True)

After preprocessing, we are left with a reduced districts_info dataframe with 176 districts, and the products_info dataframe looks as follows:

In [None]:
# Recompute the per-column overview of the products table after preprocessing.
df_overview = DfOverview(products_info.drop(columns=['LP ID']))
df_ = df_overview.getOverview()
view_df(df_, ["count", "none_count", "unique_value_count"])

### ENGAGEMENT

The engagement file includes information about engagement of students with learning products in various school districts for the entire year 2020:

- `time` - date.
- `lp_id` - the unique identifier of the product.
- `pct_access` - percentage of students in the district who have at least one page-load event of a given product on a given day.
- `engagement_index` - total page-load events per one thousand students of a given product and on a given day.
- `district_id`

In [None]:
# Per-column overview of the engagement table, rendered with bars on the count columns.
df_overview = DfOverview(engagement)
df_ = df_overview.getOverview()
view_df(df_, ["count", "none_count", "unique_value_count"])

In [None]:
# Treat a missing engagement_index as zero.
engagement['engagement_index'] = engagement['engagement_index'].fillna(0)

In [None]:
# Calendar features derived from the `time` column.
engagement['year'] = engagement['time'].dt.year
engagement['month'] = engagement['time'].dt.month
engagement['day'] = engagement['time'].dt.day
engagement['DayOfWeek'] = engagement['time'].dt.dayofweek
# `.dt.weekofyear` no longer exists in pandas 2.x; the ISO calendar week is its replacement.
engagement['WeekOfYear'] = engagement['time'].dt.isocalendar().week
# dayofweek counts Monday as 0, so Saturday is 5 and Sunday is 6.
engagement['Weekend'] = (engagement['DayOfWeek'] >= 5).astype(int)
engagement['Weekday'] = (engagement['DayOfWeek'] < 5).astype(int)

In [None]:
# Recompute the per-column overview of the engagement table with the added calendar columns.
df_overview = DfOverview(engagement)
df_ = df_overview.getOverview()
view_df(df_, ["count", "none_count", "unique_value_count"])

## Exploration

### District

In [None]:
# Peek at the first five rows of the preprocessed districts table.
districts_info.head(5)

In [None]:
# Histogram of districts per state, coloured by locale, with the x categories in ascending order of total.
px.histogram(districts_info, x='state', color="locale").update_xaxes(categoryorder='total ascending')

In [None]:
# Count the districts per (state, locale) explicitly so that the bar lengths
# really are counts, then order the states (on the y axis) by their total.
district_counts = districts_info.groupby(['state', 'locale']).size().reset_index(name='count')
fig = px.bar(district_counts, x='count', y='state', color='locale', orientation='h',
             title='Count of districts in the available States')
fig.update_yaxes(categoryorder = 'total ascending')
fig.show()

In [None]:
# Sunburst of the number of districts per state (inner ring) and locale (outer ring).
sunb_data = districts_info[['state', 'locale']]
sunb_data = sunb_data.groupby(['state', 'locale']).size().reset_index(name='count')
fig = px.sunburst(sunb_data, path=['state', 'locale'], values='count')
fig.show()

In [None]:
# Lookup from a state or territory name to its two-letter abbreviation.
state_abb = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District Of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

In [None]:
districts_info['state_abb'] = districts_info['state'].map(state_abb)

# Districts per state abbreviation. The index and values of the counts are
# used directly: the column names produced by `to_frame().reset_index()`
# differ between pandas versions.
state_counts = districts_info['state_abb'].value_counts()

fig = go.Figure()
layout = dict(
    title_text = "Count of districts in the available States",
    title_font = dict(
            family = "monospace",
            size = 25,
            color = "black"
            ),
    geo_scope = 'usa'
)

fig.add_trace(
    go.Choropleth(
        locations = state_counts.index,
        # NOTE(review): zmax is given without zmin — confirm whether the
        # colour range was really meant to be capped at 1.
        zmax = 1,
        z = state_counts.values,
        locationmode = 'USA-states',
        marker_line_color = 'white',
        geo = 'geo',
        colorscale = "Reds",
    )
)

fig.update_layout(layout)
fig.show()

### Products

In [None]:
# Peek at the first five product rows.
products_info.head(5)

In [None]:
# Word cloud built from all product names joined into one string, shown without axes.
cloud = WordCloud(width=1080, height=270,background_color='white').generate(" ".join(products_info['Product Name'].astype(str)))
plt.figure(figsize=(22, 10))
plt.imshow(cloud)
plt.axis('off');

In [None]:
# Histogram of products per sub-function, coloured by main function.
px.histogram(products_info, x='primary_function_sub', color="primary_function_main").update_xaxes(categoryorder='total ascending')

In [None]:
# Histogram of products per sub-function, coloured by sector.
px.histogram(products_info, x='primary_function_sub', color="Sector(s)").update_xaxes(categoryorder='total ascending')

In [None]:
# Histogram of products per main function, coloured by sub-function.
px.histogram(products_info, x='primary_function_main', color="primary_function_sub").update_xaxes(categoryorder='total ascending')

In [None]:
# Non-null counts per provider; display the ten providers with the most product names.
freq = products_info.groupby('Provider/Company Name').count()
freq.sort_values(by='Product Name', ascending=False).head(10)

In [None]:
# Sunburst of the number of products per main function (inner ring) and provider (outer ring).
sunb_data = products_info[['primary_function_main', 'Provider/Company Name', 'Sector(s)']]
# Rows with a gap in any of the three selected columns are left out.
sunb_data = sunb_data.dropna()
sunb_data = sunb_data.groupby(['primary_function_main','Provider/Company Name']).size().reset_index(name='count')
fig = px.sunburst(sunb_data, path=['primary_function_main','Provider/Company Name'], values='count')
fig.show()

### Engagement

In [None]:
# Peek at the first rows of the engagement table.
engagement.head()

In [None]:
# Strip plot of pct_access for each month value in the engagement table.
plt.figure(figsize=(15,12))
sns.set_style('whitegrid')
sns.stripplot(x="month", y="pct_access", data=engagement)
plt.show()

## Joining engagement, products and districts datasets

In [None]:
# Display the districts table.
districts_info

In [None]:
# Print the column dtypes and non-null counts of the districts table.
districts_info.info()

In [None]:
# Align the key columns, then join engagement with products (on lp_id)
# and with districts (on district_id).
products_info = products_info.rename(columns={"LP ID": "lp_id"})
districts_info['district_id'] = districts_info['district_id'].astype(str)
df = engagement.merge(products_info, on='lp_id').merge(districts_info, on='district_id')

df.head(5)

In [None]:
# Mean engagement_index and pct_access per day value and sector, reshaped so
# the sector is the outer column level and the metric the inner one.
daily_trend = df.groupby(['day', 'Sector(s)']).agg({'engagement_index': 'mean', 'pct_access': 'mean'})
# `axis` is passed by keyword: pandas 2.x rejects it as a positional argument of sort_index.
daily_trend = daily_trend.unstack().swaplevel(0, 1, axis=1).sort_index(axis=1)

In [None]:
def plot_trend(df, columns, feature, title, x_label="", y_label="", labels=None):
    """Draw one line per entry of *columns* on a single figure.

    For each column the series ``df[column][feature]`` is plotted against
    ``df.index``.

    Args:
        df: Data frame whose index supplies the x values.
        columns: Outer column keys to plot, one line each.
        feature: Inner key selecting the plotted series of each column.
        title: Figure title.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        labels: Legend label per column, in the same order as *columns*.
            ``None`` behaves like ``['']``. A column without a matching
            label is labelled with its own name.
    """
    # A fresh list per call instead of a shared mutable default.
    labels = [''] if labels is None else labels
    plt.figure(figsize=(18, 6))
    for position, column in enumerate(columns):
        # Fall back to the column name rather than failing on a short label list.
        label = labels[position] if position < len(labels) else str(column)
        sns.lineplot(x=df.index, y=df[column][feature], label=label)
    plt.title(title, fontsize=15, fontweight='bold')
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.show()

In [None]:
# One line per sector: mean engagement index for each day value.
columns = df['Sector(s)'].unique()
plot_trend(daily_trend, columns, 'engagement_index', 'Average engagement index by day of month, per sector', labels=columns)

In [None]:
# Same lines as for the engagement index, now for the mean pct_access.
plot_trend(daily_trend, columns, 'pct_access', 'Average pct_access by day of month', labels=columns)

In [None]:
# Mean engagement_index and pct_access per day value and main function, reshaped
# so the main function is the outer column level and the metric the inner one.
daily_trend = df.groupby(['day', 'primary_function_main']).agg({'engagement_index': 'mean', 'pct_access': 'mean'})
# `axis` is passed by keyword: pandas 2.x rejects it as a positional argument of sort_index.
daily_trend = daily_trend.unstack().swaplevel(0, 1, axis=1).sort_index(axis=1)

In [None]:
# One line per main function: mean engagement index for each day value.
columns = df['primary_function_main'].unique()
plot_trend(daily_trend, columns, 'engagement_index', 'Average engagement index by day of month, per primary function', labels=columns)

In [None]:
# Mean engagement_index and pct_access per month and main function, reshaped so
# the main function is the outer column level and the metric the inner one.
monthly_trend = df.groupby(['month', 'primary_function_main']).agg({'engagement_index': 'mean', 'pct_access': 'mean'})
# `axis` is passed by keyword: pandas 2.x rejects it as a positional argument of sort_index.
monthly_trend = monthly_trend.unstack().swaplevel(0, 1, axis=1).sort_index(axis=1)

In [None]:
# Mean engagement index for each month value, one line per entry of `columns`.
plot_trend(monthly_trend, columns, 'engagement_index', 'Average engagement index by month', labels=columns)

In [None]:
# Mean pct_access for each month value, one line per entry of `columns`.
plot_trend(monthly_trend, columns, 'pct_access', 'Average pct_access by month', labels=columns)

In [None]:
# Mean engagement_index and pct_access per `time` value and main function.
# The result is stored under the existing name `monthly_trend`.
monthly_trend = df.groupby(['time', 'primary_function_main']).agg({'engagement_index': 'mean', 'pct_access': 'mean'})
# `axis` is passed by keyword: pandas 2.x rejects it as a positional argument of sort_index.
monthly_trend = monthly_trend.unstack().swaplevel(0, 1, axis=1).sort_index(axis=1)