In [None]:
import numpy as np
import pandas as pd
import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px 
import plotly.tools as tls 
import plotly.figure_factory as ff 


import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames[0:1]:
#        print(os.path.join(dirname, filename))

# Data Cleaning on All Dataset

## Product and District Data

In [None]:
# Load the two reference tables (district attributes, product catalogue).
district_df, products_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv',
        '/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv',
    )
)

In [None]:
# Preview the first three rows of the district table.
district_df.head(3)

In [None]:
# Preview the first three rows of the product table.
products_df.head(3)

### Data Types Checking

### District Data
#### Finding
- Every row with a missing ```state``` is also missing ```locale```, ```pct_black/hispanic```, ```pct_free/reduced```, ```county_connections_ratio``` and ```pp_total_raw```
<br/>
<br/>

#### Cleaning Requirement
- Consider dropping every ```district_id``` row that has a missing value in ```state```
- Correct the format of the columns ```pct_black/hispanic```, ```pct_free/reduced```, ```county_connections_ratio``` and ```pp_total_raw``` by splitting each into min, max and mean

In [None]:
# Check missing values in the district table.
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own line; passing it to display() only rendered a stray "None".
district_df.info()
# Percentage of missing values per column.
display(district_df.isna().sum()/len(district_df)*100)

In [None]:
# Visualize where missing values occur; rows are ordered by state (ascending)
# so that rows sharing a state sit next to each other in the matrix.
district_sorted_by_state = district_df.sort_values('state')
msno.matrix(district_sorted_by_state)

In [None]:
# Drop every row whose `state` is null. The frame is modified in place, so the
# previews rendered by earlier cells no longer reflect district_df.
district_df.dropna(subset = ['state'], how = 'any', inplace = True)

In [None]:
'''
Reformat on 4 columns by following steps
- Remove the bracket
- Split number by comma
- Get max and min number by position after split
- Get average from max and min value
'''

reformat_colname = ['pct_black/hispanic', 'pct_free/reduced', 'county_connections_ratio', 'pp_total_raw']

for colname in reformat_colname:
    # Strip every '[' character, then split the remaining text on the comma:
    # piece 0 is the lower bound, piece 1 the upper bound.
    bounds = (
        district_df[colname]
        .str.replace('[', '', regex = False)
        .str.split(',', expand = True)
    )
    lower_bound = bounds[0].astype('float')
    upper_bound = bounds[1].astype('float')

    district_df[colname + '_min'] = lower_bound
    district_df[colname + '_max'] = upper_bound
    district_df[colname + '_avg'] = (lower_bound + upper_bound)/2

    # The raw text column is replaced by the three numeric columns above.
    # NOTE: this makes the cell non-re-runnable (the source column is gone).
    district_df.drop(columns = colname, inplace = True)

# Combined "state:locale" key and a string-typed district id.
district_df['state_local'] = district_df['state'] + ':' + district_df['locale']
district_df['district_id'] = district_df['district_id'].astype('string')

In [None]:
# Preview the district table after the range columns were split into min/max/avg.
district_df.head(5)

### Product Data
#### Finding
- Missing values in ```Sector(s)``` and ```Primary Essential Function``` are due to duplication in ```Product Name``` and ```URL```
- ```Sector(s)``` and ```Primary Essential Function``` are always missing together

#### Cleaning Requirement
- Manually fill ```Sector(s)``` and ```Primary Essential Function``` from another ```Product Name``` within the same ```Provider/Company Name``` group
- Replace '-' in ```Primary Essential Function``` for splitting
- Split ```Primary Essential Function``` to get product labels and sub-categories
- Split ```Sector(s)``` column-wise and turn it into booleans

In [None]:
# Check missing values in the product table.
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own line; passing it to display() only rendered a stray "None".
products_df.info()
# Percentage of missing values per column.
display(products_df.isna().sum()/len(products_df)*100)

In [None]:
# Missing-value matrix of the product table, rows ordered by sector (descending).
products_sorted_by_sector = products_df.sort_values('Sector(s)', ascending = False)
msno.matrix(products_sorted_by_sector)

In [None]:
# Per provider: number of distinct products and number of rows with a missing
# sector. Used to find providers whose missing rows can be filled from a sibling.
product_count = (
    products_df
    .groupby(['Provider/Company Name'])
    .agg({'Product Name' : 'nunique',
          'Sector(s)' : lambda x : x.isnull().sum()})
    .reset_index()
)

# Providers that have at least one row with a missing sector AND more distinct
# products than missing rows, i.e. some other product of theirs is filled in.
has_fillable_gap = (
    (product_count['Product Name'] > product_count['Sector(s)'])
    & (product_count['Sector(s)'] > 0)
)

# The rename key must be "Sector(s)" (it was misspelled "Sectors(s)", so the
# missing-count column silently kept its misleading original name).
potential_product_dup = product_count[has_fillable_gap].rename(
    columns = {"Product Name" : "Unique_Product Name",
               "Sector(s)" : "Missing_Sector(s)"}
)

potential_product_dup

In [None]:
# Example of a provider with a missing sector: show all 'Adobe Inc.' products.
products_df[products_df['Provider/Company Name'] == 'Adobe Inc.']

In [None]:
# Manual fill: row to fill -> donor row from the same provider.
# NOTE(review): the target is addressed by index label (.at) and the donor by
# position (.iloc); the two agree only while products_df keeps its default
# 0..n-1 index - presumably still true at this point, verify if rows are
# dropped or re-ordered before this cell.
donor_row_by_target = {
    305: 213,   # Adobe Inc.
    237: 19,    # ClassDojo, Inc.
    356: 22,    # Code.org
    370: 31,    # EDpuzzle Inc.
    314: 57,    # Grammarly
    61: 60,     # IXL Learning
    183: 216,   # Microsoft
    352: 301,   # Technological Solutions, Inc. (TSI)
}

for target_row, donor_row in donor_row_by_target.items():
    for field in ('Sector(s)', 'Primary Essential Function'):
        products_df.at[target_row, field] = products_df.iloc[donor_row][field]

In [None]:
# Helper Functions for cleaning sub-strip in list
def clean_sectors_list(sector_list: list) -> list:
    """Strip surrounding whitespace from every sector name in ``sector_list``.

    Parameters
    ----------
    sector_list : list
        Sector names, typically the result of splitting the raw sector text.

    Returns
    -------
    list
        The stripped names, or ``['Unknow']`` when ``sector_list`` is not an
        iterable of strings (for example NaN/None for a row with no sector).
    """
    try:
        return [sector.strip() for sector in sector_list]
    except (TypeError, AttributeError):
        # TypeError: input is not iterable (NaN, None).
        # AttributeError: an element has no .strip() (not a string).
        # Anything else (KeyboardInterrupt, ...) is deliberately not swallowed.
        return ['Unknow']
        
        
def clean_sub_cat(item_list: list) -> list:
    """Return the stripped sub-category parts of a split category string.

    The first element of ``item_list`` is skipped; every remaining element is
    returned with surrounding whitespace removed.

    Parameters
    ----------
    item_list : list
        Pieces of a split category text; element 0 is ignored here.

    Returns
    -------
    list
        Stripped elements from position 1 onward (empty when there is only one
        piece), or ``['Unknow']`` when ``item_list`` cannot be sliced or holds
        non-string elements (for example NaN/None for a missing value).
    """
    try:
        return [item.strip() for item in item_list[1:]]
    except (TypeError, AttributeError):
        # TypeError: input is not sliceable (NaN, None).
        # AttributeError: an element has no .strip() (not a string).
        # Anything else (KeyboardInterrupt, ...) is deliberately not swallowed.
        return ['Unknow']

In [None]:
# Sectors Column
# Split the ';'-separated sector text into a cleaned list per product.
products_df['Sector(s)_list'] = (
    products_df['Sector(s)']
    .str.split(';')
    .apply(clean_sectors_list)
)

# One row per (product, sector) pair, then a per-product count table with one
# column per distinct sector value, joined back on the product index.
sector = products_df['Sector(s)_list'].explode()
sector_counts = pd.crosstab(sector.index, sector)

sector_column_names = {'Corporate' : 'Sector_Corporate',
                       'Higher Ed' : 'Sector_Higher Ed',
                       'PreK-12' : 'Sector_PreK-12',
                       'Unknow' : 'Sector_Unknow'}
products_df = products_df.join(sector_counts).rename(columns = sector_column_names)

In [None]:
# Category Column
function_text = products_df['Primary Essential Function']

# Text before the first '-' is the category; the full split is kept as a list
# and everything after the first piece becomes the sub-category list.
products_df['Product Category'] = function_text.str.split('-', expand = True)[0]
products_df['Primary Essential Function_list'] = function_text.str.split('-')
products_df['Product Sub-Cat_list'] = products_df['Primary Essential Function_list'].apply(clean_sub_cat)

# One row per (product, sub-category) pair, then a per-product count table
# with one column per distinct sub-category, joined back on the product index.
subcat = products_df['Product Sub-Cat_list'].explode()
subcat_counts = pd.crosstab(subcat.index, subcat)
products_df = products_df.join(subcat_counts)

products_df.head(5)

### Engagement Data

In [None]:
engauge_path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
engauge_data_list = os.listdir(engauge_path)

# Read every per-district file into a list and concatenate once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside the loop re-copied the accumulated frame on every iteration.
district_frames = []
for file in engauge_data_list:
    single_city_data = pd.read_csv(engauge_path + '/' + file)
    # The file name without its '.csv' extension is used as the district id.
    single_city_data['district_id'] = file.replace('.csv', '')
    district_frames.append(single_city_data)

# Row labels are kept as read from each file (no ignore_index), which matches
# what the repeated append produced. An empty folder yields an empty frame
# instead of the ValueError pd.concat raises on an empty list.
total_engauge_df = pd.concat(district_frames) if district_frames else pd.DataFrame()

In [None]:
# Preview the combined engagement table.
total_engauge_df.head(5)

In [None]:
# Check missing values on all columns (percentage of rows missing, per column).
# The string below is a note only; it is not the cell's last expression, so it is not rendered.
''' Consider to drop all missing lp_id and pct_access due to low percentage of missing'''
total_engauge_df.isna().sum()/len(total_engauge_df)*100

In [None]:
# Clean Up #1
# - Drop rows with a missing lp_id or pct_access
# - Convert lp_id to a string id without the float '.0' suffix
# - Convert 'time' to a datetime dtype
total_engauge_df = total_engauge_df.dropna(subset = ['lp_id', 'pct_access'])

# Remove only a trailing '.0'. The previous literal replace removed every '.0'
# substring, which would corrupt any value containing it elsewhere
# (e.g. '1.05' -> '15').
total_engauge_df['lp_id'] = (
    total_engauge_df['lp_id']
    .astype('str')
    .str.replace(r'\.0$', '', regex = True)
)
total_engauge_df['time'] = pd.to_datetime(total_engauge_df['time'], format = '%Y-%m-%d')

In [None]:
# Calendar features derived from the parsed 'time' column.
total_engauge_df['month'] = total_engauge_df['time'].dt.month
total_engauge_df['year'] = total_engauge_df['time'].dt.year
# The quarter goes into its own column; it used to be written to 'year',
# overwriting the year assigned on the line above.
total_engauge_df['quarter'] = total_engauge_df['time'].dt.quarter
total_engauge_df['dateofweek_quarter'] = total_engauge_df['time'].dt.dayofweek # Monday = 0, Sunday = 6
total_engauge_df['dateofweek_name'] = total_engauge_df['time'].dt.day_name()

In [None]:
# Extract datetime features
# All features come from the same column, so its .dt accessor is taken once.
time_parts = total_engauge_df['time'].dt

total_engauge_df['month'] = time_parts.month
total_engauge_df['year'] = time_parts.year
total_engauge_df['quarter'] = time_parts.quarter
total_engauge_df['dateofweek_quarter'] = time_parts.dayofweek # Monday = 0, Sunday = 6
total_engauge_df['dateofweek_name'] = time_parts.day_name()
total_engauge_df['year_month'] = time_parts.to_period('M')

# Composite "<district_id>_<lp_id>" key.
total_engauge_df['district_product'] = total_engauge_df['district_id'] + '_' + total_engauge_df['lp_id']

# Exploratory Data Analysis

## Overall Trend Over 2020

In [None]:
# Monthly mean of each metric across all rows, as two small frames
# (columns: month + the metric).
rows_by_month = total_engauge_df.groupby(['month'])
overall_engage = rows_by_month[['engagement_index']].mean().reset_index()
overall_access = rows_by_month[['pct_access']].mean().reset_index()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Dual-axis figure: engagement index on the primary (left) y-axis,
# access percentage on the secondary (right) y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

engagement_trace = go.Scatter(
    x=overall_engage['month'],
    y=overall_engage['engagement_index'],
    name="Engaugement data",
)
access_trace = go.Scatter(
    x=overall_access['month'],
    y=overall_access['pct_access'],
    name="Access Pct data",
)
for trace, on_secondary in ((engagement_trace, False), (access_trace, True)):
    fig.add_trace(trace, secondary_y=on_secondary)

# Titles: figure, shared x-axis, then one per y-axis.
fig.update_layout(title_text="Overall of engaugement index and access percentage over 2020")
fig.update_xaxes(title_text="Month")
for axis_title, on_secondary in (("Enagement Index", False), ("Access Percentage", True)):
    fig.update_yaxes(title_text=axis_title, secondary_y=on_secondary)

fig.show()

In [None]:
covid_data = pd.read_csv('../input/covid-case-from-ourworldindata/owid-covid-data.csv')

# Select the US rows and the needed columns in a single .loc call and take an
# explicit copy: the columns added below then belong to an independent frame
# instead of a slice of covid_data (which triggers SettingWithCopyWarning).
us_data = covid_data.loc[
    covid_data['location'] == 'United States',
    ['date', 'new_cases', 'new_vaccinations'],
].copy()

us_data['date'] = pd.to_datetime(us_data['date'])

# Calendar features used for filtering and grouping.
us_data['month'] = us_data['date'].dt.month
us_data['year'] = us_data['date'].dt.year
us_data['quarter'] = us_data['date'].dt.quarter

us_data_2020 = us_data[us_data['year'] == 2020]

# Total new cases per calendar month of 2020.
us_data_2020monthly = us_data_2020.groupby(['month']).agg({'new_cases':'sum'}).reset_index()

In [None]:
# Dual-axis figure: engagement index as a line on the primary (left) y-axis,
# monthly US COVID-19 cases as bars on the secondary (right) y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

engagement_trace = go.Scatter(
    x=overall_engage['month'],
    y=overall_engage['engagement_index'],
    name="Engaugement data",
)
covid_trace = go.Bar(
    x = us_data_2020monthly['month'],
    y = us_data_2020monthly['new_cases'],
    name = "COVID19 Case in USA",
)
for trace, on_secondary in ((engagement_trace, False), (covid_trace, True)):
    fig.add_trace(trace, secondary_y=on_secondary)

# Titles: figure, shared x-axis, then one per y-axis.
fig.update_layout(title_text="Overall of engaugement index and monthly COVID case over 2020")
fig.update_xaxes(title_text="Month")
for axis_title, on_secondary in (("Enagement Index", False), ("Monthly COVID case", True)):
    fig.update_yaxes(title_text=axis_title, secondary_y=on_secondary)

fig.show()

## Overall Trend by District and Locale

In [None]:
# Helper functions to reduce memory usage
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype whose range fits.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are examined; every other column is left untouched. Integer columns are
    tried against int8, int16, int32, int64 in that order; float columns
    against float16 and float32, falling back to float64. A type is accepted
    when the column's min and max lie strictly inside that type's limits.

    NOTE: only the value *range* is checked. Downcasting floats to
    float16/float32 can therefore lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the memory usage after optimization and the
        percentage saved. When False, print nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient re-assignment by the caller.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Smallest integer type whose limits strictly contain the values.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Smallest float type whose limits strictly contain the values;
            # float64 when neither fits (this includes an all-NaN column,
            # where both comparisons are False).
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Skip the percentage when there is no baseline to divide by.
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast numeric columns of both frames before merging them.
# reduce_mem_usage picks dtypes by value range only, so float columns may be
# stored as float16/float32 from here on.
total_engauge_df = reduce_mem_usage(total_engauge_df)
district_df = reduce_mem_usage(district_df)

In [None]:
# Attach the district attributes to every engagement row. how='left' keeps
# engagement rows whose district_id has no match in district_df (their
# district columns are then null).
total_engauge_df = total_engauge_df.merge(district_df, on= 'district_id', how = 'left')

In [None]:
# Drop, in place, the engagement rows with no state_local value - presumably
# the rows whose district was removed from district_df earlier or found no
# match in the left merge; verify against the merge result.
total_engauge_df.dropna(subset = ['state_local'], inplace = True)

In [None]:
# Mean engagement index and access percentage per (locale, month).
locale_monthly_means = (
    total_engauge_df
    .groupby(['locale', 'month'])
    .agg({'engagement_index' : 'mean','pct_access' : 'mean'})
    .reset_index()
)

# One line per locale: monthly mean engagement index.
px.line(
    locale_monthly_means,
    x = 'month',
    y = 'engagement_index',
    color = 'locale',
    title = 'Monthly Average Engaugement Index by Locale',
)

In [None]:
# Mean engagement index and access percentage per (locale, month).
locale_monthly_means = (
    total_engauge_df
    .groupby(['locale', 'month'])
    .agg({'engagement_index' : 'mean','pct_access' : 'mean'})
    .reset_index()
)

# One line per locale: monthly mean access percentage.
px.line(
    locale_monthly_means,
    x = 'month',
    y = 'pct_access',
    color = 'locale',
    title = 'Monthly Average Access Pct by Locale',
)

In [None]:
# Mean engagement index and access percentage per (state, month).
state_monthly_means = (
    total_engauge_df
    .groupby(['state', 'month'])
    .agg({'engagement_index' : 'mean','pct_access' : 'mean'})
    .reset_index()
)

# One line per state: monthly mean engagement index.
px.line(
    state_monthly_means,
    x = 'month',
    y = 'engagement_index',
    color = 'state',
    title = 'Monthly Average Engaugement Index by District',
)

In [None]:
# Mean engagement index and access percentage per (state, month).
state_monthly_means = (
    total_engauge_df
    .groupby(['state', 'month'])
    .agg({'engagement_index' : 'mean','pct_access' : 'mean'})
    .reset_index()
)

# One line per state: monthly mean access percentage.
# This cell was a verbatim copy of the engagement-index plot in the previous
# cell; mirroring the locale pair above, it now plots pct_access.
px.line(
    state_monthly_means,
    x = 'month',
    y = 'pct_access',
    color = 'state',
    title = 'Monthly Average Access Pct by District',
)