In [None]:
'''
Learn Covid Platform COVID-19 Impact
'''

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from statsmodels.tsa.statespace.sarimax import SARIMAX



# Seaborn style. Newer matplotlib releases renamed 'seaborn-whitegrid' to
# 'seaborn-v0_8-whitegrid', so pick whichever name this installation provides
# and fall back to the default style if neither is available.
_STYLE_CANDIDATES = ('seaborn-whitegrid', 'seaborn-v0_8-whitegrid')
plt.style.use(next((name for name in _STYLE_CANDIDATES if name in plt.style.available), 'default'))


In [None]:
# Visit every file shipped under the Kaggle input directory.
# `data` ends up holding the path of the last file visited (if any).
districts = []

import os
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        data = os.path.join(root_dir, file_name)

# Districts Info
Districts whose `state` value is NaN are dropped.

In [None]:
# Districts Info 
#filtrar datos en base a district_info.csv
%time
districts_path = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
districts_info_df = pd.read_csv(districts_path)
print(districts_info_df.info())
print(districts_info_df.head())
print(districts_info_df.isna().sum())
print("==Delete all district that has state as NaN")
districts_info_df = districts_info_df[districts_info_df.state.notna()].reset_index(drop=True)
print(districts_info_df.isna().sum())
print(districts_info_df.head())
districts_info_df = pd.read_csv(districts_path)
print(districts_info_df.info())
print(districts_info_df.head())
print(districts_info_df.isna().sum())
print("==Delete all district that has state as NaN")
districts_info_df = districts_info_df[districts_info_df.state.notna()].reset_index(drop=True)
print(districts_info_df.isna().sum())
print(districts_info_df.head())

In [None]:
%time
products_path = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
products_info_df = pd.read_csv(products_path)
# Rename column "LP ID"
# prod_cols = products_info_df.columns.tolist()
# prod_cols[0] = "lp_id"
# products_info_df.columns=prod_cols
# products_info_df.head()

temp_sectors = products_info_df['Sector(s)'].str.get_dummies(sep="; ")
temp_sectors.columns = [f"sector_{re.sub(' ', '', c)}" for c in temp_sectors.columns]
products_info_df = products_info_df.join(temp_sectors)
products_info_df.drop("Sector(s)", axis=1, inplace=True)

del temp_sectors

products_info_df['primary_function_main'] = products_info_df['Primary Essential Function'].apply(lambda x: x.split(' - ')[0] if x == x else x)
products_info_df['primary_function_sub'] = products_info_df['Primary Essential Function'].apply(lambda x: x.split(' - ')[1] if x == x else x)

# Synchronize similar values
products_info_df['primary_function_sub'] = products_info_df['primary_function_sub'].replace({'Sites, Resources & References' : 'Sites, Resources & Reference'})
products_info_df.drop("Primary Essential Function", axis=1, inplace=True)

print(products_info_df.head())

In [None]:
districts_info_df.sort_values(by="district_id", ascending=True)

# Products Info

# Engagement Data
We only consider districts with complete 2020 engagement data (one record set for each of the 366 days), to avoid errors and bias caused by incomplete data.

In [None]:
%time

ENGAGEMENT_PATH = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data"

temp = []

for district in districts_info_df.district_id.unique():
    df = pd.read_csv(f'{ENGAGEMENT_PATH}/{district}.csv', index_col=None, header=0)
    df["district_id"] = district
    if df.time.nunique() == 366:
        temp.append(df)

all_engagement_df = pd.concat(temp)
all_engagement_df = all_engagement_df.reset_index(drop=True)

# Only consider districts with full 2020 engagement data
districts_info_df = districts_info_df[districts_info_df.district_id.isin(all_engagement_df.district_id.unique())].reset_index(drop=True)
products_info_df = products_info_df[products_info_df['LP ID'].isin(all_engagement_df.lp_id.unique())].reset_index(drop=True)

In [None]:
# Keep only engagement rows whose product appears in products_info_df,
# then print the total number of remaining rows.
known_product_ids = products_info_df['LP ID'].unique()
all_engagement_df = all_engagement_df[all_engagement_df.lp_id.isin(known_product_ids)]
print(all_engagement_df.shape[0])

In [None]:
# Fix date column: parse the date strings into datetime64 values.
# all_engagement_df is the result of a boolean filter, so build a new frame with
# .assign() instead of setting an attribute on it (which can raise
# SettingWithCopyWarning and is ambiguous about what gets modified).
all_engagement_df = all_engagement_df.assign(time=pd.to_datetime(all_engagement_df["time"]))

In [None]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District Of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# Translate full state names to their two-letter codes, then count the
# districts available for each code.
districts_info_df['state_abbrev'] = districts_info_df['state'].replace(us_state_abbrev)
districts_info_by_state = (
    districts_info_df['state_abbrev']
    .value_counts()
    .rename_axis('state_abbrev')
    .reset_index(name='num_districts')
)

# Number of Available School Districts per State

In [None]:
# US map coloured by the number of districts available in each state.
# NOTE(review): zmax is set without zmin — confirm it has the intended effect on the colour range.
district_count_trace = go.Choropleth(
    locations=districts_info_by_state.state_abbrev,
    zmax=1,
    z=districts_info_by_state.num_districts,
    locationmode='USA-states',  # `locations` holds two-letter state codes
    marker_line_color='white',
    geo='geo',
    colorscale=px.colors.sequential.RdBu_r,
)

layout = dict(
    title_text="Available School Districts in each State",
    geo_scope='usa',
)

fig = go.Figure()
fig.add_trace(district_count_trace)
fig.update_layout(layout)
fig.show()

In [None]:
#Checking the dtype of each column
all_engagement_df.info()

In [None]:
all_engagement_df.head()

In [None]:
# All engagement number of rows
all_engagement_df.shape

In [None]:
# Attach the district attributes to the engagement rows. A right join keeps
# every district of districts_info_df, even one without engagement rows.
engagement_district_df = pd.merge(
    all_engagement_df,
    districts_info_df,
    how="right",
    on="district_id",
)
engagement_district_df.head()

In [None]:
# The engagement/district merge is already performed by the previous cell;
# repeating it here only redid the same work, so this cell just previews the result.
engagement_district_df.head()

In [None]:
# View number of registers
engagement_district_df.shape

In [None]:
# Discard rows that carry no product id, then show the remaining size.
engagement_district_df = engagement_district_df.dropna(subset=["lp_id"])
engagement_district_df.shape

In [None]:
# View NA in engagment_district_df 
engagement_district_df.isna().sum()

In [None]:
products_info_df.isna().sum()

# Number of available data by each State

In [None]:
pd.DataFrame(engagement_district_df.state.value_counts())

In [None]:
# Merge engagement_district_df with products_info_df.
# The product key is renamed by name ("LP ID" -> "lp_id") rather than by column
# position, so the right column is renamed even if the column order changes, and
# re-running the cell is a harmless no-op.
products_info_df = products_info_df.rename(columns={"LP ID": "lp_id"})
# Left join: every engagement row is kept, product attributes are NaN when unknown.
engagement_district_product_df = engagement_district_df.merge(products_info_df, on="lp_id", how="left")
engagement_district_product_df.shape

In [None]:
engagement_district_product_df[engagement_district_product_df["URL"].isna()]

# Missing values in the final table (Engagement | District | Product)
Rows with a missing `engagement_index` are removed.

In [None]:
# NaN in engagement_district_product_df
pd.DataFrame(engagement_district_product_df.isna().sum())

In [None]:
# Remove every row whose engagement_index is missing, then show the remaining size.
engagement_district_product_df = engagement_district_product_df.dropna(subset=["engagement_index"])
engagement_district_product_df.shape

In [None]:
engagement_district_product_df.head()

# Number of values for each product

In [None]:
pd.DataFrame(engagement_district_product_df['Product Name'].value_counts())

In [None]:
# Split the bracketed "a, b" text of pct_black/hispanic into its two numbers:
# the first with its leading character removed, the second with its trailing one.
# NOTE(review): the two numbers look like the lower/upper bound of a single range
# rather than separate Black and Hispanic shares — confirm against the dataset
# description before reading the new column names literally.
# str(x) and na_action='ignore' keep missing values as NaN instead of failing on
# float.split, matching how pct_free/reduced is parsed.
engagement_district_product_df['pct_black'] = engagement_district_product_df['pct_black/hispanic'].map(lambda x: float(str(x).split(",")[0][1:]), na_action='ignore')
pd.unique(engagement_district_product_df['pct_black'])
engagement_district_product_df['pct_hispanic'] = engagement_district_product_df['pct_black/hispanic'].map(lambda x: float(str(x).split(",")[1][:-1]), na_action='ignore')
pd.unique(engagement_district_product_df['pct_hispanic'])

In [None]:
# Split the bracketed "a, b" text of pct_free/reduced into two numeric columns.
# Missing values are skipped by na_action='ignore' and stay NaN.
def _free_reduced_part(value, position):
    """Return number `position` (0 or 1) of the comma-separated pair as a float."""
    piece = str(value).split(",")[position]
    # The first piece carries a leading bracket, the second a trailing one.
    trimmed = piece[1:] if position == 0 else piece[:-1]
    return float(trimmed)

_free_reduced_raw = engagement_district_product_df['pct_free/reduced']
engagement_district_product_df['pct_free'] = _free_reduced_raw.map(lambda v: _free_reduced_part(v, 0), na_action='ignore')
pd.unique(engagement_district_product_df['pct_free'])
engagement_district_product_df['pct_reduced'] = _free_reduced_raw.map(lambda v: _free_reduced_part(v, 1), na_action='ignore')
pd.unique(engagement_district_product_df['pct_reduced'])

# Exploring the Final Table

In [None]:
# Final Dataframe
engagement_district_product_df.head()

# Interactive Graphics
In this section we present a series of graphics that you, as a reader, can interact with. Dropdowns let you select the state and other parameters for the Exploratory Data Analysis. (The notebook may need to be run inside Kaggle for the widgets to be displayed.)

In [None]:
# Parameters: the distinct values offered by each dropdown
def _distinct_values(column):
    """Return the distinct values of one column of the final table as a list."""
    return list(pd.unique(engagement_district_product_df[column]))

list_state = _distinct_values('state')
list_products = _distinct_values('Product Name')
list_companies = _distinct_values('Provider/Company Name')
list_locale = _distinct_values('locale')
list_date = _distinct_values('time')

In [None]:
def plot_engagement_state_product(state,product_name):
  """Plot the daily mean engagement_index of one product in one state.

  Args:
    state: value matched against the 'state' column.
    product_name: value matched against the 'Product Name' column.

  Reads the module-level engagement_district_product_df and shows a
  matplotlib figure; nothing is returned.
  """
  # The frame is only read, never rebound, so no `global` declaration is needed.
  selected = engagement_district_product_df.loc[
      (engagement_district_product_df['Product Name'] == product_name)
      & (engagement_district_product_df['state'] == state)
  ]
  data = selected.groupby('time')['engagement_index'].mean()
  plt.figure(figsize=(18,5))
  plt.plot(data)
  # Label the figure so it can be read on its own.
  plt.title(f"Mean engagement index: {product_name} in {state}")
  plt.xlabel("Date")
  plt.ylabel("Mean engagement_index")
  plt.show()

# Mean Engagement index by State (dropdown) and Product (dropdown)
Use the dropdowns to interact

In [None]:
# Mean Engament_index by date (with state and product_name as parameters)
interact(plot_engagement_state_product, state=list_state, product_name = list_products);

In [None]:
def plot_engagement_state_company(state,company):
  """Plot the daily mean engagement_index of one provider's products in one state.

  Args:
    state: value matched against the 'state' column.
    company: value matched against the 'Provider/Company Name' column.

  Reads the module-level engagement_district_product_df and shows a
  matplotlib figure; nothing is returned.
  """
  # The frame is only read, never rebound, so no `global` declaration is needed.
  selected = engagement_district_product_df.loc[
      (engagement_district_product_df['Provider/Company Name'] == company)
      & (engagement_district_product_df['state'] == state)
  ]
  data = selected.groupby('time')['engagement_index'].mean()
  plt.figure(figsize=(18,5))
  plt.plot(data)
  # Label the figure so it can be read on its own.
  plt.title(f"Mean engagement index: {company} in {state}")
  plt.xlabel("Date")
  plt.ylabel("Mean engagement_index")
  plt.show()

# Mean Engagement index by State (dropdown) and Company (dropdown)
Use the dropdowns to interact

In [None]:
# Mean Engament_index by date (with state and company as parameters)
interact(plot_engagement_state_company, state=list_state, company = list_companies);

In [None]:
def plot_engagement_state_locale(state,locale):
  """Plot the daily mean engagement_index of one locale type in one state.

  Args:
    state: value matched against the 'state' column.
    locale: value matched against the 'locale' column.

  Reads the module-level engagement_district_product_df and shows a
  matplotlib figure; nothing is returned.
  """
  # The frame is only read, never rebound, so no `global` declaration is needed.
  selected = engagement_district_product_df.loc[
      (engagement_district_product_df['locale'] == locale)
      & (engagement_district_product_df['state'] == state)
  ]
  data = selected.groupby('time')['engagement_index'].mean()
  plt.figure(figsize=(18,5))
  plt.plot(data)
  # Label the figure so it can be read on its own.
  plt.title(f"Mean engagement index: {locale} districts in {state}")
  plt.xlabel("Date")
  plt.ylabel("Mean engagement_index")
  plt.show()

# Mean Engagement index by date, by State (Dropdown) and Locale (Dropdown)
Use the dropdowns to interact

In [None]:
# Mean Engament_index by date (with state and locale/type of state as parameters)
interact(plot_engagement_state_locale, state=list_state, locale = list_locale);

In [None]:
def map_by_product_time(product,time):
    """Show a US map of the mean engagement_index per state for one product on one date.

    Args:
        product: value matched against the 'Product Name' column.
        time: value matched against the 'time' column.

    Reads the module-level engagement_district_product_df and displays a
    plotly figure; nothing is returned.
    """
    is_selected = (
        (engagement_district_product_df['Product Name'] == product)
        & (engagement_district_product_df['time'] == time)
    )
    per_state = engagement_district_product_df[is_selected].groupby('state_abbrev')['engagement_index'].mean()
    data = per_state.rename('mean_engagement').reset_index()

    # NOTE(review): zmax is set without zmin — confirm it has the intended effect on the colour range.
    state_trace = go.Choropleth(
        locations=data.state_abbrev,
        zmax=1,
        z=data.mean_engagement,
        locationmode='USA-states',  # `locations` holds two-letter state codes
        marker_line_color='white',
        geo='geo',
        colorscale=px.colors.sequential.RdBu_r,
    )
    layout = dict(
        title_text="Average Engagement Index by State",
        geo_scope='usa',
    )

    fig = go.Figure()
    fig.add_trace(state_trace)
    fig.update_layout(layout)
    fig.show()

# Mean Engagement index by State on a map, by Product (Dropdown) and Date (Dropdown)
Use the dropdowns to interact

In [None]:
# Mean Engament_index by state in a map (with product and time as parameters)
interact(map_by_product_time, product=list_products, time = list_date);

# Most Popular
In this section we present an analysis of most popular products

In [None]:
# Horizontal bar chart of how many rows each provider/company accounts for,
# with the most frequent company at the top.
company_fig, company_ax = plt.subplots(figsize=(10,70))
company_counts = engagement_district_product_df['Provider/Company Name'].value_counts()
company_counts.sort_values(ascending=True).plot.barh(ax=company_ax)
company_ax.set_title("Most Popular Provider Company")
company_ax.set_ylabel("Name")
company_ax.set_xlabel("Frequency");

In [None]:
# Which is the most used product
plt.figure(figsize=(10,10))
# value_counts() is sorted from most to least frequent, so take the top 15 FIRST
# and only then sort ascending for the bar order. Sorting ascending before slicing
# selected the 15 least used products instead.
engagement_district_product_df['Product Name'].value_counts().head(15).sort_values(ascending=True).plot.barh()
plt.xlabel("Frequency")
plt.title("Most Used Products");



In [None]:
# Which is the most used product, skipping the seven most frequent entries
# NOTE(review): the offset 7 presumably corresponds to the Google products named in the title — confirm.
plt.figure(figsize=(18,5))
engagement_district_product_df['Product Name'].value_counts().iloc[7:20].sort_values(ascending=True).plot.barh()
# Horizontal bars: products are on the y axis and counts on the x axis
# (the two labels were swapped).
plt.ylabel("Products")
plt.xlabel("Frequency")
plt.title("Most Used Products w/o Google Suite");

In [None]:
# Horizontal bar chart of the row count of every product, most frequent at the top.
product_fig, product_ax = plt.subplots(figsize=(10,80))
product_counts = engagement_district_product_df['Product Name'].value_counts()
product_counts.sort_values(ascending=True).plot.barh(ax=product_ax);
product_ax.set_title("Most Popular Product");

In [None]:
# Names of the 20 products with the most rows, most frequent first
most_popular_products = engagement_district_product_df['Product Name'].value_counts().index[:20].tolist()
most_popular_products

# Relation of Engagement and Deaths in USA
In this section we present an analysis of the possible relation between engagement and the COVID-19 figures reported in the USA.

In [None]:
# Import extra libraries
# (interact & co. are already imported at the top of the notebook.)
import datetime
import glob

# Create a list with the path to the daily Covid-19 report for every day of 2020.
# Report files are named MM-DD-YYYY.csv.
COVID_PATH = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us"
numdays = 366
base = datetime.date(2020, 1, 1)
date_list = [base + datetime.timedelta(days=x) for x in range(numdays)]
# strftime zero-pads day and month, replacing the manual padding.
covid_path_list = [f"{COVID_PATH}/{date.strftime('%m-%d-%Y')}.csv" for date in date_list]

# Read only the files from 12 April 2020 (offset 102 from 1 January) to the end of 2020.
# NOTE(review): presumably 12 April is the first daily US report available in this
# folder — the earlier comment said "4th of April (first case)", which does not match
# offset 102. Confirm the intended start date.
first_report_date = datetime.date(2020, 4, 12)
first_report_offset = (first_report_date - base).days
temp = [
    pd.read_csv(path, index_col=None, header=0)
    for path in covid_path_list[first_report_offset:]
]

# Create one table from the extracted daily reports
all_covid_cases_df = pd.concat(temp, ignore_index=True)

In [None]:
# Positional slice: everything from row 4065 onward.
# NOTE(review): the offset 4065 is unexplained and the table is not shown to be
# ordered by date here — confirm what this subset is meant to select.
engagement_district_product_df_1 = engagement_district_product_df.iloc[4065:]
for preview_frame in (all_covid_cases_df, engagement_district_product_df_1, engagement_district_product_df):
    print(preview_frame.head())

In [None]:
print(all_covid_cases_df.columns)
# Separate column "Last_Update", which holds the date and hour of the report, into one
# column with only the date and another with only the hour.
# `n` must be passed by keyword (pandas 2.x rejects it positionally), and n=1 splits on
# the first space only, so the result never has more than the two target columns.
all_covid_cases_df[['time', 'LU_hour']] = all_covid_cases_df['Last_Update'].str.split(' ', n=1, expand=True)

In [None]:
# Function. Graphs reported cases of Covid-19 according to date, and Engagement of a Given product by State during 2020. 
# Two plots are made
def get_covid_graph(Province_State, product_name):
  """Plot confirmed Covid-19 cases and a product's engagement for one state.

  Two figures are shown one after the other: the mean 'Confirmed' value per
  'time' for the state, then the mean engagement_index per 'time' for the
  product in that state.

  Args:
    Province_State: state name, matched against 'Province_State' in the
      Covid table and against 'state' in the engagement table.
    product_name: value matched against the 'Product Name' column.

  Reads the module-level all_covid_cases_df and
  engagement_district_product_df_1; nothing is returned.
  """
  # Both frames are only read, so no `global` declarations are needed.
  covid_rows = all_covid_cases_df.loc[all_covid_cases_df['Province_State'] == Province_State]
  data_c = covid_rows.groupby('time')['Confirmed'].mean()
  plt.figure(figsize=(30,5))
  plt.plot(data_c)
  plt.title(f"Confirmed Covid-19 cases in {Province_State}")
  plt.xlabel("Date")
  plt.ylabel("Confirmed")
  plt.show()

  engagement_rows = engagement_district_product_df_1.loc[
      (engagement_district_product_df_1['Product Name'] == product_name)
      & (engagement_district_product_df_1['state'] == Province_State)
  ]
  data = engagement_rows.groupby('time')['engagement_index'].mean()

  plt.figure(figsize=(30,5))
  plt.plot(data)
  plt.title(f"Mean engagement index: {product_name} in {Province_State}")
  plt.xlabel("Date")
  plt.ylabel("Mean engagement_index")
  plt.show()

In [None]:
# Example of Function. Using Utah as a State and Google Drive as the product to analize engagement.
get_covid_graph('Utah', 'Google Drive')

In [None]:
# Use of Python's interact function to create a small UI to control Province State and Product Name interactively 
interact(get_covid_graph, Province_State=['Connecticut', 'Utah', 'Massachusetts', 'Illinois', 'California', 'Ohio', 'Missouri', 'Indiana', 'Washington', 'Virginia', 'North Carolina', 'New Jersey', 'New Hampshire', 'Michigan', 'District Of Columbia', 'Arizona', 'New York', 'Tennessee', 'Florida'], product_name=['Google Drive','YouTube','Google Classroom']);

# Forecasting of engagement_index


In [None]:
# Forecast based in product_name and state
def forecasting(product_name,state):
  """Fit a SARIMAX model to one product's daily mean engagement in one state.

  Args:
    product_name: value matched against the 'Product Name' column.
    state: value matched against the 'state' column.

  Returns:
    The array produced by predict() on the fitted model, called without
    arguments.
    NOTE(review): with no start/end this looks like in-sample predictions
    rather than a forecast beyond the data — confirm against statsmodels.

  Prints the shape and content of the selected rows and the number of
  predicted values.
  """
  product_rows = engagement_district_product_df[engagement_district_product_df['Product Name'] == product_name]
  state_rows = product_rows[product_rows['state'] == state]
  print(state_rows.shape)
  daily_mean = state_rows.groupby('time')['engagement_index'].mean().to_numpy()
  print(state_rows)
  # fit model: AR(1) with a seasonal AR(1) term of period 12
  fitted = SARIMAX(daily_mean, order=(1, 0, 0), seasonal_order=(1, 0, 0, 12)).fit(disp=False)
  # make prediction
  yhat = fitted.predict()
  print(len(yhat))
  return yhat

In [None]:
predictions = forecasting("Google Docs","Utah")
plt.figure(figsize=(18,5))
plt.plot(predictions)
plt.show()

In [None]:
#Weekend Eradication
def weekendEr(predictions, start_date="2020-01-01"):
  """Remove the Saturday and Sunday entries from a daily series.

  Args:
    predictions: array-like with one value per consecutive calendar day.
    start_date: calendar date of the first value. Defaults to 1 January 2020,
      which assumes the series covers 2020 from its first day.

  Returns:
    DataFrame with the weekday rows only; the original positional index
    labels of the kept rows are preserved.

  The previous implementation dropped the labels 2, 3, 8, 9, 14, 15, ... (a
  6-day stride), which are not weekends, and raised KeyError for any input
  shorter than 364 values. Weekends are now derived from the calendar.
  """
  frame = pd.DataFrame(predictions)
  calendar = pd.date_range(start=start_date, periods=len(frame), freq="D")
  # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
  is_weekday = calendar.dayofweek < 5
  return frame.loc[is_weekday]

In [None]:
#average over consecutive 5-day windows
def med(predictions, window=5):
  """Average a series over consecutive, non-overlapping windows.

  Args:
    predictions: array-like of numeric values.
    window: number of consecutive values per average (default 5).

  Returns:
    Single-column DataFrame with one mean per complete window. Trailing
    values that do not fill a whole window are ignored.

  Raises:
    ValueError: if window is not a positive integer.

  The previous implementation added up only 4 values per group but divided
  by 5, and it hard-coded 365 inputs (IndexError on shorter input).
  """
  if window < 1:
    raise ValueError("window must be a positive integer")
  values = np.asarray(predictions, dtype=float).ravel()
  full_windows = len(values) // window
  # Reshape the complete windows into rows and average each row.
  means = values[:full_windows * window].reshape(full_windows, window).mean(axis=1)
  predictions2 = pd.DataFrame(means)
  return predictions2

In [None]:
#average data every 5 days
predictionsEradication = med(predictions)
# head must be called; print(obj.head) prints the bound method, not the first rows.
print(predictionsEradication.head())
plt.figure(figsize=(18,5))
plt.plot(predictionsEradication)
plt.show()

In [None]:
#mean engagement of two spans of a daily series
def comparison(engagement_comparison, first_span=(0, 152), second_span=(244, 352)):
  """Compare the mean of two spans of a series.

  Args:
    engagement_comparison: pandas Series, intended to hold one value per day
      of the year in calendar order.
    first_span: (start, stop) positions of the first span, stop excluded.
      Default: the first 152 days of the year.
    second_span: (start, stop) positions of the second span, stop excluded.
      Default: 108 days starting at position 244, i.e. after skipping the 92
      days of summer vacation and leaving out the last two weeks of December.

  Returns:
    Single-column DataFrame with two rows: the mean of the first span and the
    mean of the second span.

  Raises:
    IndexError: if the series is shorter than the spans require.
  """
  values = engagement_comparison.to_numpy()
  required = max(first_span[1], second_span[1])
  if len(values) < required:
    raise IndexError(f"series has {len(values)} values, at least {required} are required")
  means = [values[start:stop].mean() for start, stop in (first_span, second_span)]
  comp = pd.DataFrame(means)
  return comp



In [None]:
#comparison of engagement in the first half of the year and in the last half of the year, taking out the vacations
# comparison() indexes its input by day position, so it needs one value per date.
# The raw engagement_index column has one value per table row, so aggregate it to a
# daily mean first (groupby returns the dates in ascending order).
daily_engagement = engagement_district_product_df.groupby('time')['engagement_index'].mean()
comp=comparison(daily_engagement)
print(comp.shape)
# head must be called; print(obj.head) prints the bound method, not the first rows.
print(comp.head())

From the two averages above we can infer that the mean engagement_index in the first semester of 2020 was greater than in the second semester of 2020.