# COVID-19 Impact on Digital Learning

![Source: Indian Express](https://images.indianexpress.com/2021/02/teacher5.jpg)


# Import the Python libraries

In [None]:
import pandas as pd
import numpy as np  
import seaborn as sns 
pal = sns.color_palette("pastel")

import plotly
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
import glob
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

from geopy.geocoders import Nominatim
import folium
from folium.plugins import HeatMap
from folium.plugins import FastMarkerCluster
from plotly import tools
import re
from plotly.offline import init_notebook_mode, plot, iplot
from wordcloud import WordCloud, STOPWORDS 
from warnings import filterwarnings
filterwarnings('ignore')
import missingno as msno
import glob

import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage


# Load the data

In [None]:
from pathlib import Path  # local import: only this cell needs it

# District- and product-level lookup tables.
districts_data = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
products_data = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")

# One engagement CSV per district; the district id is carried by the file name.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Read the id from the file name itself instead of a fixed position in the
    # path, so the cell keeps working if the directory depth or the path
    # separator changes.
    district_id = Path(filename).stem
    df["district_id"] = district_id
    li.append(df)

if not li:
    # pd.concat([]) fails with a generic message; point at the real cause instead.
    raise FileNotFoundError("No engagement CSV files found in " + path)

# ignore_index=True rebuilds a clean 0..n-1 index across all files.
engagement_data = pd.concat(li, ignore_index=True)

# Districts data
As we look at the basic details of the districts data, we find that besides details on state and location, we also get details on the population of minorities (Black/Hispanic), fee reduction, and population.

We also note that missing values exist. 

In [None]:
# Preview the first rows of the districts table.
display("Districts data", districts_data.head())
# DataFrame.info() prints its summary and returns None, so call it directly;
# wrapping it in display() would additionally render a stray "None".
districts_data.info()

# Geo-Analysis: Clustering the school districts in the US Map

We observe that most schools in the analysis are in the North Eastern and Midwestern parts of the US
(around New York and Chicago).
The students are mostly from suburbs, and the data spans many states, with Connecticut and Utah having the most schools.

In [None]:
# Decorative picture that is overlaid on the chart.
student = mpimg.imread('../input/covid-learning-pics/1.jpg')
imagebox = OffsetImage(student, zoom=.5)
xy = (0.5, 0.5)
ab = AnnotationBbox(imagebox, xy, frameon=False, pad=1, xybox=(2, 60))

plt.figure(figsize=(16, 6))
# Name the column through keywords: recent seaborn releases treat the first
# positional argument of countplot as `data`, not as `x`, so passing the
# Series positionally no longer counts its categories.
ax = sns.countplot(x="locale", data=districts_data)
ax.add_artist(ab)


plt.title("The dataset is primarily from Suburbs", fontsize=16)
plt.xticks(rotation=90, fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("Frequency", fontsize=14)
# Blank x label; the trailing semicolon hides the returned Text object's repr.
plt.xlabel("");

In [None]:
# States ordered from most to fewest districts, used to sort the bars.
state_order = districts_data.state.value_counts().index

plt.figure(figsize=(16, 10))
ax = sns.countplot(
    y="state",
    data=districts_data,
    order=state_order,
    palette="Blues",
    linewidth=3,
)

# Decorative picture drawn on top of the bar chart.
student = mpimg.imread('../input/covid-learning-pics/0.jpg')
imagebox = OffsetImage(student, zoom=1)
xy = (0.5, 0.5)
ab = AnnotationBbox(imagebox, xy, frameon=False, pad=1, xybox=(20, 15))
ax.add_artist(ab)

plt.title(
    "The states with the most schools are Connecticut and Utah",
    font="Serif",
    size=20,
)
plt.show()

# Distribution of schools in states by locale

In [None]:
# Distribution of districts over states, coloured by locale.
sns.displot(
    data=districts_data,
    x='state',
    hue='locale',
    height=8,
    aspect=3,
)

In [None]:
from geopy.extra.rate_limiter import RateLimiter  # local import: only this cell needs it

# Unique state names to geocode. Missing states are dropped first: they cannot
# be geocoded, and value_counts() below ignores them as well.
locations = pd.DataFrame({"Name": districts_data['state'].dropna().unique()})
geolocator = Nominatim(user_agent="app")
# Space the requests at least one second apart so the public Nominatim service
# is not hit with a burst of lookups.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

lat = []
lon = []
for state_name in locations['Name']:
    # Separate name for the result so the loop variable is not overwritten.
    match = geocode(state_name)
    if match is None:
        # No geocoding result: keep the row aligned, mark coordinates missing.
        lat.append(np.nan)
        lon.append(np.nan)
    else:
        lat.append(match.latitude)
        lon.append(match.longitude)

locations['lat'] = lat
locations['lon'] = lon

# Number of districts per state, joined with the coordinates found above.
Rest_locations = pd.DataFrame(districts_data['state'].value_counts().reset_index())

Rest_locations.columns = ['Name', 'count']
# dropna() removes states for which no coordinates were found.
final_loc = Rest_locations.merge(locations, on='Name', how="left").dropna()
# A bare expression in the middle of a cell is not rendered, so show the
# styled table explicitly.
display(final_loc.head(15).style.set_caption("Locations Dataframe").set_properties(**{'background-color': 'peachpuff',
                           'color': 'midnightblue','border': '1.5px solid black'}))

def generateBaseMap(default_location=(37.0902, -95.7129), default_zoom_start=4):
    """Create an empty folium map to which layers can be added.

    Parameters
    ----------
    default_location : sequence of two floats
        Latitude/longitude the map is centred on. The default is an immutable
        tuple so it cannot be modified between calls.
        NOTE(review): the default presumably points at the middle of the
        contiguous US - confirm.
    default_zoom_start : int
        Initial zoom level passed to ``folium.Map``.

    Returns
    -------
    folium.Map
        The newly created map.
    """
    return folium.Map(location=default_location, zoom_start=default_zoom_start)

# Start from an empty map with the default centre and zoom.
basemap = generateBaseMap()

# One [lat, lon, count] row per state feeds the marker cluster layer.
marker_rows = final_loc[['lat', 'lon', 'count']].values.tolist()
cluster_layer = FastMarkerCluster(data=marker_rows)
cluster_layer.add_to(basemap)

# Last expression of the cell: the notebook renders the map.
basemap

# Most states have a low percentage of students in the districts identified as Black or Hispanic

In [None]:
# Split each "pct_black/hispanic" entry at the first comma into two parts.
# NOTE(review): the values look like bracketed ranges (e.g. "[0.2, 0.4["), so
# 'pct_black' / 'pct_hispanic' below would hold the lower / upper bound of one
# combined range rather than separate shares - confirm against the data
# dictionary. Column names are kept unchanged for compatibility.
pct_black_hispanic = districts_data['pct_black/hispanic'].str.split(",", n=1, expand=True)
# '[' on its own is not a valid regular expression, so remove it as literal
# text (regex=False); strip() drops the blank left over after the comma.
districts_data['pct_black'] = pct_black_hispanic[0].str.replace('[', '', regex=False).str.strip()
districts_data['pct_hispanic'] = pct_black_hispanic[1].str.replace('[', '', regex=False).str.strip()
districts_data['pct_black'] = pd.to_numeric(districts_data['pct_black'])
districts_data['pct_hispanic'] = pd.to_numeric(districts_data['pct_hispanic'])
# Combine both parts into one number per district by taking their mean.
districts_data['pct_black_and_hispanic'] = (districts_data['pct_black'] + districts_data['pct_hispanic']) / 2
sns.displot(data=districts_data, x="pct_black_and_hispanic", hue='state', height=8, aspect=.8)

# Great to see States offer free/reduced fee schools

In [None]:

# Categories ordered from most to least frequent, used to sort the bars.
free_reduced_order = districts_data['pct_free/reduced'].value_counts().index

plt.figure(figsize=(16, 12))
ax = sns.countplot(
    x='pct_free/reduced',
    data=districts_data,
    order=free_reduced_order,
)

# Decorative picture drawn on top of the bar chart.
student = mpimg.imread('../input/covid-learning-pics/3.jpg')
imagebox = OffsetImage(student, zoom=.6)
xy = (0.5, 0.5)
ab = AnnotationBbox(imagebox, xy, frameon=False, pad=1, xybox=(3.5, 30))
ax.add_artist(ab)

plt.title(
    "Some states offer over 60% students eligible for free/reduced fees",
    font="Serif",
    size=20,
)
plt.show()

In [None]:
# Split each "pct_free/reduced" entry at the first comma into two parts.
# NOTE(review): the values look like bracketed ranges (e.g. "[0.2, 0.4["), so
# 'pct_free' / 'pct_reduced' below would hold the lower / upper bound of one
# combined range rather than separate shares - confirm against the data
# dictionary. Column names are kept unchanged for compatibility.
pct_free_reduced = districts_data['pct_free/reduced'].str.split(",", n=1, expand=True)
# '[' on its own is not a valid regular expression, so remove it as literal
# text (regex=False); strip() drops the blank left over after the comma.
districts_data['pct_free'] = pct_free_reduced[0].str.replace('[', '', regex=False).str.strip()
districts_data['pct_reduced'] = pct_free_reduced[1].str.replace('[', '', regex=False).str.strip()
districts_data['pct_free'] = pd.to_numeric(districts_data['pct_free'])
districts_data['pct_reduced'] = pd.to_numeric(districts_data['pct_reduced'])
# Combine both parts into one number per district by taking their mean.
districts_data['pct_free_and_reduced'] = (districts_data['pct_free'] + districts_data['pct_reduced']) / 2

sns.displot(data=districts_data, x="pct_free_and_reduced", hue='state', height=8, aspect=.8)

# Products data

As we look at the basic details of the Products data, we find details on the products used, their providers, sectors and primary functions.

We also note that missing values exist. 

As most of the data have a lot of unique categories, I have used plotly so that the charts are dynamic and can be zoomed in/out as per the viewer's preference.

In [None]:
# Preview the first rows of the products table.
display("Products data", products_data.head())

# DataFrame.info() prints its summary and returns None, so call it directly;
# wrapping it in display() would additionally render a stray "None".
products_data.info()

In [None]:
# Number of products per sector, most frequent first.
sectors = products_data['Sector(s)'].value_counts()

sector_bars = go.Bar(x=sectors.index, y=sectors.values)
tight_margin = go.layout.Margin(l=0, r=0, b=10, t=50)
fig = go.Figure(data=[sector_bars], layout=go.Layout(margin=tight_margin))

# Add the headline and render the interactive chart.
fig.update_layout(title='PreK-12 is the largest Sector')
fig.show()

In [None]:
# Number of products per primary essential function, most frequent first.
prim_fn = products_data['Primary Essential Function'].value_counts()

function_bars = go.Bar(x=prim_fn.index, y=prim_fn.values)
tight_margin = go.layout.Margin(l=0, r=0, b=10, t=50)
fig = go.Figure(data=[function_bars], layout=go.Layout(margin=tight_margin))

# Add the headline and render the interactive chart.
fig.update_layout(title='Digital Learning Platforms are the primary essential function')
fig.show()

In [None]:
# Number of products per provider, most frequent first.
provider = products_data['Provider/Company Name'].value_counts()

provider_bars = go.Bar(x=provider.index, y=provider.values)
tight_margin = go.layout.Margin(l=0, r=0, b=10, t=50)
fig = go.Figure(data=[provider_bars], layout=go.Layout(margin=tight_margin))

# Add the headline and render the interactive chart.
fig.update_layout(title='Google is the largest provider, followed by Houghton Mifflin Harcourt and Microsoft')
fig.show()

# Engagement data

In [None]:
# Preview the first rows of the combined engagement table.
display("Engagement data",engagement_data.head())

# DataFrame.info() prints its summary and returns None, so call it directly;
# wrapping it in display() would additionally render a stray "None".
engagement_data.info()

In [None]:
def plot_time_series(df, col1, col2, col3, top_n=5):
    """Plot ``col2`` over ``col3`` for the ``col1`` groups with the highest mean ``col2``.

    The groups of ``col1`` are ranked by their mean ``col2``; the ``top_n``
    highest are kept and drawn as one stacked line-chart facet each.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data containing the three named columns.
    col1 : str
        Grouping column; each selected group gets its own facet.
    col2 : str
        Numeric column that is ranked and plotted.
    col3 : str
        Column used as the x axis (index of the pivoted table).
    top_n : int, default 5
        Number of highest-ranked groups to plot.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    # Rank the groups by their mean value and keep the names of the best ones.
    top_groups = (
        df.groupby(col1)[col2]
        .mean()
        .sort_values(ascending=False)
        .index[:top_n]
        .tolist()
    )

    # Restrict to the selected groups and reshape to one column per group;
    # pivot_table averages rows that share the same (col3, col1) pair.
    selected = df.loc[df[col1].isin(top_groups), [col3, col1, col2]]
    wide = selected.pivot_table(index=col3, columns=col1, values=col2)

    fig = px.line(wide, facet_col=col1, facet_col_wrap=1, width=800, height=800)
    fig.update_layout(
        title=(col1 + " , " + col2 + " , " + col3).title(),
        title_x=0.39,
        template="plotly",
        paper_bgcolor='#f5f7f7',
        font={'family': 'Serif', 'size': 15},
    )
    fig.show()
    
# Cast the district ids to integers before joining.
# NOTE(review): presumably needed to match the dtype of
# districts_data["district_id"] - confirm.
district_ids_as_text = engagement_data["district_id"].astype(str)
engagement_data["district_id"] = district_ids_as_text.astype(int)

# Inner join of district attributes with their engagement rows.
districts_engagement_data = pd.merge(
    districts_data,
    engagement_data,
    on='district_id',
)

plot_time_series(districts_engagement_data, "state", "engagement_index", "time")



**work in progress -- will add more in a few days!**

# Sources and References

* https://www.kaggle.com/muhammadimran112233/covid-19-impact-on-digital-learning
* https://www.kaggle.com/pranjalverma08/exploring-impact-of-covid-19-on-digital-learning
* https://www.kaggle.com/ruchi798/covid-19-impact-on-digital-learning-eda-w-b
* https://www.kaggle.com/girishkumarsahu/learnplatform-covid-19-impact
* https://www.kaggle.com/saurabhbagchi/covid-19-digital-learning-impact
