# ANALYSIS LEARNPLATFORM COVID-19 IMPACT

In [None]:
import numpy as np 
import pandas as pd 
import glob 
import os 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.graph_objects as go

import folium
from folium import plugins
from geopy.geocoders import Nominatim
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster, FastMarkerCluster
from wordcloud import WordCloud, STOPWORDS 

In [None]:
# Load the district metadata and the product catalogue from the Kaggle input directory.
districts_data = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')
products_data = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")

In [None]:
# Build one engagement table out of the per-district CSV files. Every file is
# tagged with a ``district_id`` column taken from its own file name.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
all_files = glob.glob(os.path.join(path, "*.csv"))

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Take the id from the file name itself instead of a fixed position in the
    # path, so the cell keeps working if the directory depth changes.
    district_id = os.path.splitext(os.path.basename(filename))[0]
    df["district_id"] = district_id
    li.append(df)

# ignore_index renumbers the rows 0..n-1 across all files in one step.
engagement_data = pd.concat(li, ignore_index=True)
engagement_data.head()

In [None]:
# Report the table dimensions, then the per-column dtype / non-null summary.
print(f'Shape of districts info dataset is: {districts_data.shape}')
districts_data.info()

# DISTRICTS DATA

The district file includes information about the characteristics of school districts, including data from NCES (2018-19), FCC (Dec 2018), and Edunomics Lab. In this data set, we removed the identifiable information about the school districts.

#### **Q**: What does the data look like for the districts info dataset?

In [None]:
# Preview the first ten district rows.
districts_data.head(10)

#### **Q**: What are the descriptive statistics for the districts info dataset?

In [None]:
# include='all' summarises the non-numeric columns as well as the numeric ones.
districts_data.describe(include='all')

* There are 23 **state**(s) for which data points are available.
* Connecticut **state** has the highest frequency of occurrence.
* Suburb **locale** has the highest frequency.
* There are 4 **locale** types in the data.

#### **Q**: Are there any missing values in the dataset?

In [None]:
# Count the missing values in each column.
districts_data.isna().sum()

#### **Q**: What is the distribution count for locale?

In [None]:
# Number of districts per locale type.
districts_data['locale'].value_counts()

In [None]:
# Pie chart of how many districts fall into each locale type.
locale_counts = districts_data['locale'].value_counts()
locale_counts.plot(
    kind='pie',
    explode=[0.1, 0.1, 0.1, 0.1],
    fontsize=12,
    autopct='%3.1f%%',
    figsize=(5, 5),
    startangle=135,
    legend=False,
    colors=['#ff760d', '#ffa159', '#ffdbbf', '#ff9d5c'],
)
plt.ylabel('')
plt.xlabel('Most Locale', size=15)

Most of the districts in the data are located in Suburb locales. Suburban areas are lower-density areas that separate residential and commercial areas from one another. They are either part of a city or urban area, or exist as a separate residential community within commuting distance of a city.

#### **Q**: What is the distribution count for state?

In [None]:
# Number of districts per state.
districts_data['state'].value_counts()

In [None]:
# Horizontal bar chart of district counts per state, most frequent state first.
state_order = districts_data['state'].value_counts().index
plt.figure(figsize=(16, 10))
sns.countplot(
    data=districts_data,
    y="state",
    order=state_order,
    palette="Oranges",
    linewidth=3,
)
plt.title("State Distribution", font="Serif", size=20)
plt.show()

Connecticut has the largest district representation, with 30 districts in the dataset, closely followed by Utah. A great number of the education institutions are located in the suburbs, **but does this result in better education? How do they compare to the other locales?**

#### **Q**: What is the distribution between state and locale?

In [None]:
# District counts per state, split into one bar per locale type.
plt.figure(figsize=(20, 10))
ax = sns.countplot(x='state', hue='locale', data=districts_data, palette='Oranges')
plt.xticks(rotation=90)
plt.title("State and its Locality")
# Write each bar's height slightly above its top, starting at the bar's
# horizontal centre.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, bar_height + 0.75, bar_height, fontsize=11)

From the visualization above, most districts in Connecticut are Suburb, followed by Rural and City. The least data came from Minnesota, Arizona, Florida, and North Dakota.
* There are 23 states and 4 locale types
* Connecticut has the highest frequency
* Suburb is the most frequent locale type, while Town is the least frequent

#### Let's find out about pct_black/hispanic, pct_free/reduced and pp_total_raw

In [None]:
# Display the full districts table (the notebook truncates long output).
districts_data

A quick intro to interval notation:

**$]a,b[ \;:=\; \{x : a < x < b\}$ : open real interval**

**$[a,b[ \;:=\; \{x : a \le x < b\}$ : half-open on the right**

**$]a,b] \;:=\; \{x : a < x \le b\}$ : half-open on the left**

Let's convert the data into float type

In [None]:
# Work on a separate frame that keeps only rows with no missing values.
districts_copy = districts_data.copy().dropna()

# Each value is an interval string: keep the number that follows the opening
# bracket (the text before the first comma) and shift it by 0.1.
for pct_col in ('pct_black/hispanic', 'pct_free/reduced'):
    districts_copy[pct_col] = districts_copy[pct_col].apply(
        lambda text: float(text.split(',')[0][1:]) + 0.1
    )

# The old row labels are kept as an 'index' column here.
districts_copy = districts_copy.reset_index()

In [None]:
# Discard the 'index' column and show the cleaned frame.
districts_copy = districts_copy.drop(columns='index')
districts_copy

In [None]:
# Parse the remaining interval columns the same way: take the number after the
# opening bracket and add a fixed per-column offset.
interval_offsets = {'pp_total_raw': 1000, 'county_connections_ratio': 0.1}
for interval_col, offset in interval_offsets.items():
    districts_copy[interval_col] = districts_copy[interval_col].apply(
        lambda text, offset=offset: float(text.split(',')[0][1:]) + offset
    )

districts_copy

In [None]:
# Mean of the three numeric district attributes for every (state, locale) pair.
# The aggregation is named by string: passing np.mean to agg() is deprecated in
# recent pandas releases and emits a FutureWarning.
state_locale_df = (
    districts_copy
    .groupby(['state', 'locale'])
    .agg({
        'pct_black/hispanic': 'mean',
        'pct_free/reduced': 'mean',
        'pp_total_raw': 'mean',
    })
    .reset_index()
)
state_locale_df.head(10)

In [None]:
# Per-state mean of the three numeric district attributes. The aggregation is
# named by string because passing np.mean to agg() is deprecated in recent pandas.
state_pct = districts_copy.groupby('state').agg({
    'pct_black/hispanic': 'mean',
    'pct_free/reduced': 'mean',
    'pp_total_raw': 'mean',
})

In [None]:
# Turn the 'state' group key back into an ordinary column for plotting.
state_pct=state_pct.reset_index()

#### Visualization of pct_free/reduced, pct_black/hispanic and pp_total_raw by state

In [None]:
# Horizontal bars: state-level mean of pct_free/reduced, one bar per state.
plt.figure(figsize=(15, 8))
a = sns.barplot(data=state_pct, y='state', x='pct_free/reduced', palette='Oranges')
plt.yticks(fontsize=14, color='#283655')
# States run along the y-axis and the metric along the x-axis, so each label
# is placed on the axis it actually describes.
plt.ylabel('State')
plt.xlabel('Pct_free/Reduced fee')

# Keep only a slightly thicker left spine to frame the bars.
a.spines['left'].set_linewidth(1.5)
for w in ['right', 'top', 'bottom']:
    a.spines[w].set_visible(False)

plt.show()

According to the chart above, Minnesota has the highest average percentage of students eligible for free or reduced-price lunch.

In [None]:
# Horizontal bars: state-level mean of pct_black/hispanic, one bar per state.
plt.figure(figsize=(15, 8))
a = sns.barplot(data=state_pct, y='state', x='pct_black/hispanic', palette='Oranges')
plt.yticks(fontsize=14, color='#283655')
# States run along the y-axis and the metric along the x-axis, so each label
# is placed on the axis it actually describes.
plt.ylabel('State')
plt.xlabel('Pct_black/hispanic')

# Keep only a slightly thicker left spine to frame the bars.
a.spines['left'].set_linewidth(1.5)
for w in ['right', 'top', 'bottom']:
    a.spines[w].set_visible(False)

plt.show()

According to the chart above, Texas has the highest average percentage of students identified as Black or Hispanic.

In [None]:
# catplot is a figure-level function: it creates its own figure, sized through
# height/aspect. A preceding plt.figure() call would only leave an extra, empty
# figure in the output, so none is created here.
sns.catplot(
    y="pp_total_raw",
    x="state",
    hue="locale",
    palette="rocket",
    data=state_locale_df,
    kind="bar",
    height=7,
    aspect=2,
)
plt.xticks(rotation=60);
plt.title("Distribution of state percentage PP Total Raw")

The chart above shows the distribution of per-pupil total expenditure (pp_total_raw) by state and locale. Per-pupil total expenditure is the sum of local and federal expenditure, taken from Edunomics Lab's National Education Resource Database on Schools project.

# PRODUCT DATA

The product file "products_info.csv" includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by our team. Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

#### **Q**: What does the product data look like?

In [None]:
# include='all' summarises the text columns as well as the numeric ones.
products_data.describe(include='all')

#### **Q**: What are the top 10 providers/companies of learning platforms during Covid-19?

In [None]:
# Count products per provider and plot the ten providers with the most products.
top_providers = products_data["Provider/Company Name"].value_counts().index[:10]
plt.figure(figsize=(16, 10))
sns.countplot(
    data=products_data,
    y='Provider/Company Name',
    order=top_providers,
    palette='coolwarm',
)
plt.title("Top 10 Provider/Company Names", font="Serif", size=20)
plt.show()

Google seems to have the highest share of products used, with 30 of them; Microsoft and Houghton Mifflin Harcourt are tied for second with 6 products each.

#### **Q**: What is the sector distribution?

In [None]:
# Tally how many products list each sector. A product may name several
# sectors separated by ';', and every listed sector is counted.
labels = ['PreK-12', 'Higher Ed', 'Corporate']
sector_tally = dict.fromkeys(labels, 0)
for entry in products_data["Sector(s)"]:
    if pd.isnull(entry):
        continue
    for piece in entry.split(";"):
        sector = piece.strip()
        if sector in sector_tally:
            sector_tally[sector] += 1
c1, c2, c3 = (sector_tally[sector] for sector in labels)

# Donut chart: a pie with a white circle drawn over its centre.
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Sector Distribution', size=30, font="Serif")
explode = (0.05, 0.05, 0.05)
sizes = [c1, c2, c3]
ax.pie(
    sizes,
    explode=explode,
    startangle=60,
    labels=labels,
    autopct='%1.2f%%',
    pctdistance=0.7,
    colors=["#ff8303", "#fedebe", "#ffaf42"],
)
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()

Most products belong to the PreK-12 sector. This means that most of the education services in this dataset are aimed at students from pre-kindergarten to 12th grade.

#### **Q**: What is the most common primary essential main function, and what is its distribution?

In [None]:
# Split "Primary Essential Function" at its first hyphen into a main part
# (text before the hyphen) and a sub part (text after it). Missing values stay
# NaN in both new columns.
primary_essential_main = []
primary_essential_sub = []
for s in products_data["Primary Essential Function"]:
    if pd.isnull(s):
        primary_essential_main.append(np.nan)
        primary_essential_sub.append(np.nan)
        continue

    main_part, hyphen, sub_part = s.partition("-")
    primary_essential_main.append(main_part.strip())
    # A value without a hyphen has no sub part; record NaN instead of failing
    # with an IndexError.
    primary_essential_sub.append(sub_part.strip() if hyphen else np.nan)

products_data["primary_essential_main"] = primary_essential_main
products_data["primary_essential_sub"] = primary_essential_sub

In [None]:
# Count occurrences of each main-function code; a value that mentions several
# codes contributes to each of them.
main_functions = [s for s in products_data["primary_essential_main"] if not pd.isnull(s)]
c1 = sum(code.count("CM") for code in main_functions)
c2 = sum(code.count("LC") for code in main_functions)
c3 = sum(code.count("SDO") for code in main_functions)

# Donut chart: a pie with a white circle drawn over its centre.
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Primary Essential Function', size=20, font="Serif")
explode = (0.05, 0.05, 0.05)
labels = ['CM', 'LC', 'SDO']
sizes = [c1, c2, c3]
ax.pie(
    sizes,
    explode=explode,
    startangle=60,
    labels=labels,
    autopct='%1.2f%%',
    pctdistance=0.7,
    colors=["#fedebe", "#ff8303", "#ffaf42"],
)
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()

The most common primary essential main function is LC, which stands for Learning & Curriculum.

#### **Q**: What are the sub-functions of the platforms, and how many products have each one?

In [None]:
# Product counts per sub function, most common first.
sub_function_order = products_data["primary_essential_sub"].value_counts().index
plt.figure(figsize=(16, 20))
sns.countplot(
    data=products_data,
    y='primary_essential_sub',
    order=sub_function_order,
    palette="Oranges",
)
plt.title("Primary Essential Function(Sub)", font="Serif", size=20)
plt.show()

Primary Essential sub function was mostly for Digital Learning Platform

# ENGAGEMENT DATA

In [None]:
# Summary statistics for every engagement column, numeric or not.
engagement_data.describe(include='all')

The engagement data are aggregated at school district level, and each file in the folder engagement_data represents data from one school district. The 4-digit file name represents district_id, which can be used to link to district information in districts_info.csv. The lp_id can be used to link to product information in products_info.csv. In this section, we will analyse the engagement data.

In [None]:
# Remove rows that contain a missing value, modifying the frame in place.
engagement_data.dropna(inplace=True)

In [None]:
# Renumber the rows from 0 and discard the previous row labels.
engagement_data.reset_index(drop=True,inplace=True)

In [None]:
# Parse 'time' into datetimes so the .dt accessors can be used on it.
engagement_data['time']=pd.to_datetime(engagement_data['time'])

In [None]:
# Derive calendar parts from the timestamp: month number, day of the month
# (stored as 'date') and weekday number (stored as 'date_week').
timestamp_parts = engagement_data['time'].dt
engagement_data['month'] = timestamp_parts.month
engagement_data['date'] = timestamp_parts.day
engagement_data['date_week'] = timestamp_parts.weekday
engagement_data

In [None]:
# Mean engagement_index and pct_access for each (day of month, month, weekday)
# combination. The aggregation is named by string because passing np.mean to
# agg() is deprecated in recent pandas releases.
date_agg = engagement_data.groupby(['date', 'month', 'date_week']).agg({
    'engagement_index': 'mean',
    'pct_access': 'mean',
})
date_agg

In [None]:
# Turn the group keys back into ordinary columns for plotting.
date_agg=date_agg.reset_index()

In [None]:
# Line chart of pct_access against month, drawn from the aggregated rows.
plt.figure(figsize=(15, 11))
sns.lineplot(data=date_agg, x="month", y="pct_access")
plt.title("Average access per month", size=20)

The chart above shows the average access percentage (pct_access) on the platform for each month. From June to July there was a significant decrease; one probable cause is the summer holiday.

In [None]:
# Line chart of engagement_index against month, drawn from the aggregated rows.
# No palette is passed: without a hue variable there is nothing for it to
# colour, and seaborn ignores it with a warning.
plt.figure(figsize=(15, 11))
sns.lineplot(y=date_agg['engagement_index'], x=date_agg["month"])
plt.title("Average Engagement Index per month")

The chart above shows the average engagement index for each month. From June to July there was a significant decrease; one probable cause is the summer holiday.

# MERGING DATA

In this section, we will merge the datasets to show the engagement index for each US state and visualize it on a map with Folium. We will also visualize engagement by product.

In [None]:
# Repeats the datetime conversion done in the ENGAGEMENT DATA section, so this
# section also works when run on its own.
engagement_data['time'] = pd.to_datetime(engagement_data['time'])

In [None]:
# Attach product details to every engagement row: 'LP ID' on the product side
# is matched with 'lp_id' on the engagement side.
products_engagement_data = products_data.merge(
    engagement_data,
    left_on='LP ID',
    right_on='lp_id',
)
products_engagement_data.head()

In [None]:
#get the product name data base on average engagement index and sort it
top_p=products_engagement_data.groupby(['Product Name', 'Primary Essential Function'], as_index=False)['engagement_index'].mean()
top_p=top_p.sort_values(by=['engagement_index'],ascending=False)

In [None]:
# Bar chart of the ten rows of top_p with the highest mean engagement index.
plt.figure(figsize=(16, 8))
sns.barplot(data=top_p.head(10), x="Product Name", y="engagement_index", palette='Oranges')
plt.title('Top 10 LearnPlatform Product in 2020 (All District)', size=15)
# The x-axis shows product names, so it is labelled as such.
plt.xlabel('Product Name', size=12)
plt.ylabel('Engagement index', size=12)
sns.despine()
plt.show()

From the chart above, Google docs is the most used platform in US. Google Docs is an online word processor included as part of the free, web-based Google Docs Editors suite offered by Google, which also includes Google Sheets, Google Slides, Google Drawings, Google Forms, Google Sites, and Google Keep. Google Docs is accessible via an internet browser as a web-based application and is also available as a mobile app on Android and iOS and as a desktop application on Google's Chrome OS.

In [None]:
# Mean engagement index per primary essential function, highest first.
top_c = products_engagement_data.groupby(['Primary Essential Function'], as_index=False)['engagement_index'].mean()
top_c = top_c.sort_values(by=['engagement_index'], ascending=False)

# One constant drives both the slice and the title so the two cannot disagree.
top_n = 10

plt.figure(figsize=(16, 8))
sns.barplot(data=top_c[:top_n], y="Primary Essential Function", x="engagement_index", palette="Oranges")
plt.title(f'Top {top_n} Category Platform in 2020 (All District)', size=15)
sns.despine()
plt.show()

As we can see from the chart above, SDO - Learning Management System(LMS) is the most used category in US and followed by LC - Online Course Providers & Technical Skills Development

In [None]:
# Cast district_id to int before joining on it; the join key has the same
# name in both frames.
district_ids = engagement_data["district_id"].astype(str)
engagement_data["district_id"] = district_ids.astype(int)
districts_engagement_data = districts_data.merge(engagement_data, on='district_id')
districts_engagement_data.head()

In [None]:
# Mean engagement index per state. The aggregation is named by string because
# passing np.mean to agg() is deprecated in recent pandas releases, and the
# result is already a DataFrame, so no extra wrapping is needed.
locations = (
    districts_engagement_data
    .groupby('state')
    .agg({'engagement_index': 'mean'})
    .reset_index()
)
locations.head()

In [None]:
# Look up a coordinate for every state name through Nominatim.
geolocator = Nominatim(user_agent="app")
loc, lat, lon = [], [], []
for state_name in locations['state']:
    match = geolocator.geocode(state_name)
    loc.append(match)
    # A None result is recorded as NaN so the three lists stay aligned with
    # the rows of `locations`.
    lat.append(np.nan if match is None else match.latitude)
    lon.append(np.nan if match is None else match.longitude)

In [None]:
# Attach the geocoded coordinates; both lists are in the same order as the rows.
locations = locations.assign(lat=lat, lon=lon)

In [None]:
# Check that the coordinate columns were attached.
locations.head()

In [None]:
def generateBaseMap(default_location=(37.0902, -95.7129), default_zoom_start=4):
    """Create an empty folium map to draw layers on.

    Args:
        default_location: (latitude, longitude) the map is centred on.
            The default is the centre this notebook uses for its US maps.
        default_zoom_start: Initial zoom level passed to ``folium.Map``.

    Returns:
        A new ``folium.Map`` with no layers added.
    """
    # The default is a tuple so no mutable object is shared between calls;
    # folium receives a fresh list each time.
    base_map = folium.Map(location=list(default_location), zoom_start=default_zoom_start)
    return base_map

In [None]:
# Map instance that the heatmap and marker layers below are added to.
basemap=generateBaseMap()

In [None]:
# States that could not be geocoded have NaN coordinates, and HeatMap does not
# accept NaN in its data, so only complete rows are passed on.
heat_points = locations[['lat', 'lon', 'engagement_index']].dropna()
# NOTE(review): `zoom` is kept from the original call, but it does not look
# like a documented HeatMap argument - confirm whether `max_zoom` was intended.
HeatMap(heat_points, zoom=20, radius=20).add_to(basemap)

In [None]:
# One clustered marker per state, with the state's mean engagement index as
# the popup text.
average_engagement = plugins.MarkerCluster().add_to(basemap)

# Skip states without coordinates: a marker cannot be placed at NaN.
located = locations.dropna(subset=['lat', 'lon'])

# Loop names differ from the `lat` / `lon` lists built during geocoding so
# those lists are not overwritten by this loop.
for marker_lat, marker_lon, label in zip(located.lat, located.lon, located.engagement_index):
    folium.Marker(
        location=[marker_lat, marker_lon],
        icon=None,
        popup=str(label),  # popup content is text; convert the number explicitly
    ).add_to(average_engagement)

basemap

The visualization above shows the markers and the heatmap of the average engagement index for each US state.