# <center><img src="https://www.fulcrumlabs.ai/wp-content/uploads/2018/04/learning-data-usability.gif"></center>

<div>  
    <h3 style="color:Gold;"><strong>This notebook will analyze the digital learning data based on Learning Analytics skills.</strong></h3>
    <p style="background-color:Gold; color:Ivory;">ABOUT THE DATA:</p>
    <p style="background-color:Gold; color:Ivory;">This data was collected from over 200 school districts in 2020 and includes three basic sets of files to help you get started:</p>
    <p style="background-color:Gold; color:Ivory;">The engagement_data folder is based on LearnPlatform’s Student Chrome Extension. The extension collects page load events of over 10K education technology products in our product library, including websites, apps, web apps, software programs, extensions, ebooks, hardware, and services used in educational institutions. The engagement data have been aggregated at school district level, and each file represents data from one school district.</p>
    <p style="background-color:Gold; color:Ivory;">The products_info.csv file includes information about the characteristics of the top 372 products with most users in 2020. </p>
    <p style="background-color:Gold; color:Ivory;">The districts_info.csv file includes information about the characteristics of school districts, including data from NCES and FCC. </p>
    <p style="background-color:Gold; color:Ivory;">The definitions of each column in the three data sets are detailed in the README file.</p>
    <i></i>
</div>

In [None]:
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import glob
import geopandas as gpd
import plotly.graph_objects as go

In [None]:
# Read the district metadata table and preview its first rows.
districts_csv = "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
di = pd.read_csv(districts_csv)
di.head()

In [None]:
# Read the product metadata table and preview its first rows.
products_csv = "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
pr = pd.read_csv(products_csv)
pr.head()

<div>  
<h2 style="color:Gold;"><center><strong>Exploratory Data Analysis</strong></center></h2>
        
</div>

**About districts_info:**

In [None]:
# Print column names, dtypes and non-null counts for the districts frame.
di.info()

In [None]:
# Count missing values in each column of the districts frame.
di.isnull().sum()

In [None]:
# Non-null rows of `di` per state, largest count first.
(
    di.groupby('state')['state']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Outlined horizontal bars counting rows of `di` per state, most frequent first.
state_order = di["state"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    y="state",
    data=di,
    order=state_order,
    color="Gold",
    linewidth=2,
    facecolor=(0, 0, 0, 0),  # fully transparent fill: only the bar outlines show
    edgecolor=sns.color_palette("BrBG", 2),
    ax=ax,
)
ax.set_title("State Distribution", size=15)
ax.set_xlabel("Count", size=12)
ax.set_ylabel("State", size=12)
plt.show()

In [None]:
# Non-null rows of `di` per locale, largest count first.
(
    di.groupby('locale')['locale']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Outlined horizontal bars counting rows of `di` per locale, most frequent first.
locale_order = di["locale"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    y="locale",
    data=di,
    order=locale_order,
    color="Gold",
    linewidth=2,
    facecolor=(0, 0, 0, 0),  # fully transparent fill: only the bar outlines show
    edgecolor=sns.color_palette("BrBG", 2),
    ax=ax,
)
ax.set_title("Locale Distribution", size=15)
ax.set_xlabel("Count", size=12)
ax.set_ylabel("Locale", size=12)
plt.show()

In [None]:
# Non-null rows of `di` per pct_black/hispanic value, largest count first.
(
    di.groupby('pct_black/hispanic')['pct_black/hispanic']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Outlined horizontal bars counting rows of `di` per pct_black/hispanic value, most frequent first.
black_hispanic_order = di["pct_black/hispanic"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    y="pct_black/hispanic",
    data=di,
    order=black_hispanic_order,
    color="Gold",
    linewidth=2,
    facecolor=(0, 0, 0, 0),  # fully transparent fill: only the bar outlines show
    edgecolor=sns.color_palette("BrBG", 2),
    ax=ax,
)
ax.set_title("Pct_black/hispanic Distribution", size=15)
ax.set_xlabel("Count", size=12)
ax.set_ylabel("Pct_black/hispanic", size=12)
plt.show()

In [None]:
# Non-null rows of `di` per pct_free/reduced value, largest count first.
(
    di.groupby('pct_free/reduced')['pct_free/reduced']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Outlined horizontal bars counting rows of `di` per pct_free/reduced value, most frequent first.
free_reduced_order = di["pct_free/reduced"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    y="pct_free/reduced",
    data=di,
    order=free_reduced_order,
    color="Gold",
    linewidth=2,
    facecolor=(0, 0, 0, 0),  # fully transparent fill: only the bar outlines show
    edgecolor=sns.color_palette("BrBG", 2),
    ax=ax,
)
ax.set_title("Pct_free/reduced Distribution", size=15)
ax.set_xlabel("Count", size=12)
ax.set_ylabel("Pct_free/reduced", size=12)
plt.show()

In [None]:
# Non-null rows of `di` per county_connections_ratio value, largest count first.
(
    di.groupby('county_connections_ratio')['county_connections_ratio']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Outlined horizontal bars counting rows of `di` per county_connections_ratio value, most frequent first.
connections_order = di["county_connections_ratio"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    y="county_connections_ratio",
    data=di,
    order=connections_order,
    color="Gold",
    linewidth=2,
    facecolor=(0, 0, 0, 0),  # fully transparent fill: only the bar outlines show
    edgecolor=sns.color_palette("BrBG", 2),
    ax=ax,
)
ax.set_title("County_connections_ratio Distribution", size=15)
ax.set_xlabel("Count", size=12)
ax.set_ylabel("County_connections_ratio", size=12)
plt.show()

In [None]:
# Non-null rows of `di` per pp_total_raw value, largest count first.
(
    di.groupby('pp_total_raw')['pp_total_raw']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Outlined horizontal bars counting rows of `di` per pp_total_raw value, most frequent first.
pp_total_order = di["pp_total_raw"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    y="pp_total_raw",
    data=di,
    order=pp_total_order,
    color="Gold",
    linewidth=2,
    facecolor=(0, 0, 0, 0),  # fully transparent fill: only the bar outlines show
    edgecolor=sns.color_palette("BrBG", 2),
    ax=ax,
)
ax.set_title("Pp_total_raw Distribution", size=15)
ax.set_xlabel("Count", size=12)
ax.set_ylabel("Pp_total_raw", size=12)
plt.show()

**About products_info:**

In [None]:
# Print column names, dtypes and non-null counts for the products frame.
pr.info()

In [None]:
# Count missing values in each column of the products frame.
pr.isnull().sum()

In [None]:
# Non-null rows of `pr` per Product Name, largest count first.
(
    pr.groupby('Product Name')['Product Name']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Non-null rows of `pr` per Provider/Company Name, largest count first.
(
    pr.groupby('Provider/Company Name')['Provider/Company Name']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Outlined horizontal bars for the five most frequent Provider/Company Name values in `pr`.
top_providers = pr["Provider/Company Name"].value_counts().index[:5]
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    y="Provider/Company Name",
    data=pr,
    order=top_providers,  # only the listed categories are drawn
    color="Gold",
    linewidth=2,
    facecolor=(0, 0, 0, 0),  # fully transparent fill: only the bar outlines show
    edgecolor=sns.color_palette("BrBG", 2),
    ax=ax,
)
ax.set_title("Top 5 Provider/Company Name Distribution", size=15)
ax.set_xlabel("Count", size=12)
ax.set_ylabel("Provider/Company Names", size=12)
plt.show()

In [None]:
# Non-null rows of `pr` per Sector(s) value, largest count first.
(
    pr.groupby('Sector(s)')['Sector(s)']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Outlined horizontal bars counting rows of `pr` per Sector(s) value, most frequent first.
sector_order = pr["Sector(s)"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    y="Sector(s)",
    data=pr,
    order=sector_order,
    color="Gold",
    linewidth=2,
    facecolor=(0, 0, 0, 0),  # fully transparent fill: only the bar outlines show
    edgecolor=sns.color_palette("BrBG", 2),
    ax=ax,
)
ax.set_title("Sector Distribution", size=15)
ax.set_xlabel("Count", size=12)
ax.set_ylabel("Sector", size=12)
plt.show()

In [None]:
# Non-null rows of `pr` per Primary Essential Function, largest count first.
(
    pr.groupby('Primary Essential Function')['Primary Essential Function']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Outlined horizontal bars for the ten most frequent Primary Essential Function values in `pr`.
top_functions = pr["Primary Essential Function"].value_counts().index[:10]
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    y="Primary Essential Function",
    data=pr,
    order=top_functions,  # only the listed categories are drawn
    color="Gold",
    linewidth=2,
    facecolor=(0, 0, 0, 0),  # fully transparent fill: only the bar outlines show
    edgecolor=sns.color_palette("BrBG", 2),
    ax=ax,
)
ax.set_title("Top10 Primary Essential Function Distribution", size=15)
ax.set_xlabel("Count", size=12)
ax.set_ylabel("Primary Essential Function", size=12)
plt.show()

**About engagement_data:**

In [None]:
# Load every per-district engagement CSV and stack them into one frame `en`,
# tagging each row with the district id taken from its file name.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# glob returns files in arbitrary order; sort so the combined frame is reproducible.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No engagement CSV files found under {path!r}")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Use the file's base name without extension as the district id; this does
    # not depend on how many directory levels `path` contains.
    district_id = os.path.splitext(os.path.basename(filename))[0]
    # NOTE(review): district_id is stored as a string here; cast it before joining
    # with the districts table if that table's id column is numeric — confirm.
    df["district_id"] = district_id
    li.append(df)

# ignore_index=True gives a fresh 0..n-1 index in a single step.
en = pd.concat(li, ignore_index=True)
en.head()

In [None]:
# Derive calendar date, month name and weekday name from the `time` column.
# Parse the column once and reuse the result instead of re-parsing it per derived column.
time_parsed = pd.to_datetime(en['time'])
en['date'] = time_parsed.dt.date
en['month'] = time_parsed.dt.month_name()
en['weekday'] = time_parsed.dt.day_name()

In [None]:
# Preview the engagement frame, including the date/month/weekday columns.
en.head()

In [None]:
# Summary statistics for the engagement frame.
en.describe()

In [None]:
# Mean pct_access for each district_id.
en.groupby('district_id')['pct_access'].mean()

In [None]:
# Attach to every row the mean pct_access of the month that row belongs to.
pct_access_by_month = en.groupby('month')['pct_access']
month_mean_serie = pct_access_by_month.mean()
en['mean_access_month'] = en['month'].map(month_mean_serie)
en.head()

In [None]:
# Mean pct_access per month, as a two-column frame (month, pct_access).
# `month` holds month names, so a plain groupby sorts them alphabetically
# (April, August, ...). Reorder the rows into calendar order so that anything
# plotted from `e1` reads chronologically.
month_order = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
month_means = en.groupby('month')['pct_access'].mean()
# Keep only months actually present so no empty rows are introduced.
present_months = [m for m in month_order if m in month_means.index]
e1 = month_means.reindex(present_months).rename_axis('month').reset_index()
e1

In [None]:
# Line plot of mean pct_access per month, drawn in the row order of `e1`.
plt.figure(figsize=(12, 6))
plt.plot('month', 'pct_access', data=e1, linestyle='-', marker='o')
# Title and axis labels, matching the other figures in this notebook.
plt.title("Mean pct_access by Month", size=15)
plt.xlabel("Month", size=12)
plt.ylabel("Mean pct_access", size=12)
plt.show()

In [None]:
# Mean pct_access per weekday, as a two-column frame (weekday, pct_access).
# `weekday` holds day names, so a plain groupby sorts them alphabetically
# (Friday, Monday, ...). Reorder the rows Monday..Sunday so that anything
# plotted from `e2` reads in week order.
weekday_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
]
weekday_means = en.groupby('weekday')['pct_access'].mean()
# Keep only weekdays actually present so no empty rows are introduced.
present_weekdays = [d for d in weekday_order if d in weekday_means.index]
e2 = weekday_means.reindex(present_weekdays).rename_axis('weekday').reset_index()
e2

In [None]:
# Line plot of mean pct_access per weekday, drawn in the row order of `e2`.
plt.figure(figsize=(12, 6))
plt.plot('weekday', 'pct_access', data=e2, linestyle='-', marker='o')
# Title and axis labels, matching the other figures in this notebook.
plt.title("Mean pct_access by Weekday", size=15)
plt.xlabel("Weekday", size=12)
plt.ylabel("Mean pct_access", size=12)
plt.show()

In [None]:
# Compare how many distinct product ids appear in the product table vs. the engagement data.
n_product_ids = pr["LP ID"].nunique()
n_engagement_ids = en["lp_id"].nunique()
print(n_product_ids)
print(n_engagement_ids)

In [None]:
# Join product metadata onto engagement rows by product id (default inner join).
pr_en = pr.merge(en, left_on='LP ID', right_on='lp_id')
pr_en.head()

In [None]:
# Violin plot of pct_access for each Sector(s) value in the merged frame.
fig, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(
    x="Sector(s)",
    y="pct_access",
    data=pr_en,
    split=True,
    inner="quart",
    linewidth=1,
    ax=ax,
)
ax.set_ylim(0, 1)  # clip the visible y-range to [0, 1]
sns.despine(left=True)
plt.show()

In [None]:
# Lower-triangle heatmap of pairwise correlations between the numeric columns of `pr_en`.
plt.figure(figsize=(8, 8))
# Restrict to numeric columns explicitly: DataFrame.corr() raises on text
# columns in pandas >= 2.0, and older versions silently dropped them anyway.
corr_matrix = pr_en.select_dtypes(include="number").corr()
# Boolean mask hiding the upper triangle and the diagonal (True = hidden).
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(corr_matrix, mask=mask, square=True)
plt.show()