In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# IPython line magic: list the contents of the input directory
%ls "../input"

In [None]:
import pandas as pd
# Read products_info.csv into the `product` dataframe (first row is the header)
product = pd.read_csv(
    "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv",
    header=0,
    sep=",",
)

In [None]:
# Display five randomly sampled rows to get a first impression of the data
product.sample(n=5)

In [None]:
# Report the dimensions of the products dataframe
n_rows, n_columns = product.shape
print("Number of rows", n_rows)
print("Number of columns", n_columns)

In [None]:
# Print the column names, non-null counts and dtypes of the products dataframe
product.info()

## Who are the main providers?

In [None]:
# Shorten the column names; the short names are used in the rest of the notebook
short_names = {
    "Product Name": "Product",
    "Provider/Company Name": "Provider",
    "Sector(s)": "Sector",
    "Primary Essential Function": "Usage",
}
product.rename(columns=short_names, inplace=True)

In [None]:
# Show the first two rows to check that the new column names are in place
product.head(2)

In [None]:
# Count the missing values in each column of the products dataframe
nulls_ = product.isna()          # boolean mask, True where a value is missing
N_nulls = nulls_.sum(axis=0)     # per-column number of missing values
N_nulls


In [None]:
# Display the rows whose Sector is missing
product.loc[product["Sector"].isna()]

We note that all missing values from Provider, Sector and Usage are in these 20 rows.
Given the small size of the dataset, we will avoid deleting them as far as possible.
We can find some of these missing values by visiting the websites and using classes
defined in Sector and Usage. This is the approach taken.

In [None]:
# Values we were able to find manually for rows with a missing Sector / Usage.
# The keys are row labels of `product` (the default integer index created by
# read_csv), so this cell must run before any re-indexing of the dataframe.
sector_fixes = {label: "PreK-12" for label in (61, 293, 314, 352, 354, 370)}
sector_fixes[356] = "PreK-12; Higher Ed"

usage_fixes = {
    61: "LC - Study Tools",
    293: "CM - Classroom Engagement & Instruction - Communication & Messaging",
    314: "LC - Sites, Resources & Reference",
    352: "LC - Sites, Resources & Reference",
    354: "LC - Sites, Resources & Reference - Streaming Services",
    356: "LC - Online Course Providers & Technical Skills Development",
    370: "LC - Digital Learning Platforms",
}

for label, sector in sector_fixes.items():
    product.loc[label, "Sector"] = sector

for label, usage in usage_fixes.items():
    product.loc[label, "Usage"] = usage


In [None]:
# These entries will have to be dropped: either the websites have been taken
# down or the classification was not possible.
# Collect the labels of the rows whose Sector is still missing.
dropList = product.index[product["Sector"].isnull()].tolist()
dropList

In [None]:
# Remove the unclassifiable rows in place and show the remaining dimensions
product.drop(index=dropList, inplace=True)
product.shape

In [None]:
# Focus only on students and learning:
# remove functions associated with school management, teacher education, etc.
irrelevant = [
    "SDO - Data, Analytics & Reporting - Site Hosting & Data Warehousing",
    "CM - Teacher Resources - Professional Learning",
    "LC - Career Planning & Job Search",
    "CM - Teacher Resources - Lesson Planning",
    "SDO - School Management Software - Mobile Device Management",
    "SDO - Large-Scale & Standardized Testing",
    "SDO - School Management Software - SSO",
    "CM - Teacher Resources - Grading & Attendance",
    "SDO - Environmental, Health & Safety (EHS) Compliance",
    "SDO - Admissions, Enrollment & Rostering",
]

# Vectorised membership test instead of a Python-level apply over every row.
# Rows with a missing Usage are kept, exactly as the row-wise `not in` check did.
relevant_prod = product[~product["Usage"].isin(irrelevant)]

In [None]:
# Dimensions of the products dataframe after removing the irrelevant functions
relevant_prod.shape

Here we identify the main digital educational resource providers

In [None]:
# Who are the main service providers?
import matplotlib.pyplot as plt
import seaborn as sns

# Number of products per provider, most frequent first.
# The counts column is named "Provider" explicitly: pandas >= 2.0 names the
# value_counts() result "count" and moves the original name to the index,
# which would break the y="Provider" lookup below. On older pandas the two
# renames are no-ops, so the resulting frame is the same on every version.
df_provider = (
    relevant_prod["Provider"]
    .value_counts()
    .rename_axis(None)
    .rename("Provider")
    .to_frame()
)
top10_providers = df_provider.head(10)

sns.barplot(data=top10_providers, x=top10_providers.index, y="Provider")
plt.xlabel("Providers")
plt.xticks(rotation=90)
plt.title("TOP educational digital service providers")

In [None]:
# Making a dataframe with only the top 10 providers.
# `isin` tests the whole column at once instead of rebuilding the list of
# provider names for every row inside a Python-level apply.
mini_df = relevant_prod[relevant_prod["Provider"].isin(top10_providers.index)]

In which sectors did the top 10 providers supply resources?

In [None]:
from pandas.plotting import parallel_coordinates

# The columns at positions 3 to 5 of mini_df are plotted, with "Usage" as the
# class column that determines the line colour.
coordinates_frame = mini_df.iloc[:, 3:6]

plt.figure(figsize=(10, 20))
parallel_coordinates(coordinates_frame, "Usage", colormap="gist_rainbow")
plt.title("Classes for which resources were provided")
plt.show()

All providers supplied for PreK-12. We will run all analysis on these classes, because
this is the group which is of major interest when it comes to access to and use of technology

In [None]:
# Number of products per (Provider, Usage) pair among the top providers
mini_df_service = (
    mini_df
    .groupby(["Provider", "Usage"])["Usage"]
    .count()
    .to_frame()
)

In [None]:
# Load engagement data and construct one dataframe df
import glob

path = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/*.csv"
engagement_columns = ["time", "lp_id", "pct_access", "engagement_index"]

# Read every CSV file matched by the pattern, then concatenate once.
# Growing a dataframe with pd.concat inside the loop copies all previously
# loaded rows on every iteration (quadratic in the number of files).
engagement_frames = [pd.read_csv(file, header=0, sep=",") for file in glob.glob(path)]

if engagement_frames:
    # ignore_index=True gives every row a unique label; without it each file
    # contributes its own 0..n-1 labels and label-based operations later on
    # would hit rows from several files at once.
    df = pd.concat(engagement_frames, axis=0, ignore_index=True)
else:
    # No file matched: keep an empty frame with the expected columns
    df = pd.DataFrame(columns=engagement_columns)

In [None]:
# Dimensions of the combined engagement dataframe
df.shape

Massive data set!!

In [None]:
# Display 20 randomly sampled engagement rows
df.sample(n=20)

In [None]:
# Per-column count of missing values (despite its name, `noMissing` holds the
# number of missing entries, not a "nothing is missing" flag)
noMissing = df.isnull().sum()

In [None]:
# Display the per-column missing-value counts
noMissing

In [None]:
# Remove the rows that have no product id.
# dropna works on the rows themselves rather than on index labels. Dropping by
# label would also delete valid rows whenever labels are duplicated, which
# happens when several CSV files are concatenated without resetting the index.
df.dropna(subset=["lp_id"], inplace=True)

In [None]:
# Cast the product ids to integers (rows with a missing lp_id were removed in
# the previous cell)
df["lp_id"] = df["lp_id"].astype(int)


In [None]:
from datetime import datetime

# Parse the dates in one vectorised call instead of calling strptime once per
# row; the explicit format keeps the parsing as strict as before. The call is
# also safe to repeat, so the cell can be re-run on already-parsed data.
df["time"] = pd.to_datetime(df["time"], format='%Y-%m-%d')

# Calendar components, taken from the datetime accessor
df["day"] = df["time"].dt.day
df["month"] = df["time"].dt.month
df["Year"] = df["time"].dt.year


In [None]:
# Check the column dtypes after the conversions above
df.dtypes

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Engagement index against pct_access, drawn twice side by side:
# coloured by month in the left panel and by day of month in the right one.
time_ = ["month", "day"]
fig, axes = plt.subplots(1, 2, figsize=(12, 10), sharey=True)
for panel, hue_column in zip(axes, time_):
    sns.scatterplot(ax=panel, data=df, x="pct_access", y="engagement_index", hue=hue_column)

There is a non-linear (almost quadratic) relationship between pct_access and engagement.
The engagement increases with greater access, but also towards the last 3 months of the year.

There is a non-linear (almost quadratic) relationship between pct_access and engagement.
The engagement level increases with greater internet access. Towards the last 3 months of the year, a significant increase is also noted.

In [None]:
# Give the engagement key the same name as in the products table, then attach
# the product metadata to every engagement row (inner join, the pandas default).
df.rename(columns={"lp_id": "LP ID"}, inplace=True)
DF = product.copy()
df_joined = DF.merge(df, on="LP ID")

In [None]:
# Usage by Product: number of engagement rows per product and its share in %.
# The counts column is named "Count" directly. Looking it up under the name
# "Product" only works on pandas < 2.0; newer versions name the value_counts()
# result "count" and raise a KeyError.
val_count = (
    df_joined["Product"]
    .value_counts()
    .rename_axis(None)
    .rename("Count")
    .to_frame()
)
val_count["%"] = val_count["Count"] / val_count["Count"].sum() * 100

# Bar chart of the 50 most frequent products
plt.figure(figsize=(30, 10))
top_50 = val_count.head(50).copy()
sns.barplot(data=top_50, x=top_50.index, y="%")
plt.xticks(rotation=90)

What are the primary functions of the services?

In [None]:
# Number of engagement rows per primary function (Usage) and its share in %.
# The counts column is named "Count" directly. Looking it up under the name
# "Usage" only works on pandas < 2.0; newer versions name the value_counts()
# result "count" and raise a KeyError.
val_count_service = (
    df_joined["Usage"]
    .value_counts()
    .rename_axis(None)
    .rename("Count")
    .to_frame()
)
val_count_service["%"] = val_count_service["Count"] / val_count_service["Count"].sum() * 100

In [None]:
# Bar chart of the share (%) of the most frequent primary functions
top_50_service = val_count_service.head(50).copy()
plt.figure(figsize=(30, 10))
sns.barplot(x=top_50_service.index, y="%", data=top_50_service)
plt.xticks(rotation=90)

The services most used at PreK-12 were the LC Digital Learning Platforms.
Resource sites, and Games and Simulations, followed far behind.

**What were the most engaging products?**

In [None]:
# Mean engagement index per product
grouped_tools = (
    df_joined
    .groupby("Product")["engagement_index"]
    .mean()
    .to_frame()
)

In [None]:
# Rank the products by mean engagement, highest first.
# Note: despite the "top_60" in its name, the second frame keeps only the
# 20 highest-ranked products.
engagement_by_tool = grouped_tools["engagement_index"].sort_values(ascending=False)
engagement_by_tool_top_60 = engagement_by_tool.head(20).to_frame()

In [None]:
# Bar chart of the mean engagement index of the highest-ranked products
plt.figure(figsize=(30, 10))
sns.barplot(
    x=engagement_by_tool_top_60.index,
    y="engagement_index",
    data=engagement_by_tool_top_60,
)
plt.xticks(rotation=90)

**The most used software was not necessarily the most engaging**

In [None]:
# Mean engagement index per primary function (Usage)
grouped_function = (
    df_joined
    .groupby("Usage")["engagement_index"]
    .mean()
    .to_frame()
)


In [None]:
# Rank the primary functions by mean engagement, highest first.
# Note: despite the "top_60" in its name, the second frame keeps only the
# 20 highest-ranked functions.
engagement_by_function = grouped_function["engagement_index"].sort_values(ascending=False)
engagement_by_function_top_60 = engagement_by_function.head(20).to_frame()

In [None]:
# Bar chart of the mean engagement index of the highest-ranked functions
plt.figure(figsize=(30, 10))
sns.barplot(
    x=engagement_by_function_top_60.index,
    y="engagement_index",
    data=engagement_by_function_top_60,
)
plt.xticks(rotation=90)

**The Learning management systems were found to be more engaging**

# Analysis of access to digital services

In [None]:
# Read districts_info.csv into the `district` dataframe and display it
districts_csv = "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
district = pd.read_csv(districts_csv, header=0, sep=",")
district

In [None]:
# Row labels with a missing state and with a missing locale (both are sets,
# despite the "_list" suffix)
state_null_list = set(district.index[district["state"].isnull()])
locale_null_list = set(district.index[district["locale"].isnull()])

In [None]:
# Rows whose locale is missing although their state is known
locale_null_list - state_null_list

In [None]:
# Remove the rows without a state in place, then recount the missing values
district.drop(index=list(state_null_list), inplace=True)
district.isnull().sum()

In [None]:
#district["pct_black/hispanic"].value_counts()

In [None]:
# Display the district dataframe after removing the rows without a state
district

In [None]:
#district[district["pct_free/reduced"].isnull()]

In [None]:
#district["pct_free/reduced"].value_counts()

In [None]:
# Convert interval data into a single number using the mean of the boundary points
import numpy as np

def split_function(data):
    """Return the midpoint of an interval written as a string.

    The string is expected to contain two comma-separated numbers, optionally
    surrounded by interval brackets, e.g. ``"[0.2, 0.4["``. Square brackets,
    parentheses and spaces are removed before parsing, so half-open, closed
    and open notations are all accepted.

    Parameters
    ----------
    data : str
        Interval text of the form ``"<lower>, <upper>"`` with optional brackets.

    Returns
    -------
    float
        The mean of the lower and upper bound.

    Raises
    ------
    IndexError
        If the text does not contain a comma.
    ValueError
        If either bound cannot be parsed as a float.
    """
    # Delete every bracket character and space, leaving "lower,upper"
    cleaned = data.translate(str.maketrans("", "", "[]() "))
    bounds = cleaned.split(",")
    lower = float(bounds[0])
    upper = float(bounds[1])
    return (lower + upper) / 2

# Replace every interval string by its midpoint.
# Cells that are not strings (missing values, or numbers left by a previous
# run of this cell) are kept unchanged, so re-running the cell no longer
# turns the already-converted columns into NaN.
for features in ["pct_black/hispanic","pct_free/reduced","county_connections_ratio","pp_total_raw"]:
    district[features] = district[features].map(lambda x: split_function(x) if isinstance(x,str) else x)

In [None]:
# Only the last expression of a cell is displayed: this cell shows the
# per-column missing-value counts, while the bare `district.shape` above it
# produces no output.
district.shape
district.isnull().sum()

In [None]:
# Keep only the districts without any missing value and show what is left
reduced_dataset = district.dropna(axis="index", how="any")
reduced_dataset.shape

In [None]:
import statsmodels.api as sm

# Regress pct_free/reduced on pct_black/hispanic over the complete rows.
# NOTE(review): no constant column is added to X, so this is a regression
# through the origin and the R2 reported by statsmodels is the uncentered one,
# which is typically larger than the usual R2 - confirm this is intended
# before reading the value as the strength of a linear relationship.
X = reduced_dataset["pct_black/hispanic"]
Y = reduced_dataset["pct_free/reduced"]
model = sm.OLS(endog=Y, exog=X)
results = model.fit()
print("R2: ", results.rsquared)


There is a strong linear correlation between pct_black/hispanic and beneficiaries of the pct_free/reduced incentive

In [None]:
# Fill the missing pct_free/reduced values of `district` with the fitted
# model's prediction from the same row's pct_black/hispanic value.
rows_to_fill = district.index[district["pct_free/reduced"].isnull()]
for row in rows_to_fill:
    predictor = district.loc[row, "pct_black/hispanic"]
    district.loc[row, "pct_free/reduced"] = results.predict([predictor])[0]

In [None]:
# Display the complete-case dataframe (built with dropna before the
# imputation above, so it does not contain the imputed rows)
reduced_dataset

In [None]:
# Correlation matrix of the numeric columns.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# text columns silently in corr() and raises on them instead.
correl = reduced_dataset.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(correl, annot=True)

In [None]:
# Rebuild reduced_dataset from `district`, leaving out the rows whose
# pp_total_raw is missing
missing_pp_total = district["pp_total_raw"].isnull()
new_index = list(district.index[missing_pp_total])
reduced_dataset = district.drop(index=new_index)


In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# Robust-scale pp_total_raw and store the result in a new column
sc = RobustScaler()
Z = np.array(reduced_dataset["pp_total_raw"]).reshape(-1, 1)   # column vector for the scaler
reduced_dataset["pp_total_raw_trans"] = sc.fit_transform(Z)

# PCA object used in the next cell (all components, exact SVD)
pca = PCA(svd_solver="full")

# county_connections_ratio is removed in place, so this cell fails if run twice
reduced_dataset.drop(columns=["county_connections_ratio"], inplace=True)
reduced_dataset

In [None]:
# Drop the unscaled column in place, then project the columns at positions
# 3 to 5 onto their principal components
reduced_dataset.drop(columns=["pp_total_raw"], inplace=True)
pca_input = reduced_dataset.iloc[:, 3:6]
X_pca = pca.fit_transform(pca_input)

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import SilhouetteVisualizer


def plot_silhouette_inertie(df, k_range, random_state=None):
    """Plot the silhouette score and the KMeans inertia for several cluster counts.

    For every ``k`` in ``k_range`` a KMeans model (``n_init=20``) is fitted on
    ``df``; the silhouette score of its labels and its inertia are collected
    and then shown as two consecutive line plots against ``k``.

    Parameters
    ----------
    df : array-like or DataFrame
        Data passed to ``KMeans.fit`` and ``silhouette_score``.
    k_range : iterable of int
        Cluster counts to evaluate. It is copied into a list first, so a
        one-shot iterator or generator can be passed as well.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` leaves the initialisation
        unseeded; pass an integer to make the curves reproducible.

    Returns
    -------
    None
        The function only draws the two figures.
    """
    from sklearn.cluster import KMeans
    from sklearn import metrics
    import matplotlib.pyplot as plt

    # Materialise once: k_range is iterated by the fitting loop and then reused
    # as the x-axis of both plots, which would be empty for an exhausted iterator.
    k_values = list(k_range)

    inertia = []
    res = []

    for k in k_values:
        model = KMeans(n_clusters=k, n_init=20, random_state=random_state).fit(df)
        res.append(metrics.silhouette_score(df, model.labels_))
        inertia.append(model.inertia_)

    # Plot number of clusters vs. silhouette score
    plt.grid()
    plt.plot(k_values, res)
    plt.title("Silhouette")
    plt.xlabel("Number of clusters")
    plt.ylabel("Coefficient de silhouette")
    plt.tight_layout()
    plt.show()

    # Plot number of clusters vs. inertia
    plt.grid()
    plt.plot(k_values, inertia)
    plt.title("Inertia")
    plt.xlabel("Number of clusters")
    plt.ylabel("Coefficient of inertia")
    plt.tight_layout()
    plt.show()
    
    
# Silhouette plots for k = 2..7, one panel per k on a 3x2 grid
fig, ax = plt.subplots(3, 2, figsize=(15,8))
for i in [2, 3, 4, 5, 6, 7]:
    km = KMeans(n_clusters=i, n_init=10, max_iter=100, random_state=42)
    # divmod maps k onto the grid cell (row q-1, column mod):
    # 2 -> (0,0), 3 -> (0,1), 4 -> (1,0), ..., 7 -> (2,1)
    q, mod = divmod(i, 2)

    # One visualizer per panel; fitting it computes and draws the silhouettes
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[q-1][mod])
    visualizer.fit(X_pca)

There are some clusters which are made. However, they do not seem to be well formed

In [None]:
# Libraries for the hierarchical (agglomerative) clustering
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Use the pct_free/reduced value, rounded to two decimals, as the row label;
# these labels are shown on the leaves of the first dendrogram below
reduced_dataset.index = reduced_dataset["pct_free/reduced"].round(2)

In [None]:

# Build the linkage matrix (Ward's method, Euclidean distance) from the PCA scores
Z = linkage(X_pca, method='ward', metric='euclidean')

# Copy of the data indexed by locale, used to label the second dendrogram
df0 = reduced_dataset.copy()
df0.index = df0["locale"]

# Two views of the same tree: leaves labelled with the index of
# reduced_dataset on the left panel and with the locale on the right panel
fig, ax = plt.subplots(1, 2, figsize=(30, 30))
left_panel, right_panel = ax[0], ax[1]
dendrogram(Z, labels=reduced_dataset.index, orientation='left',
           color_threshold=4, leaf_font_size=12, ax=left_panel)
dendrogram(Z, labels=df0.index, orientation='right',
           color_threshold=4, leaf_font_size=12, ax=right_panel)
plt.title("Hierarchical clusters illustrating pct_free/reduced by Sector type")
plt.show()


We note that some characteristics group different sectors together. The grouping seems to follow the pct_free/reduced feature, with a distinct cluster of suburbs receiving a lower pct_free/reduced than most city and rural areas.

# Conclusions
1. Some major companies in the digital world have been very active providing educational services at school.
2. Most of the resources were aimed at the PreK-12 learner groups
3. Programs aimed at facilitating online access are not evenly distributed.
4. Most suburb groups benefit least from these programs


reduced_dataset

district.isnull().all(axis=0).sum()