In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Product information data loading

In [None]:
# Switch into the competition folder so both CSVs can be opened by bare file name.
os.chdir("/kaggle/input/learnplatform-covid19-impact-on-digital-learning")
df_districts, df_products = (
    pd.read_csv(csv_name) for csv_name in ("districts_info.csv", "products_info.csv")
)


In [None]:
# Column labels of the product table, as a plain list.
df_products.columns.tolist()

In [None]:
# Show the dtype pandas assigned to each column of the product table.
df_products.dtypes

In [None]:
# Univariate analysis
# Product-table columns that the following cells examine one at a time.
product_cols = ["Sector(s)", "Primary Essential Function", "Provider/Company Name"]

In [None]:
# Count the distinct (non-null) values found in each of the product columns
unique = [df_products[col].nunique() for col in product_cols]

# One horizontal bar per column, annotated with its distinct-value count
plt.figure(figsize=(15, 6))
bar = sns.barplot(x=unique, y=product_cols, palette="YlGn")
bar.bar_label(bar.containers[0], padding=5)
plt.show()

In [None]:
# Three stacked panels, one per product column, each showing its ten most frequent values.
fig, ax = plt.subplots(nrows=3, ncols=1)

for panel, col in zip(ax, product_cols):
    top_ten = df_products.groupby(col)[col].count().sort_values(ascending=False).head(10)
    top_ten.plot.barh(ax=panel, figsize=(10, 30), color='Yellow')

# District info


In [None]:
# Summarise the district table (columns, non-null counts, dtypes).
df_districts.info()

In [None]:
# Univariate analysis

district_cols = ["locale", "pct_black/hispanic", "pct_free/reduced", "county_connections_ratio", "pp_total_raw"]

fig, ax = plt.subplots(nrows=len(district_cols), ncols=1, figsize=(15, 25))

# One panel per district attribute: number of districts per distinct value,
# smallest count first.
for panel, col in zip(ax, district_cols):
    counts = df_districts[col].value_counts().sort_values(ascending=True)
    sns.barplot(x=counts.values, y=counts.index, palette="coolwarm", ax=panel)
    panel.set_title(col)

fig.tight_layout(pad=3)

Most of the school districts are in suburban areas. Suburban living can be a disadvantage because it takes longer to reach most destinations, so the option to attend school virtually should be an advantage. I wonder whether students from suburban districts were the ones accessing the learning platforms the most.



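The columns `pct_black/hispanic`, `pct_free/reduced`, `county_connections_ratio` and `pp_total_raw` hold intervals written in the following notation:

$]a,b[ \;=\; \{x \in \mathbb{R} : a < x < b\}$

$[a,b[ \;=\; \{x \in \mathbb{R} : a \le x < b\}$

All values are in this format, e.g. `[0.2, 0.4[` means $0.2 \le x < 0.4$.

$]a,b] \;=\; \{x \in \mathbb{R} : a < x \le b\}$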

In [None]:
# transformation of pct_black/hispanic and pct_free/reduced
# Keep an untouched copy, then drop every row that has any missing value.
distcrict_copy = df_districts.copy()
df_districts.dropna(inplace=True)

# Each value is an interval string such as "[0.2, 0.4[": take the text before the
# comma, strip the opening bracket, convert to float and add 0.1.
for pct_col in ('pct_black/hispanic', 'pct_free/reduced'):
    lower_bounds = df_districts[pct_col].str.split(',').str[0].str[1:].astype(float)
    df_districts[pct_col] = lower_bounds + 0.1

# Renumber the rows 0..n-1 without keeping the old index as a column.
df_districts = df_districts.reset_index(drop=True)
df_districts

In [None]:
# transformation of county_connections_ratio and pp_total_raw
def _interval_midpoint(interval: str) -> float:
    """Return the midpoint of an interval string such as ``"[4000, 6000["``.

    The two bounds are split on the comma and stripped of brackets/spaces, so the
    result is correct whatever the bin width is (a fixed offset added to the lower
    bound is only the midpoint when every bin has the same width).
    """
    lower, upper = (float(bound.strip(' []')) for bound in interval.split(','))
    return (lower + upper) / 2


# Replace each interval by its midpoint. For the 2000-wide pp_total_raw bins this
# equals the previous "lower bound + 1000".
for interval_col in ('pp_total_raw', 'county_connections_ratio'):
    df_districts[interval_col] = df_districts[interval_col].apply(_interval_midpoint)

df_districts

In [None]:
# Creating dataframe for correlation
df_corr = df_districts[['state', 'locale', 'pct_black/hispanic', 'pct_free/reduced', 'pp_total_raw']]
# Visualization of the correlation table
# 'state' and 'locale' are never converted to numbers in the cells above, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the
# correlation is computed on the numeric columns only.
correlation = df_corr.select_dtypes(include='number').corr()
plt.figure(figsize=(14,7))
sns.heatmap(correlation, linecolor='Yellow',linewidths=0.1, annot=True)
plt.title('Correlation Matrix', pad=11, size=17)
plt.xlabel('Digital Learning Data')
plt.ylabel('Digital Learning Data')
plt.show()

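The correlation coefficient of about 0.65 between `pct_free/reduced` and `pct_black/hispanic` indicates a moderately strong positive association: districts with a higher share of Black or Hispanic students (Hispanic meaning related to Spanish-speaking countries) also tend to have a higher share of students eligible for free or reduced-price lunch. Note that a correlation coefficient is not a percentage of students.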



## Drop the missing values

In [None]:
# Summarise the product table (columns, non-null counts, dtypes).
df_products.info()

In [None]:
# Number of missing entries in every product column
df_products.isna().sum()

In [None]:
# Remove, in place, the products whose 'Sector(s)' value is missing.
df_products.dropna(subset=['Sector(s)'], inplace=True)

In [None]:
# Count the missing 'Sector(s)' values still present in the product table.
df_products['Sector(s)'].isna().sum()

In [None]:
# Remove, in place, the products whose 'Provider/Company Name' value is missing.
df_products.dropna(subset=['Provider/Company Name'], inplace=True)

In [None]:
# Count the missing 'Provider/Company Name' values still present in the product table.
df_products['Provider/Company Name'].isna().sum()

In [None]:
# Remove, in place, the products whose 'Primary Essential Function' value is missing.
df_products.dropna(subset=['Primary Essential Function'], inplace=True)

In [None]:
# Count the missing 'Primary Essential Function' values still present in the product table.
df_products['Primary Essential Function'].isna().sum()

In [None]:
  # List unique products and produce a list
# Distinct product names, in order of first appearance, as a Python list.
products = df_products['Product Name'].drop_duplicates().tolist()

In [None]:
# Display the list of unique product names built from df_products['Product Name'].
products

In [None]:
# How many products are being used by students
# (the number of distinct 'Product Name' values held in `products`).
print(len(products))

In [None]:
# The URL column is not needed for the analysis, so remove it in place.
df_products.drop(columns=['URL'], inplace=True)

In [None]:
# Display the product table as it stands after the clean-up cells above.
df_products

In [None]:
# Preview the first rows of the district table.
df_districts.head()

In [None]:
# checking missing values

# Share of missing entries per column, expressed as a percentage of the row count.
district_missing_pct = df_districts.isnull().sum()/len(df_districts)*100
product_missing_pct = df_products.isnull().sum()/len(df_products)*100

# A single print with sep='\n' emits exactly the same text as one print() per item.
print(
    "percentage of missing values in DISTRICT DATA",
    '\n',
    district_missing_pct,
    '\n',
    "percentage of missing values in PRODUCT DATA",
    '\n',
    product_missing_pct,
    sep='\n',
)


In [None]:
# Discard the districts whose 'state' is missing (rows are the default axis).
df_districts.dropna(subset=['state'], inplace=True)

# Remove the "pp_total_raw" column as well; the original note gives ~50% null values as the reason.
df_districts.drop(columns=["pp_total_raw"], inplace=True)

## Engagement data loading

In [None]:
path = r'/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# Sort so the row order of the concatenated frame is the same on every run
# (glob returns files in arbitrary order).
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Fail with an explicit message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No engagement CSV files found in {path}")

li = []

for filename in all_files:
    district_frame = pd.read_csv(filename, index_col=None, header=0)
    # The district id is the file name without its extension; deriving it from the
    # base name works wherever the data folder lives, unlike a fixed position in
    # the "/"-split path.
    district_frame["district_id"] = os.path.splitext(os.path.basename(filename))[0]
    li.append(district_frame)

# One long table holding every district file, with a fresh 0..n-1 index.
df_engagement = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Give the engagement key the same name as in df_products so the frames join on "LP ID".
df_engagement.rename(columns={"lp_id": "LP ID"}, inplace=True)
# Merge once and reuse the result for both rankings (the rename and merge were
# previously repeated verbatim, doubling the cost of the join).
merged=pd.merge(df_engagement, df_products, on= "LP ID")
by_product = merged.groupby("Product Name")
# Ten products with the highest mean pct_access / the highest total engagement_index.
m = by_product["pct_access"].mean().sort_values(ascending=False).head(10)
n = by_product["engagement_index"].sum().sort_values(ascending=False).head(10)

# plot
plt.figure(figsize=(15,4))

plt.subplot(121)
# The colour list is shorter than the ten bars; matplotlib cycles through it.
plt.bar(m.index, m.values, color=["#e54f6e","#282e54","#ccc6ee","#01786f","#407294","#bada55","#fe948e","#4ca3dd"])
plt.xlabel('Product Name')
plt.xticks(rotation=90)
plt.ylabel('Mean percentage of students')
plt.title("With atleast one-page load event")

plt.subplot(122)
plt.bar(n.index, n.values, color=["#0f5b9e", "#ad97ec", "#8bd6ba", "#bfacd6", "#cbe6e6", "#df7b6e"])
plt.xlabel('Product Name')
plt.xticks(rotation=90)
plt.ylabel('Page-load per 1000 students')
plt.title("With number of page-load per 1000 students")
# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

In [None]:
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
import plotly
import plotly.graph_objects as go
import plotly.express as px

# Keep only the Google Classroom and Google Docs rows of the merged table.
class_doc = merged[merged["Product Name"].isin(["Google Classroom", "Google Docs"])]

# Aggregate per day and product: mean access percentage, total engagement index.
daily_by_product = class_doc.groupby(["time", "Product Name"])
pct = daily_by_product["pct_access"].mean().reset_index()
eng = daily_by_product["engagement_index"].sum().reset_index()

# plot
fig = px.line(
    pct,
    x="time",
    y="pct_access",
    color='Product Name',
    title='Percentage of students with atleast one-page load event on a given day',
    template="ggplot2",
    width=800,
    height=400,
)
fig.show()

fig = px.line(
    eng,
    x="time",
    y="engagement_index",
    title='Sum of number of page-load per 1000 students on a given day',
    color='Product Name',
    template="seaborn",
    width=800,
    height=400,
)
fig.show()

We can see the highest peak in May because school is winding up for summer. Summer vacation, or summer break, is the break in the academic year between two school years. Students are typically off for eight to nine weeks (staff are not), depending on the country and district. In the United States, summer break lasts approximately two and a half months, with students typically finishing the school year between late May and late June and starting the new year between early August and early September.
That is why the engagement index and the access percentage decline between May and September.

In [None]:
# Display the concatenated engagement table.
df_engagement

In [None]:
# First LC / CM / SDO code found in "Primary Essential Function";
# expand=False returns a Series so it is assigned as one plain column.
df_products["Basic function"] = df_products["Primary Essential Function"].str.extract('(LC|CM|SDO)', expand=False)
function_dict = {'LC' : 'Learning & Curriculum', 'CM' : 'Classroom Management', 'SDO' : 'School & District Operations'}
df_products['Basic_function_full_form'] = df_products["Basic function"].map(function_dict)
# Strip the leading code(s) and the separator dash from this one column only.
# A DataFrame-wide replace('-', '', regex=True) would also delete the hyphens
# inside every other text column of df_products.
# NOTE(review): assumes values look like "LC - ..." or "LC/CM/SDO - ..." — confirm against the data.
df_products["Sub Basic function"] = df_products["Primary Essential Function"].str.replace(
    r'^\s*(?:LC|CM|SDO)(?:/(?:LC|CM|SDO))*\s*-\s*', '', regex=True
)

colors = ['#a47053', '#efca66', '#ecdab9']
# Count each category once and reuse the result for both labels and values.
function_counts = df_products["Basic_function_full_form"].value_counts()
fig = go.Figure(data=[go.Pie(labels=function_counts.index, values=function_counts.values)])
fig.update_traces(hoverinfo='label+value', textinfo='label+percent', textfont_size=12,
                  marker=dict(colors=colors, line=dict(color='#cec3c8', width=2)))
fig.show()

In [None]:
# Preview the district_id values that were derived from the engagement file names.
df_engagement['district_id'].head()

In [None]:
# Parse the 'time' strings once, then derive weekday and month names from the result.
timestamps = pd.to_datetime(df_engagement["time"])
df_engagement["time"] = timestamps
df_engagement['day'] = timestamps.dt.day_name()
df_engagement['month'] = timestamps.dt.month_name()

In [None]:
# Display the engagement table, now including the 'day' and 'month' columns.
df_engagement

In [None]:
# Summary statistics for the engagement table.
df_engagement.describe()

In [None]:
# Inner-join product details onto the engagement rows via the shared "LP ID" key.
df = df_products.merge(df_engagement, on='LP ID')
df.head()

In [None]:
# Cast the district key to str so it matches the string ids taken from the
# engagement file names, then inner-join the district attributes.
df_districts['district_id'] = df_districts['district_id'].astype(str)
data = df.merge(df_districts, on='district_id')

In [None]:
# Preview the first rows of the combined product / engagement / district table.
data.head()

In [None]:
def plot_bar(data:pd.DataFrame,col1:str,col2:str):
    """Draw two side-by-side bar charts of ``col1`` and ``col2`` against the index of ``data``.

    Rows are sorted by ``col1`` in descending order and both panels use that order,
    so bars at the same position refer to the same index label.

    Args:
        data: Frame whose index supplies the bar labels.
        col1: Column plotted in the left panel; also the sort key.
        col2: Column plotted in the right panel.
    """
    ordered = data.sort_values(by=col1, ascending=False)
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for ax, column, palette in zip(axes, (col1, col2), ('YlGn', 'YlOrBr')):
        # x and y are passed by keyword: seaborn >= 0.12 no longer accepts the
        # data vectors positionally.
        sns.barplot(x=ordered.index, y=ordered[column], palette=palette, dodge=False, ax=ax)
        # Rotate the tick labels seaborn already placed instead of re-setting them,
        # which avoids the set_xticklabels-without-fixed-ticks warning.
        ax.tick_params(axis='x', labelrotation=80)
        ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
# Average access percentage and engagement index per calendar month and per weekday.
engagement_metrics = ['pct_access', 'engagement_index']
months = data.groupby('month')[engagement_metrics].mean()
days = data.groupby('day')[engagement_metrics].mean()

# Same two-panel bar chart for each grouping, months first.
for period_means in (months, days):
    plot_bar(period_means, *engagement_metrics)

In [None]:
# Announce, then show, the number of missing entries in each engagement column.
print("Bellow are all missing values of engagement dataframe:")
df_engagement.isna().sum()

In [None]:
# True if at least one engagement row is an exact duplicate of an earlier row.
df_engagement.duplicated().any()

In [None]:
# Discard, in place, the engagement rows missing any of these three columns,
# then re-count the remaining nulls per column as a check.
required_cols = ["engagement_index", "LP ID", "pct_access"]
df_engagement.dropna(subset=required_cols, inplace=True)
df_engagement.isna().sum()

In [None]:
# Changing data type from 'object' to 'int64'
# district_id was taken from the CSV file names as text; pd.to_numeric picks the
# numeric dtype itself (int64 is presumably the result if every id is a whole number — verify).
df_engagement["district_id"] = pd.to_numeric(df_engagement["district_id"])

In [None]:

# Distribution of Sector(s): each sector's share of the rows in df_products
sector_counts = df_products['Sector(s)'].value_counts()
products_sec = (
    (sector_counts / len(df_products))
    .rename_axis('Sector(s)')
    .reset_index(name='percent')
)

fig = px.pie(
    products_sec,
    names='Sector(s)',
    values='percent',
    color_discrete_sequence=px.colors.sequential.turbid,
    title='Distribution of Sector(s) in the Product Information Data:',
    width=700,
    height=500,
)
fig.show()