In [None]:

# ignore warnings
import warnings
warnings.simplefilter(action='ignore', category=Warning)

import gc
import glob
import os

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory




# Product information data

The product file products_info.csv includes information about the characteristics of the top 372 products with most users in 2020.

In [None]:
# Read the product catalogue and preview its first rows.
products_csv = "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
products_df = pd.read_csv(products_csv)
products_df.head()

In [None]:
# Same preview, transposed so every column becomes a row.
products_df.head().transpose()

In [None]:
# Summarize the product table: column names, non-null counts and dtypes.
products_df.info()

In [None]:
# Per column: does it contain at least one missing value?
products_df.isnull().any()

In [None]:
# (rows, columns) of the product table.
products_df.shape

# District information data

The district file districts_info.csv includes information about the characteristics of school districts.

In [None]:
# Read the district characteristics table and preview its first rows.
districts_csv = "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
districts_df = pd.read_csv(districts_csv)
districts_df.head()

In [None]:
# Summarize the district table: column names, non-null counts and dtypes.
districts_df.info()

In [None]:
# Per column: does it contain at least one missing value?
districts_df.isnull().any()

In [None]:
# (rows, columns) of the district table.
districts_df.shape

# Engagement data

The engagement data are aggregated at school district level, and each file in the folder engagement_data represents data from one school district.

In [None]:
# Build one engagement table from the per-district CSV files.
# Every file is read into its own frame, tagged with the district id taken
# from its file name, and the frames are concatenated once at the end.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# Sorted so the row order of engagement_df does not depend on directory listing order.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # Fail with the offending path instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No engagement CSV files found under {path!r}")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Derive the id from the file name itself, not from a fixed position in
    # the path, so a different directory depth or path separator still works.
    # The id stays a string here, as before.
    district_id = os.path.splitext(os.path.basename(filename))[0]
    df["district_id"] = district_id
    li.append(df)

# ignore_index gives a fresh 0..n-1 index without a separate reset_index pass.
engagement_df = pd.concat(li, ignore_index=True)
engagement_df.head()

# Visualize Missing values

In [None]:
# Missing-value bar chart for the product table.
msno.bar(
    products_df,
    figsize=(12, 6),
    fontsize=12,
    color='skyblue',
    sort="ascending",
)
plt.show()

In [None]:
# Missing-value bar chart for the district table.
msno.bar(
    districts_df,
    figsize=(12, 6),
    fontsize=12,
    color='#4895ef',
    sort="ascending",
)
plt.show()

In [None]:
# Missing-value bar chart for the combined engagement table.
msno.bar(
    engagement_df,
    figsize=(12, 6),
    fontsize=12,
    color='blue',
    sort="ascending",
)
plt.show()

# Exploratory Data Analysis

In [None]:
# Colour palettes (hex strings) available to the plots below.
pal1 = [
    "#fec5bb", "#fcd5ce", "#fae1dd", "#f8edeb", "#e8e8e4",
    "#d8e2dc", "#ece4db", "#ffe5d9", "#ffd7ba", "#fec89a",
]
pal2 = [
    "#ffcbf2", "#f3c4fb", "#ecbcfd", "#e5b3fe", "#e2afff",
    "#deaaff", "#d8bbff", "#d0d1ff", "#c8e7ff", "#c0fdff",
]
pal3 = ["#d6d2d2", "#f1e4f3", "#f4bbd3", "#f686bd", "#fe5d9f"]
pal4 = ["#a09abc", "#b6a6ca", "#d5cfe1", "#e1dee9", "#d4bebe"]

In [None]:
# Number of districts per state, bars ordered by descending count.
state_order = districts_df["state"].value_counts().index
plt.figure(figsize=(16, 10))
sns.countplot(
    data=districts_df,
    y="state",
    order=state_order,
    palette="Blues",
    linewidth=3,
)
plt.title("The number of Districts group by state", size=20, font="Serif")
plt.show()

In [None]:
# Donut chart of how the districts are split across locale categories.
# value_counts() is evaluated once so labels and sizes cannot drift apart.
locale_counts = districts_df["locale"].value_counts()
labels = list(locale_counts.index)
sizes = locale_counts.values
# One offset per wedge: ax.pie rejects an explode sequence whose length
# differs from the number of wedges, so derive it from the data instead of
# hard-coding four entries.
explode = [0.05] * len(sizes)

fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('The number of Districts group by locale', size = 20, font="Serif")
ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.0f%%', pctdistance=0.7, colors=["#d45d00","#ff9100","#eaaa00","#6d6875"])
# A white disc over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()

In [None]:
# The ten providers with the most products in the catalogue.
top_providers = products_df["Provider/Company Name"].value_counts().index[:10]
plt.figure(figsize=(16, 10))
sns.countplot(
    data=products_df,
    y='Provider/Company Name',
    order=top_providers,
    palette=pal2,
)
plt.title("Top 10 Provider/Company Names", size=20, font="Serif")
plt.show()

In [None]:
# "Sector(s)" holds ';'-separated sector names; flatten every non-null entry
# into individual, whitespace-trimmed tokens and count the three of interest.
sector_tokens = [
    token.strip()
    for entry in products_df["Sector(s)"]
    if not pd.isnull(entry)
    for token in entry.split(";")
]
c1 = sector_tokens.count('PreK-12')
c2 = sector_tokens.count('Higher Ed')
c3 = sector_tokens.count('Corporate')

# Donut chart of the three tallies.
labels = ['PreK-12','Higher Ed','Corporate']
sizes = [c1,c2, c3]
explode = (0.05, 0.05, 0.05)
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Sector Distribution', font="Serif", size=20)
ax.pie(
    sizes,
    labels=labels,
    explode=explode,
    startangle=60,
    autopct='%1.2f%%',
    pctdistance=0.7,
    colors=["#ff228a","#20b1fd","#ffb703"],
)
donut_hole = plt.Circle((0, 0), 0.4, fc='white')
ax.add_artist(donut_hole)
plt.show()

In [None]:
def _split_essential_function(value):
    """Split a 'MAIN - sub' label at its first '-' into (main, sub).

    A missing value yields (NaN, NaN). A label without any '-' yields
    (label, NaN) instead of raising IndexError, and an empty sub-part is
    reported as NaN as well.
    """
    if pd.isnull(value):
        return np.nan, np.nan
    main, _, sub = value.partition("-")
    return main.strip(), (sub.strip() or np.nan)


# Derive the two helper columns in a single pass over the source column.
_function_parts = [_split_essential_function(s) for s in products_df["Primary Essential Function"]]
primary_essential_main = [main for main, _ in _function_parts]
primary_essential_sub = [sub for _, sub in _function_parts]

products_df["primary_essential_main"] = primary_essential_main
products_df["primary_essential_sub"] = primary_essential_sub

In [None]:
# Tally the main-category codes. str.count is a substring count, so a value
# that contains several codes adds to each of their tallies.
main_functions = [m for m in products_df["primary_essential_main"] if not pd.isnull(m)]
c1 = sum(m.count("CM") for m in main_functions)
c2 = sum(m.count("LC") for m in main_functions)
c3 = sum(m.count("SDO") for m in main_functions)

# Donut chart of the three tallies.
labels = ['CM','LC','SDO']
sizes = [c1, c2, c3]
explode = (0.05, 0.05, 0.05)
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Primary Essential Function', font="Serif", size=20)
ax.pie(
    sizes,
    labels=labels,
    explode=explode,
    startangle=60,
    autopct='%1.2f%%',
    pctdistance=0.7,
    colors=["#18ff9f","#2cfbff","#ffb703"],
)
donut_hole = plt.Circle((0, 0), 0.4, fc='white')
ax.add_artist(donut_hole)
plt.show()

In [None]:
# Products per sub-function, bars ordered by descending count.
sub_function_order = products_df["primary_essential_sub"].value_counts().index
plt.figure(figsize=(16, 20))
sns.countplot(
    data=products_df,
    y='primary_essential_sub',
    order=sub_function_order,
    color=pal2[6],
)
plt.title("Primary Essential Function(Sub)", size=20, font="Serif")
plt.show()

# Analyse the data


 **Analyse the distribution of locale in the district dataset**

In [None]:
# Count of non-null district ids in each locale, shown as a bar chart.
locale_data = (
    districts_df
    .groupby('locale')['district_id']
    .count()
    .reset_index(name='totalcount')
)
fig = px.bar(locale_data, y='totalcount', x='locale')
fig.show()

In [None]:
# Preview only the first rows rather than rendering the whole table.
districts_df.head()

**Analyse the distribution of ethnicity per state**

In [None]:
# Districts per (state, pct_black/hispanic) combination, coloured by that value.
ethinicity_data = (
    districts_df
    .groupby(['state', 'pct_black/hispanic'])['district_id']
    .count()
    .reset_index(name='totalcount')
)
fig = px.bar(
    ethinicity_data,
    x='state',
    y='totalcount',
    color='pct_black/hispanic',
)
fig.show()

**Analyse the distribution of free/reduced lunch per state**

In [None]:
# Districts per (state, pct_free/reduced) combination, coloured by that value.
lunch_data = (
    districts_df
    .groupby(['state', 'pct_free/reduced'])['district_id']
    .count()
    .reset_index(name='totalcount')
)
fig = px.bar(
    lunch_data,
    x='state',
    y='totalcount',
    color='pct_free/reduced',
)
fig.show()

**Analyse the distribution of high-speed internet connections per state**

In [None]:
# Districts per (state, county_connections_ratio) combination, coloured by that value.
connection_data = (
    districts_df
    .groupby(['state', 'county_connections_ratio'])['district_id']
    .count()
    .reset_index(name='totalcount')
)
fig = px.bar(
    connection_data,
    x='state',
    y='totalcount',
    color='county_connections_ratio',
)
fig.show()

**What is the most used educational product across all the given districts as a function of time?**


In [None]:
# Mean pct_access for every (time, lp_id) pair, computed only from rows where
# both pct_access and lp_id are present.
lp_id_performance = (
    engagement_df
    .dropna(subset=['pct_access', 'lp_id'])
    .groupby(['time', 'lp_id'])['pct_access']
    .mean()
    .reset_index(name='average_access')
)
# With the null ids gone, lp_id can be stored as an integer.
lp_id_performance['lp_id'] = lp_id_performance['lp_id'].astype(int)

In [None]:
# Compare the product ids seen in the engagement data with the catalogue ids.
ww = lp_id_performance['lp_id'].unique().tolist()
wd = products_df['LP ID'].unique().tolist()
missing_from_catalogue = set(ww) - set(wd)
print("Products that are not present in the product df description", len(missing_from_catalogue))
print("Total no of distinct products", lp_id_performance['lp_id'].nunique())


There are about 8277 products that are not present in the product description. Below I have tried to evaluate the average performance.

In [None]:
# Average each product's access values, attach the catalogue row, and keep
# only products that have a name in the catalogue.
per_product_mean = lp_id_performance.groupby('lp_id')['average_access'].mean()
topproducts = per_product_mean.reset_index(name='average_access').merge(
    products_df,
    how='left',
    left_on='lp_id',
    right_on='LP ID',
)
topproducts = topproducts[topproducts['Product Name'].notnull()]

# Ids of the ten products with the highest average access.
ranked_products = topproducts.sort_values('average_access', ascending=False)
topproductslist = ranked_products['lp_id'].head(10).tolist()

In [None]:
# Line chart of access over time for the products listed in topproductslist.
# px and pio are imported once in the notebook's import cell rather than
# re-imported here.
pio.renderers.default = "notebook"

# Restrict the table to the selected products, then attach the catalogue row
# so each line can be labelled by product name.
lp_id_performance_filter = lp_id_performance[lp_id_performance['lp_id'].isin(topproductslist)]
lp_id_performance_filter = pd.merge(lp_id_performance_filter, products_df, how='left',
                                    left_on='lp_id', right_on='LP ID')

fig = px.line(lp_id_performance_filter, x="time", y="average_access", color="Product Name")
fig.update_layout(
    title_text="Average access for top performing educational products across timeline",
)
fig.update_xaxes(title_text="Month-Year")
fig.update_yaxes(title_text="Average access")
fig.show()

This is based on the products data. I have excluded all the products that don't have a product name associated with them. From the graph we can see that Google Classroom and Google Docs were the most used products throughout the year. The dip in July and August is due to the school holidays.

 **Which educational products are popular per state across timeline given?**

In [None]:
# district_id was filled in from file names as text when the engagement files
# were loaded; cast it to int64 before joining (presumably the dtype of
# districts_df's key — confirm).
engagement_df['district_id'] = engagement_df['district_id'].astype('int64')
# Name the join key explicitly: without `on`, merge joins on every column name
# the two frames happen to share.
engagement_district_df = pd.merge(engagement_df, districts_df, how='left', on='district_id')

In [None]:
# Mean pct_access for every (time, state, lp_id) triple, computed only from
# rows where both pct_access and lp_id are present.
# NOTE: this rebinds lp_id_performance, which an earlier cell built without the
# state column; cells written against that earlier table should not be re-run
# after this one.
lp_id_performance = (
    engagement_district_df
    .dropna(subset=['pct_access', 'lp_id'])
    .groupby(['time', 'state', 'lp_id'])['pct_access']
    .mean()
    .reset_index(name='average_access')
)
lp_id_performance['lp_id'] = lp_id_performance['lp_id'].astype(int)
gc.collect()

In [None]:
# Average access per (lp_id, state), joined to the catalogue; rows whose
# product has no catalogue name are dropped.
state_product_mean = lp_id_performance.groupby(['lp_id', 'state'])['average_access'].mean()
topproducts = pd.merge(
    state_product_mean.reset_index(name='average_access'),
    products_df,
    how='left',
    left_on='lp_id',
    right_on='LP ID',
)
topproducts = topproducts[topproducts['Product Name'].notnull()]

In [None]:
# Five products with the highest average access in each state.
# sort + groupby().head() keeps 'state' as an ordinary column of a flat frame,
# which is what later cells index on. It avoids groupby.apply, whose handling
# of the grouping column is deprecated since pandas 2.2.
top_products_state = (
    topproducts
    .dropna(subset=['state'])  # rows without a state belong to no group
    .sort_values(['state', 'average_access'], ascending=[True, False])
    .groupby('state')
    .head(5)
    .reset_index(drop=True)
)

In [None]:
# One line chart per state: access over time for that state's top products.
# `go` comes from the notebook's import cell; the matplotlib/seaborn re-imports
# that used to sit here were not used by this cell.

# Trace colours, defined once instead of on every loop iteration.
color = ['#636EFA', '#EF553B',
         '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']

for state in top_products_state['state'].unique().tolist():
    # Access rows of this state restricted to the state's own top products.
    state_top_ids = top_products_state.loc[top_products_state['state'] == state, 'lp_id']
    state_list = lp_id_performance[(lp_id_performance['state'] == state) &
                                   (lp_id_performance['lp_id'].isin(state_top_ids))]
    state_list = pd.merge(state_list, products_df, how='left',
                          left_on='lp_id', right_on='LP ID')

    fig = go.Figure()
    # sort=False keeps the products in order of first appearance, and each
    # product's rows are selected once rather than re-filtered per attribute.
    for i, (lp_id, product_rows) in enumerate(state_list.groupby('lp_id', sort=False)):
        fig.add_trace(go.Scatter(x=product_rows['time'],
                                 y=product_rows['average_access'],
                                 name=state+'_'+str(product_rows['Product Name'].iloc[0]),
                                 # Wrap around so more traces than colours cannot raise IndexError.
                                 line=dict(color=color[i % len(color)], width=2)))

    fig.update_layout(title='Top products by usage in '+ state,
                   xaxis_title='Month',
                   yaxis_title='Average product Access')
    fig.show()

    gc.collect()

**Which educational sectors are prominent per state**

In [None]:
# Attach catalogue information to every engagement row and keep only rows
# whose product has a sector recorded.
product_engagement_merge = pd.merge(engagement_df, products_df, how='left',
                                    left_on='lp_id', right_on='LP ID')
product_engagement_merge = product_engagement_merge.loc[product_engagement_merge['Sector(s)'].notnull()]
# Cast the key to int64 before joining (presumably the dtype of districts_df's
# district_id — confirm).
product_engagement_merge['district_id'] = product_engagement_merge['district_id'].astype('int64')
# Name the join key explicitly: without `on`, merge joins on every column name
# the two frames happen to share.
product_state_data = pd.merge(product_engagement_merge, districts_df, how='left', on='district_id')
gc.collect()

In [None]:
# Within each state: rows per "Sector(s)" value divided by the state's count
# of non-null lp_id rows. The result is a fraction, despite the column name.
rows_by_state = product_state_data.groupby('state')
sector_rows = rows_by_state['Sector(s)'].value_counts()
state_rows = rows_by_state['lp_id'].count()
product_state_data_percentage = (sector_rows / state_rows).reset_index(name='percentage_split')


In [None]:
# Bar chart of each state's split across "Sector(s)" values.
fig = px.bar(
    product_state_data_percentage,
    x='state',
    y='percentage_split',
    color='Sector(s)',
)
fig.show()

 Arizona and North Dakota use a lot of educational products that fall under a combination of PreK-12; Higher Ed; Corporate.