In [None]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from statistics import mean

import operator

import os

## Data distribution check

In [None]:
# Let's check the mean and median size of the data in the engagement data folder.
# Each file is named "<district_id>.csv"; we record its row count and column count.

_engagement_dir = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'

_files = []
_length_data = []
_length_cols = []

for _file in os.listdir(_engagement_dir):
    # Skip anything that is not a CSV (e.g. checkpoint folders): its name is not a district id
    if not _file.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(_engagement_dir, _file))
    _files.append(int(_file[:-4]))
    _length_data.append(len(df))
    _length_cols.append(len(df.columns))

_infos = pd.DataFrame(data=zip(_files, _length_data, _length_cols), columns=['district_id', 'Lines', 'Cols'])

# `unique()` returns an array: converting it with int() only works when it holds exactly one value.
# Listing the distinct counts prints the same text in that case and still works if files differ.
_col_counts = ", ".join(str(_count) for _count in sorted(_infos['Cols'].unique()))

print(f"""Files length median is : {int(_infos['Lines'].median())}
Mean is : {int(_infos['Lines'].mean())}
Shorter file has {int(_infos['Lines'].min())} lines
and the longest one {int(_infos['Lines'].max())}.
There are {_col_counts} columns on each file""")

### So where exactly does the data come from?

In [None]:
# Attach the district metadata to the per-file statistics (joined on the district id),
# then look at how many data lines each state / locale combination contributes.

district_df = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')

merged = _infos.merge(district_df, on="district_id")

# Wide table (states x locales): a NaN cell means that state has no file for that locale
_pvt = merged.pivot_table(index='state', columns='locale', values='Lines', aggfunc='sum')

_missing_per_locale = _pvt.isna().sum().to_frame().reset_index()
_missing_per_locale = _missing_per_locale.sort_values(by=0).rename(columns={0 : 'Missing data'})
display(_missing_per_locale)

# Long table (one row per state / locale pair) feeding the stacked bar chart
pvt = merged.pivot_table(index=['state', 'locale'], values='Lines', aggfunc='sum').reset_index()

fig = px.bar(pvt, x="state", y="Lines", color="locale")

_margins = {'l': 20, 'r': 20, 'b': 50, 't': 50, 'pad': 10}
fig.update_layout(
    margin=_margins,
    paper_bgcolor="darkgray",
    title="Data distribution by locale type and State",
    title_x=0.5,
)

# Both axis titles are blanked out
for _clear_axis_title in (fig.update_xaxes, fig.update_yaxes):
    _clear_axis_title(title="")

fig.show()

### Suburb > city > rural > town. I would have liked more data from rural and town areas, since they were the most impacted by travel restrictions and school closures, and the internet connection can also be a problem in some of those areas. Unfortunately the 'county_connections_ratio' column, which should have helped us here, contains poor-quality data (one giant interval) and is of no use to us.

In [None]:
# The seaborn style has to be set before the axes are created: it only affects axes built afterwards
sns.set_style(style = "darkgrid")

fig, axes = plt.subplots(1, 2, figsize=(20, 10))
fig.suptitle('Data distribution by state')

# Left panel: number of districts listed per state, largest first
district_per_state = district_df[['state', 'district_id']].groupby(by='state').count().reset_index()
district_per_state = district_per_state.sort_values(by='district_id', ascending=False)

g1 = sns.barplot(ax=axes[0], data=district_per_state, x='district_id', y='state')
g1.set(title='Number of School districts available per State')
g1.set(xlabel='Nb of school districts')
g1.set(ylabel=None)

# Right panel: total number of engagement lines per state, largest first
_pvt2 = merged.pivot_table(index='state', values='Lines', aggfunc='sum')
_pvt2 = _pvt2.sort_values(by="Lines", ascending=False)

g2 = sns.barplot(ax=axes[1], data=_pvt2.reset_index(), x='Lines', y='state')
g2.set(title='Available Data points per State')
g2.set(xlabel='Data points')
g2.set(ylabel=None)
# Thousands separators through a formatter: unlike set_xticklabels on unfixed ticks,
# the labels stay tied to the tick values if the axis limits change
g2.xaxis.set_major_formatter(plt.FuncFormatter(lambda _value, _position: '{:,.0f}'.format(_value)))

plt.show()

### The distribution of the data by State is very unequal, that was expected but the disparities are big. We are missing a lot of states and the most populated states are not the ones who provided the most data, definitely something to take into account. 


### Let's now take care of the columns holding aggregated data ('Sector(s)' and 'Primary Essential Function'). I'll split the latter into a main category ('main pef') and a sub-category ('sub pef'). For the sectors I'll create dummy columns.

In [None]:
# 'Sector(s)' packs several sectors into one "; "-separated string: expand it into one 0/1 column per sector.
# 'Primary Essential Function' is " - "-separated: the first part becomes 'main pef', the remaining
# parts (joined with ", ") become 'sub pef'. Rows without a value keep NaN in both new columns.

_products = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv')

# Thanks Leonie for the great code !
temp_sectors = _products['Sector(s)'].str.get_dummies(sep="; ")
temp_sectors = temp_sectors.rename(columns=lambda _sector: "sector_" + _sector.replace(' ', ''))
_products = _products.join(temp_sectors)

_pef_parts = _products['Primary Essential Function'].str.split(' - ')
_products['main pef'] = _pef_parts.str[0].str.strip()
_products['sub pef'] = _pef_parts.str[1:].str.join(", ")

_products = _products.drop(columns=["Sector(s)", "Primary Essential Function"])

_products

## Popular products

In [None]:
# Build the three small tables plotted in the next cell:
#   _sectors : number of products flagged for each sector
#   _pvt3    : number of products per provider, largest first
#   _sub_pef : number of products per 'sub pef' value, largest first

_sector_columns = ['sector_Corporate', 'sector_HigherEd', 'sector_PreK-12']
_sectors = _products[_sector_columns].sum().to_frame(name='count')

_pvt3 = (
    _products
    .pivot_table(index='Provider/Company Name', values='LP ID', aggfunc='count')
    .sort_values(by="LP ID", ascending=False)
    .reset_index()
)

# value_counts() counts every distinct value in one pass and returns them sorted by frequency;
# the previous list.count() loop rescanned the whole list for each distinct value, and its
# starting order came from iterating a set.
_sub_pef = (
    _products['sub pef']
    .dropna()
    .value_counts()
    .rename_axis('sub')
    .reset_index(name='count')
)


In [None]:
# The seaborn style has to be set before the axes are created: it only affects axes built afterwards
sns.set_style(style = "darkgrid")

# (No extra plt.figure() call here: it would only open a second, empty figure.)
fig, axes = plt.subplots(2, 2, figsize=(20, 10))
fig.suptitle('Distribution')

# Sector names must travel with their counts. Passing the sorted frame as `data` together with
# the unsorted `_sectors.index` as `y` pairs each value with the wrong label, so the names are
# turned into a regular column of the sorted frame instead.
_sectors_sorted = _sectors.sort_values(by='count', ascending=False).rename_axis('sector').reset_index()

g1 = sns.barplot(ax=axes[0, 1], data=_sectors_sorted, x='count', y='sector')
g1.set(title='Sector Distribution')
g1.set(xlabel=None)
g1.set(ylabel=None)

g2 = sns.countplot(ax=axes[1, 1], data=_products, y='main pef')
g2.set(title='Main Essential Function')
g2.set(xlabel=None)
g2.set(ylabel=None)

# Top 5 providers by number of products
g3 = sns.barplot(ax=axes[0, 0], data=_pvt3.head(5), x='LP ID' , y='Provider/Company Name')
g3.set(title='Company')
g3.set(xlabel=None)
g3.set(ylabel=None)

# Top 5 'sub pef' values by number of products
g4 = sns.barplot(ax=axes[1, 0], data=_sub_pef.head(5), x='count' , y='sub')
g4.set(title='Other Essential Function')
g4.set(xlabel=None)
g4.set(ylabel=None)

plt.show()

### Some interesting insights there. 'Corporate education' is far ahead of everyone else (so not exactly your typical school student), and 'Learning & Curriculum' dominates all other kinds of Primary Essential Function. There are plenty of other essential functions in the newly created 'sub pef' column, but 'Sites, Resources & Reference' seems to be the winner. As far as companies go, no surprise there, with Google and Microsoft at the top of the chart.

### I will now take care of the columns "pct_black/hispanic", "pp_total_raw" and "pct_free/reduced". Their values are given as intervals, and I'll replace each interval with its average for better readability. The column "county_connections_ratio" has poor-quality data in it, so averaging those intervals wouldn't make much sense.

In [None]:
def _interval_mean(raw, cast, scale=None):
    """Replace an interval string such as '[0.2, 0.4[' by the mean of its two bounds.

    `raw` is one cell of the column. Cells that are not strings (NaN, or numbers left by a
    previous run of this cell) are returned unchanged, which keeps the cell safe to re-run.
    `cast` converts each bound (float or int). When `scale` is given the mean is multiplied
    by it and truncated to an int (used to express the ratios as percentages).
    A string that is not a valid interval raises ValueError instead of being silently kept.
    """
    if not isinstance(raw, str):
        return raw
    # Drop the opening and closing bracket, then split the two bounds
    midpoint = mean(cast(bound) for bound in raw[1:-1].split(', '))
    return midpoint if scale is None else int(midpoint * scale)


# Whole-column assignment instead of `district_df[col][i] = ...`: chained indexing may write
# to a temporary copy and triggers SettingWithCopyWarning.
for _ratio_col in ("pct_black/hispanic", "pct_free/reduced"):
    district_df[_ratio_col] = (
        district_df[_ratio_col]
        .map(lambda raw: _interval_mean(raw, float, 100))
        .astype(float)
    )

district_df["pp_total_raw"] = district_df["pp_total_raw"].map(lambda raw: _interval_mean(raw, int))
# The int conversion is applied to the non-missing rows only; assigning the result back
# realigns it on the index, so rows without a value stay NaN.
district_df["pp_total_raw"] = district_df["pp_total_raw"].dropna().astype(int)

print(f"Highest pct_black/hispanic we have is {district_df['pct_black/hispanic'].max()}")

display(district_df[district_df['pct_black/hispanic'] == district_df['pct_black/hispanic'].max()])



In [None]:
# Correlate the numeric columns only: district_df also holds text columns (e.g. 'state'),
# which recent pandas versions no longer drop silently in corr().
district_corr = district_df.select_dtypes(include='number').corr()

# Styler.set_precision was removed from pandas; a format string gives the same 2-decimal display
district_corr.style.background_gradient(cmap='coolwarm').format("{:.2f}")

### 0.66 correlation score between Black / Hispanic districts and students available for free / discounted meals, that makes me sad.

## Data quality check

### I went to the NCES website and downloaded some data to double-check the proportion of African American and Hispanic students in different states. The data can be found at this [link](https://nces.ed.gov/ccd/elsi/tableGenerator.aspx).

### I'll compare it to the data we are provided with

In [None]:
# Load the NCES (ELSI) export, give its long column headers short names,
# and keep only the four columns needed for the comparison.
race_df = pd.read_csv('../input/elsi-csv-export/ELSI_csv_export_6376565703827256183522.csv', encoding='cp1252')

_nces_short_names = {
    'State Name [Public School] Latest available year' : 'state',
    'Total Students All Grades (Excludes AE) [Public School] 2019-20' : 'total',
    'Hispanic Students [Public School] 2019-20' : 'hispanic',
    'Black or African American Students [Public School] 2019-20' : 'black',
}
race_df = race_df.rename(columns=_nces_short_names)

# Rows without a state name are not usable for a per-state comparison
race_df = race_df.dropna(subset=['state'])

# Capitalise every word of the state name; split()/join also removes surrounding
# and repeated whitespace, so no extra strip() is required.
race_df['state'] = race_df['state'].map(lambda _name : " ".join(_word.capitalize() for _word in _name.split()))

_cols = ['total', 'state', 'hispanic', 'black']

race_df = race_df[_cols]

def col_clean(col):
    """Convert a column of formatted count strings to integers.

    Every character that is not an ASCII digit (thousands separators, quotes,
    footnote symbols, ...) is removed from each cell before conversion. Cells
    left empty by that removal, and missing cells, become 0.

    Parameters
    ----------
    col : pandas.Series
        Column to clean. It is not modified; its index and name are preserved.

    Returns
    -------
    pandas.Series
        Integer Series aligned on the index of `col`.
    """
    def _to_int(raw):
        if not isinstance(raw, str):
            # Missing cell -> 0; a cell that is already a number is just converted
            return 0 if pd.isna(raw) else int(raw)
        digits = re.sub(r'[^0-9]', '', raw)
        return int(digits) if digits else 0

    # map() works on values, not labels, so a Series whose index is not 0..n-1
    # (e.g. after a dropna) is handled correctly and the input is left untouched.
    return col.map(_to_int).astype(int)

# Turn the three count columns into integers. assign() builds a new frame, so nothing is
# written into the column selection made above (no SettingWithCopyWarning).
race_df = race_df.assign(
    hispanic=col_clean(race_df['hispanic']),
    black=col_clean(race_df['black']),
    total=col_clean(race_df['total']),
)

# One row per state, then the share of each group in the total enrolment (in %)
race_df = race_df.groupby(by='state').sum().reset_index()

race_df['% hispanic'] = (race_df['hispanic'] * 100 / race_df['total']).round(2)
race_df['% black'] = (race_df['black'] * 100 / race_df['total']).round(2)

race_df['% black/hispanic'] = race_df['% hispanic'] + race_df['% black']

# Same per-state view of the provided data: median of the district values
district_dff = district_df.dropna(subset=['state'])[['state', 'pct_black/hispanic', 'pct_free/reduced']]
district_dff = district_dff.groupby(by='state').median().reset_index()
# Same word capitalisation as the NCES state names so that the join keys match
district_dff['state'] = district_dff['state'].apply(lambda x : " ".join(e.capitalize() for e in x.split()))

# The join key is the single 'state' column (it was previously listed twice)
joined = pd.merge(district_dff, race_df, how="left", on="state")

joined = joined[['state', 'pct_black/hispanic', '% black/hispanic']]

joined['difference'] = joined['% black/hispanic'] - joined['pct_black/hispanic']

joined = joined.rename(columns={'pct_black/hispanic' : 'provided % black/hispanic',
                                '% black/hispanic' : 'NCES % black/hispanic'})

joined

### OK, we seem to have some discrepancies here. I'll choose to go with the data we are provided with.

### Let's put those infos on maps

#### In order to build a Plotly Choropleth map we need to change a couple of things, like abbreviating the state names. I found a Python dictionary that will save us some time right [here](https://gist.github.com/rogerallen/1583593)

#### Infos on how to do a Plotly Choropleth map can be found [here](https://plotly.com/python/choropleth-maps/#choropleth-maps-with-gochoropleth)

In [None]:
# Full state / territory name -> two-letter postal code.
# The names use the same capitalisation as the 'state' columns built in this notebook
# (every word capitalised, hence 'District Of Columbia').
us_state_abbrev = dict([
    ('Alabama', 'AL'), ('Alaska', 'AK'), ('American Samoa', 'AS'), ('Arizona', 'AZ'),
    ('Arkansas', 'AR'), ('California', 'CA'), ('Colorado', 'CO'), ('Connecticut', 'CT'),
    ('Delaware', 'DE'), ('District Of Columbia', 'DC'), ('Florida', 'FL'), ('Georgia', 'GA'),
    ('Guam', 'GU'), ('Hawaii', 'HI'), ('Idaho', 'ID'), ('Illinois', 'IL'),
    ('Indiana', 'IN'), ('Iowa', 'IA'), ('Kansas', 'KS'), ('Kentucky', 'KY'),
    ('Louisiana', 'LA'), ('Maine', 'ME'), ('Maryland', 'MD'), ('Massachusetts', 'MA'),
    ('Michigan', 'MI'), ('Minnesota', 'MN'), ('Mississippi', 'MS'), ('Missouri', 'MO'),
    ('Montana', 'MT'), ('Nebraska', 'NE'), ('Nevada', 'NV'), ('New Hampshire', 'NH'),
    ('New Jersey', 'NJ'), ('New Mexico', 'NM'), ('New York', 'NY'), ('North Carolina', 'NC'),
    ('North Dakota', 'ND'), ('Northern Mariana Islands', 'MP'), ('Ohio', 'OH'), ('Oklahoma', 'OK'),
    ('Oregon', 'OR'), ('Pennsylvania', 'PA'), ('Puerto Rico', 'PR'), ('Rhode Island', 'RI'),
    ('South Carolina', 'SC'), ('South Dakota', 'SD'), ('Tennessee', 'TN'), ('Texas', 'TX'),
    ('Utah', 'UT'), ('Vermont', 'VT'), ('Virgin Islands', 'VI'), ('Virginia', 'VA'),
    ('Washington', 'WA'), ('West Virginia', 'WV'), ('Wisconsin', 'WI'), ('Wyoming', 'WY'),
])

# Per-state mean of the district-level indicators, plus the postal code Plotly needs
state_map = district_df.dropna(subset=['state'])[['state', 'pct_black/hispanic', 'pct_free/reduced', 'pp_total_raw']]
state_map = state_map.groupby(by='state').mean().reset_index()
# A state missing from the dictionary raises KeyError rather than silently vanishing from the map
state_map['state_abrv'] = state_map['state'].apply(lambda x : us_state_abbrev[x])


def _state_choropleth(frame, column):
    """Return a USA choropleth figure colouring each state by `frame[column]`.

    `frame` must hold the two-letter state codes in 'state_abrv'. The figure title is
    "Mean <column> per State". No `zmax` is set: the percentage columns are stored on a
    0-100 scale, so the former `zmax=1` did not describe the data; the colour range is
    left to be derived from the values.
    """
    figure = go.Figure(
        go.Choropleth(
            locations=frame["state_abrv"],
            z=frame[column],
            locationmode='USA-states',  # `locations` holds two-letter state codes
            marker_line_color='white',
            geo='geo',
            colorscale=px.colors.sequential.Teal,
        )
    )
    figure.update_layout(title_text=f"Mean {column} per State", geo_scope='usa')
    return figure


fig = _state_choropleth(state_map, "pct_black/hispanic")
fig2 = _state_choropleth(state_map, "pct_free/reduced")

display(fig, fig2)


### The correlation between those 2 columns is pretty clear

### OK, let's take a look at the products: which ones are the most popular?
### Let's create a dictionary which will have as keys the IDs of all available products and as values the number of times each product is mentioned in the engagement data files.


In [None]:
# List of products on the products_info file
all_products_id = _products['LP ID'].to_list()

# Initiate the dictionary with the product ID as keys, all values set to 0
products_hits_dic = {k : 0 for k in all_products_id}

_engagement_dir = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'

# For each file in the engagement_data folder
for _file in os.listdir(_engagement_dir):
    # Only the 'lp_id' column is needed to count rows per product, so only that column is read.
    # (A groupby().sum() over the whole frame also sums the other columns, including the
    # 'time' strings, just to obtain a row count.)
    _lp_ids = pd.read_csv(os.path.join(_engagement_dir, _file), usecols=['lp_id'])['lp_id']
    # Rows without a product id cannot be attributed to a product
    _hits_per_product = _lp_ids.dropna().astype(int).value_counts()

    # Products that are not listed in products_info are ignored
    for _lp_id, _hits in _hits_per_product.items():
        if _lp_id in products_hits_dic:
            products_hits_dic[_lp_id] += int(_hits)
 


In [None]:
# Turn the hit counter into a table (one row per product id, most hits first)
# and attach the product details from the products table.

_top_products = (
    pd.DataFrame.from_dict(products_hits_dic, orient='index')
    .reset_index()
    .rename(columns={'index' : 'LP ID', 0 : 'hits'})
    .sort_values(by='hits', ascending=False)
)

_top_products_merged = _top_products.merge(_products, on="LP ID")
_top_products_merged

In [None]:
# Horizontal bars for the 30 products with the most hits, the most popular one on top
_thirty_most_hit = _top_products_merged.head(30)

fig = (
    px.histogram(_thirty_most_hit, x="hits", y="Product Name", height=900)
    .update_layout(paper_bgcolor="darkgray", title="Most popular Learning tools", title_x=0.5)
    .update_xaxes(title="")
    .update_yaxes(title="", autorange='reversed')
    .update_traces(hovertemplate='<i>Total %{y} : </i>' + '%{x}')
)

fig.show()

## Engagement of students

### Time to look at the engagement of the students for different Essential Functions items
#### Remember how we split the 'Primary Essential Function' column into 2 sub-categories? That's what we'll use here.

In [None]:
def nice_graph(category : str, _nb : int,
               engagement_dir : str = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'):
    """Build the ranking and engagement figures for one 'sub pef' category.

    Parameters
    ----------
    category : str
        Value of the 'sub pef' column of the module-level `_products` table.
    _nb : int
        Number of top products (by number of engagement rows) to show.
    engagement_dir : str, optional
        Folder holding the engagement CSV files. Defaults to the Kaggle input path.

    Returns
    -------
    tuple
        (ranking figure, engagement figure): a horizontal bar chart of the row count of the
        `_nb` most frequent products, and a line chart of 'engagement_index' over time for
        those same products.
    """
    _category = _products[_products['sub pef'] == str(category)]
    _category_lp_id = _category['LP ID'].astype(int).to_list()
    # product id -> product name, for the products of this category
    _category_dic = dict(zip(_category_lp_id, _category['Product Name'].to_list()))

    # Collect the matching rows of every file and concatenate once at the end:
    # concatenating inside the loop copies the accumulated frame at every iteration.
    _frames = []
    for _file in os.listdir(engagement_dir):
        df = pd.read_csv(os.path.join(engagement_dir, _file))
        _frames.append(df[df['lp_id'].isin(_category_lp_id)])

    if _frames:
        _category_data = pd.concat(_frames, ignore_index=True)
    else:
        # Empty folder: keep the expected columns (given as a list, a set is not a valid `columns` value)
        _category_data = pd.DataFrame(columns=['time', 'lp_id', 'pct_access', 'engagement_index'])

    # Rows with any missing value are discarded, then rows are ordered chronologically
    _category_data = _category_data.dropna()
    _category_data = (
        _category_data
        .assign(time=pd.to_datetime(_category_data['time']))
        .sort_values(by='time', ascending=True)
        .reset_index(drop=True)
    )
    _category_data = _category_data.assign(
        lp_id=_category_data['lp_id'].astype(int),
        engagement_index=_category_data['engagement_index'].astype(int),
    )
    _category_data['product name'] = _category_data['lp_id'].map(_category_dic)

    # Number of engagement rows per product, most frequent first
    _pvt = _category_data[['product name', 'lp_id']].pivot_table(index='product name', values='lp_id', aggfunc='count').sort_values(by='lp_id', ascending=False).reset_index()
    _pvt = _pvt.rename(columns={'product name' : category, 'lp_id' : 'hits'})

    _top_names = _pvt[category][:_nb].to_list()

    _pvt_fig = px.histogram(_pvt.head(_nb), x="hits", y=category)
    _pvt_fig.update_layout(paper_bgcolor="darkgray", title=category, title_x=0.5)
    _pvt_fig.update_xaxes(title="")
    _pvt_fig.update_yaxes(title="", autorange='reversed')
    _pvt_fig.update_traces(hovertemplate='<i>Total %{y} : </i>' + '%{x:.0f}')

    # NOTE(review): the rows come from every file in the folder, so one product can have several
    # points per date on this line chart; confirm whether a per-date aggregate was intended
    # (the hover label says "Total").
    _category_fig = px.line(_category_data[_category_data['product name'].isin(_top_names)], x="time", y="engagement_index", color='product name')
    _category_fig.update_layout(title=f'{category} engagement')
    _category_fig.update_xaxes(title='')
    _category_fig.update_yaxes(title='Engagement Index')
    _category_fig.update_traces(hovertemplate=
                                              '<br><b>Total</b>: %{y:.0f}<br>'
                                              + '<br><b>Date</b>: %{x}<br>'
                                              )

    return _pvt_fig, _category_fig
    

### These are the top 'Primary Essential Function' items in the 'sub pef' column

In [None]:
# First five rows of `_sub_pef`, which is ordered by decreasing 'count'
_sub_pef.head(5)

In [None]:
# Number of products shown per category; each call returns (ranking figure, engagement figure)
_top_n = 5

study_tools_ranking, study_tools_graph = nice_graph('Study Tools', _top_n)
virtual_classromm_ranking, virtual_classromm_graph = nice_graph(
    'Virtual Classroom, Video Conferencing & Screen Sharing', _top_n
)
c_c_c_ranking, c_c_c_graph = nice_graph('Content Creation & Curation', _top_n)
s_r_r_ranking, s_r_r_graph = nice_graph('Sites, Resources & Reference', _top_n)
d_l_p_ranking, d_l_p_graph = nice_graph('Digital Learning Platforms', _top_n)

### I chose to only display the top 5 products in each category to spare our CPUs, but if you want to see more just type in the number of products you'd like displayed as the second parameter of the 'nice_graph' function in the cell above

### If you're interested in looking at the engagement for one particular product, double-click on its name in the 'product name' legend. Add as many as you want if you want to compare them.

In [None]:
# Show the two figures returned by nice_graph for 'Study Tools'
display(study_tools_ranking, study_tools_graph) 

In [None]:
# Show the two figures returned by nice_graph for 'Virtual Classroom, Video Conferencing & Screen Sharing'
display(virtual_classromm_ranking, virtual_classromm_graph)

In [None]:
# Show the two figures returned by nice_graph for 'Content Creation & Curation'
display(c_c_c_ranking, c_c_c_graph)

In [None]:
# Show the two figures returned by nice_graph for 'Sites, Resources & Reference'
display(s_r_r_ranking, s_r_r_graph)

In [None]:
# Show the two figures returned by nice_graph for 'Digital Learning Platforms'
display(d_l_p_ranking, d_l_p_graph) 