In [None]:
#Import libraries
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.patches import Rectangle
import matplotlib as mpl

import geopandas

%matplotlib inline

<span style='font-family:Lucida; color:slategray; font-size: 18px;'>
    <h1> COVID-19 Impact on Distance Learning</h1>
    <h2> Introduction </h2>
    <p>Due to COVID-19, many schools across the world were shut, leaving children and students out of their classrooms.
As a result, education has changed drastically to e-learning, whereby teaching is undertaken remotely and through digital platforms. Some research articles highlight that online learning proves to increase retention of information, and takes less time, meaning the changes might be here to stay. However, other research shows that the traditional education access is far from equitable and the switch to e-learning has only increased the imbalance.</p>
    <p> This analysis explores the state of digital learning in 2020 in the United States and how the engagement in digital learning relates to factors such as district demographics, internet access, and state/national level policies and events.</p>
</span>

In [None]:
#Import libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.patches import Rectangle
import matplotlib as mpl
import geopandas

%matplotlib inline 

In [None]:
#Load data
#keep_default_na=False stops pandas from turning both the literal "NaN" strings and the
#blank cells of the file into missing values, so the two stay distinguishable
districts = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv', keep_default_na=False)
products = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv')
#One engagement file per district. The frames are collected in a list and concatenated
#once: DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call
engagement_frames = []
for district in districts['district_id'].unique():
    temp_district = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/{}.csv'.format(district), parse_dates=[0])
    temp_district['district_id'] = district #the file itself does not carry the district id
    engagement_frames.append(temp_district)
#Like the former append chain, concat keeps each file's own row index
engagement_data = pd.concat(engagement_frames) if engagement_frames else pd.DataFrame()
us_regions = pd.read_csv('https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv')

#Historical county connection data from the same website as competition data https://www.fcc.gov/form-477-county-data-internet-access-services
states_sample = districts['state'].unique() #States present in study data
def get_internet(year):
    """Count the counties per connection-ratio bin for one year.

    Reads ``county_connections_dec_<year>.csv``, keeps the rows whose ``statename``
    is one of ``states_sample`` and bins their ``ratio`` into (0, 0.18], (0.18, 1]
    and (1, 2].

    Parameters
    ----------
    year : int or str
        Year inserted into the file name.

    Returns
    -------
    pandas.Series
        Number of non-null ``countycode`` entries per ratio bin, indexed by bin.
        All three bins are always present (empty ones count 0).
    """
    file = '../input/countyconnections/county_connections_dec_{}.csv'.format(year)
    internet = pd.read_csv(file)
    #Work on a copy: adding a column to a filtered slice triggers SettingWithCopyWarning
    internet_filtered = internet[internet['statename'].isin(states_sample)].copy()
    internet_filtered['ratio_bin'] = pd.cut(internet_filtered['ratio'],[0,0.18,1,2])
    #observed=False keeps empty bins, so callers can rely on three rows in bin order
    return internet_filtered.groupby('ratio_bin', observed=False)['countycode'].count()
internet_connection = pd.DataFrame()
for year in range(2016,2019):
    internet_connection[year] = get_internet(year)

In [None]:
#CLEAN DATA
#DISTRICTS
#Blank cells (empty strings) become the string '0' in every column
districts = districts.replace('', '0')

#Readable names for the interval codes used in the file
pct_conversion = {
    #share of students
    '[0, 0.2[':'0-20%',
    '[0.2, 0.4[':'20-40%',
    '[0.4, 0.6[':'40-60%',
    '[0.6, 0.8[':'60-80%',
    '[0.8, 1[':'80-100%',
    #connections per household
    '[0.18, 1[':'18-100%',
    '[1, 2[':'100-200%',
    #per-pupil expenditure
    '[4000, 6000[':'4-6k',
    '[6000, 8000[':'6-8k',
    '[8000, 10000[':'8-10k',
    '[10000, 12000[':'10-12k',
    '[12000, 14000[':'12-14k',
    '[14000, 16000[':'14-16k',
    '[16000, 18000[':'16-18k',
    '[18000, 20000[':'18-20k',
    '[20000, 22000[':'20-22k',
    '[22000, 24000[':'22-24k',
    '[32000, 34000[':'32-34k',
    #literal "NaN" strings of the file
    'NaN':'NA',
}
for range_column in ['pct_black/hispanic', 'pct_free/reduced', 'county_connections_ratio', 'pp_total_raw']:
    districts[range_column] = districts[range_column].replace(pct_conversion)

#Attach the census region columns (used for the map and for aggregating by region);
#the key column is renamed first so both tables share the name 'state'
us_regions = us_regions.rename(columns={'State': 'state'})
districts = districts.merge(us_regions, how='left', on='state')

#PRODUCTS
#Use the same key name as the engagement files so the tables can be merged
products = products.rename(columns={'LP ID': 'lp_id'})
#'Primary Essential Function' holds "<category code> - <function>": split at the first
#' - ' and spell out the category codes
function_conversion = {'LC':'Learning & Curriculum', 'CM':'Classroom Management', 'SDO':'School & District Operations'}
function_parts = products['Primary Essential Function'].str.split(pat=' - ', n=1, expand=True)
products['Function Category'] = function_parts[0].replace(function_conversion)
products['Primary Essential Function'] = function_parts[1]

#ENGAGEMENT DATA
engagement_data = (
    engagement_data
    .fillna(0)                     #missing values count as 0
    .astype({'lp_id': 'int64'})    #integer product id, same type as in the products table
    .assign(dayofweek=lambda frame: frame['time'].dt.dayofweek + 1)  #1 = Monday ... 7 = Sunday
)

#One wide table: every engagement row with its district and product attributes
engagement_with_districts = engagement_data.merge(districts, how='left', on='district_id')
all_data = engagement_with_districts.merge(products, how='left', on='lp_id')

<span style='font-family:Lucida; color:slategray; font-size: 18px;'>
    <h2>Scope</h2>
    <p>The analysis is based on information from <strong>233 districts in 23 states</strong>. Unfortunately, the available study data was not collected evenly from all the states, and the origin of part of the data is missing (see charts 1.1 and 1.2).<br>
        Connecticut and Utah are the most represented states, however they are not in the list of top 10 representative states in US.</p>
</span>

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(20,10))

#Districts per state, largest group first, with the two-letter state code attached
districts_per_state = districts.groupby('state')['district_id'].count().sort_values(ascending=False)
districts_count = pd.merge(districts_per_state, us_regions[['state','State Code']], on='state', how='left')

#Cell colours: shades of purple scaled to the district count
district_totals = districts_count['district_id']
normal = mpl.colors.Normalize(district_totals.min()-1, district_totals.max()+1)
colors = plt.cm.Purples(normal(district_totals))
#NOTE(review): greys out the first (largest) row, presumably the group of districts
#without a state - confirm that this group is always the largest one
colors[0] = [0.5,0.5,0.5,1]

#Table with one row per state; both cells of a row share the row's colour
the_table = ax1.table(
    cellText=districts_count[['state','district_id']].values,
    colLabels=['State','# districts'],
    cellColours=[(shade, shade) for shade in colors],
    colWidths=[0.4, 0.3],
    cellLoc='center',
    loc='center')
the_table.auto_set_font_size(False)
the_table.set_fontsize(12)
the_table.scale(1,1.7)
ax1.axis('off')
ax1.axis('tight')
ax1.set_title('1.1 States Included in Study Data', fontsize = 20, color='dimgrey')

#State outlines with the district count joined on the state code
states = geopandas.read_file('../input/usshapefile/cb_2018_us_state_20m.shp')
states = states.rename(columns={'STUSPS':'State Code'})
states = states.merge(districts_count[['State Code', 'district_id']], on='State Code', how='left')
#Leave out the states that are not on the map
states = states[~states['NAME'].isin(['Alaska', 'Hawaii', 'Puerto Rico'])]

#States without study data in white, the others shaded by district count
in_study = states['district_id'].notna()
ax2 = states[~in_study].to_crs(epsg=5071).plot(
    ax=ax2,
    color='white',
    edgecolor='lightgrey',
    linewidth=0.5)
#NOTE(review): vmax=57 is hard-coded - presumably the largest district count in the table; confirm
ax2 = states[in_study].to_crs(epsg=5071).plot(
    ax=ax2,
    column='district_id',
    cmap='Purples',
    vmax=57,
    edgecolor='black',
    linewidth=1,
    legend = False)
ax2.axis('off')
ax2.set_title('1.2 Map of States Included in Study Data', fontsize = 20, color='dimgrey')
plt.show()

<span style='font-family:Lucida; color:slategray; font-size: 18px;'>
    <h2>The picture of digital connectivity in 2020</h2>
    <p>The essential factor enabling efficient online learning is access to the internet. In the study data, connectivity is expressed as the percentage of private broadband connections faster than 200 kbps relative to all households in the district.<br>
        The study data show that in the majority of districts households are well connected regardless of the locale (city, suburb, town, rural) they lie in (see chart 2.1). This is also in line with the conclusion of the <a href="https://itif.org/publications/2020/07/13/lessons-pandemic-broadband-policy-after-covid-19">Information Technology and Innovation Foundation</a> that, although internet traffic increased in 2020, the network could accommodate the higher demand thanks to the existing infrastructure.</p>
    <ul>Other findings related to internet connectivity:
        <li>In majority of the studied districts the percentage of '>200kbs connections' is between 18 to 100% </li>
        <li>The internet connectivity has not evolved much in recent years based on <a href="https://www.fcc.gov/form-477-county-data-internet-access-services">Federal Communications Commission data</a> (see chart 2.2)</li>
        <li>Most of the study data comes from suburb areas (see chart 2.3), which reflects the fact <a href="https://www.huduser.gov/portal/pdredge/pdr-edge-frm-asst-sec-080320.html">most Americans live in suburb neighbourhoods</a> </li>
        <li> There is no relation between household economic status and access to the internet. Intuitively, we could expect lower-income households to have worse broadband. The data shows that even lower-income households have the same connection as others (see chart 2.4). In this case the household income status is identified by the students' eligibility for free or reduced-price lunch. We can see that even districts where the majority of students are eligible for the lunch subsidy have an internet speed of >200 kbps</li>
    </ul>
</span>

In [None]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(18,14))

#Prepare data for Internet connectivity: districts per connection ratio (rows) and locale (columns)
connection = districts.pivot_table(index='county_connections_ratio', columns='locale', values='district_id', aggfunc=np.count_nonzero, fill_value=0)[['City','Suburb','Town','Rural','NaN']]
labels = ['0-18%', '18%-100%', '>100%', 'No data']
#The pivot table sorts its string index alphabetically, which puts '100-200%' before
#'18-100%'. Picking rows by position therefore attached the '18%-100%' and '>100%'
#legend entries to each other's bars, so the rows are selected by name, in the order of
#`labels`. Categories absent from the data show up as zero-height bars.
#NOTE(review): '0' is the placeholder the cleaning step writes into blank cells, so the
#'0-18%' entry may really describe districts with a blank ratio - confirm
ratio_order = ['0', '18-100%', '100-200%', 'NA']
ratio_colors = ['rebeccapurple', 'mediumpurple', 'magenta', 'firebrick']
connection_shown = connection.reindex(index=ratio_order, fill_value=0)

x1 = np.arange(5)  # the label locations
width = 0.25  # the width of the bars

#Plot connectivity: one bar per ratio category inside every locale group
for position, (ratio, label, colour) in enumerate(zip(ratio_order, labels, ratio_colors)):
    ax1.bar(x1 + position*width, connection_shown.loc[ratio].values, width=width, label=label, color=colour)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax1.set_ylabel('Number of school districts in sample data', 
             fontsize = 12,
             color='grey')
ax1.set_title('2.1 Households with Internet Connection Speed > 200kbs', 
             fontsize = 14,
             color='slategrey')
ax1.set_xticks(x1+0.25)
ax1.set_xticklabels(['City','Suburb','Town','Rural','NaN'], fontsize=14)
ax1.tick_params(colors='grey')
ax1.legend(labelcolor='grey', fontsize=14)
ax1.set_facecolor('ghostwhite')
ax1.spines[['top','right','bottom','left']].set_visible(False)
ax1.text(3,60, 'Most of households in districts\nhave broadband connection\nof >200kbs', fontsize=14, ha='center', color='red', backgroundcolor='white')


#Prepare data for connectivity evolution 2016-2018
#internet_connection has one row per ratio bin (in bin order) and one column per year
x2 = np.arange(3)  # the label locations
width = 0.25  # the width of the bars

for position, colour in enumerate(['rebeccapurple', 'mediumpurple', 'magenta']):
    ax2.bar(x2 + position*width, internet_connection.values[position], width=width, label=labels[position], color=colour)

# Add some text for labels, title and custom x-axis tick labels, etc.
#get_internet counts 'countycode' entries, so the bars show counties, not school districts
ax2.set_ylabel('Number of counties', 
             fontsize = 12,
             color='grey')
ax2.set_title('2.2 Broadband Connections over Time', 
             fontsize = 14,
             color='slategrey')
ax2.set_xticks(x2+width)
ax2.set_xticklabels(['2016','2017','2018'], fontsize=14)
ax2.tick_params(colors='grey')
ax2.legend(labelcolor='grey', fontsize=14)
ax2.set_facecolor('ghostwhite')
ax2.spines[['top','right','bottom','left']].set_visible(False)
ax2.text(1.5,800, 'Most households already have\na good broadband connection\nand there has been not much evolution\nin the past years', fontsize=14, ha='center', color='red', backgroundcolor='white')

#Plot locale composition
#Districts per locale, in a fixed display order
locale_count = (
    districts['district_id']
    .groupby(districts['locale'])
    .count()
    .reindex(['City','Suburb','Town','Rural','NaN'])
)
locale = locale_count.index
values = locale_count.values
explode = (0, 0, 0, 0, 0.1)  #pull the last slice ('NaN') out of the pie
colors_locale = ['hotpink','mediumvioletred','pink','deeppink','firebrick']

ax3.axis('equal')
ax3.pie(values,
       labels = locale,
       colors=colors_locale,
       explode=explode,
       startangle=90,
       counterclock=False,
       autopct='%1.1f%%',
       pctdistance=0.8,
       textprops={'color':"dimgray", 'size':"large"})
ax3.set_title('2.3 Percentage of Data per Locale in Study Data', fontsize = 14, color='slategrey')
ax3.text(1,0.2, 'The majority of Americans lives\nin suburbs, therefore\nit is most represented locale\nin the study data', fontsize=14, ha='center', color='red', backgroundcolor='white')

#Plot connectivity based on economical status
#Districts per connection ratio (rows) and share of students on free/reduced lunch (columns)
connection_economy = districts.pivot_table(index='county_connections_ratio', columns='pct_free/reduced', values='district_id', aggfunc=np.count_nonzero, fill_value=0)
#Rows and columns are selected by name instead of by position: the alphabetically sorted
#index puts '100-200%' before '18-100%', which attached the '18%-100%' and '>100%' legend
#entries to each other's bars. The 'NA' lunch group is left out, as before.
#NOTE(review): '0' is the placeholder the cleaning step writes into blank cells - confirm
#that the '0-18%' legend entry and the '0' tick describe what is intended
ratio_rows = ['0', '18-100%', '100-200%']
lunch_groups = ['0', '0-20%', '20-40%', '40-60%', '60-80%', '80-100%']
connection_economy_shown = connection_economy.reindex(index=ratio_rows, columns=lunch_groups, fill_value=0)

x4 = np.arange(6)  # the label locations
width = 0.25  # the width of the bars

for position, (ratio, colour) in enumerate(zip(ratio_rows, ['rebeccapurple', 'mediumpurple', 'magenta'])):
    ax4.bar(x4 + position*width, connection_economy_shown.loc[ratio].values, width=width, label=labels[position], color=colour)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax4.set_xlabel('Percentage of students egligible for free or reduced-price lunch\nhigher percentage group - less income households', 
             fontsize = 12,
             color='grey')
ax4.set_ylabel('Number of school districts in sample data', 
             fontsize = 12,
             color='grey')
ax4.set_title('2.4 Broadband Connectivity based on Household Economical Status', 
             fontsize = 14,
             color='slategrey')
ax4.set_xticks(x4+width)
ax4.set_xticklabels(lunch_groups, fontsize=14)
ax4.tick_params(colors='grey')
ax4.legend(labelcolor='grey', fontsize=14)
ax4.set_facecolor('ghostwhite')
ax4.spines[['top','right','bottom','left']].set_visible(False)
ax4.text(4.5,25, 'The economical status\nof households does not impact\nbroadband access', fontsize=14, ha='center', color='red', backgroundcolor='white')

fig.set_facecolor('white')

<span style='font-family:Lucida; color:slategray; font-size: 18px;'>
    <h2> The Effect of COVID-19 on Distance Learning</h2>
    <p>In this analysis the effect on distance learning is measured by the <strong> engagement index</strong>, which is the total number of page-load events per one thousand students of a given learning service on a given day. Chart 3.1 represents the engagement index for the school districts for each day in 2020.</p>
    <ul>
        <li>Four periods in 2020 can easily be identified:<br>
            <strong>(I)</strong> - School year 2019/2020 before pandemic spread and school closures<br>
            <strong>(II)</strong> - School year 2019/2020 after pandemic school closures<br>
            <strong>(III)</strong> - Holiday period<br>
            <strong>(IV)</strong> - School year 2020/2021 before seasonal holiday<br></li><br>
        <li>Overall, the averaged engagement index increased by <strong> 88% </strong> in the period right after school closures (period (II))<br>and <strong> 164% </strong> after the holidays (period (IV)) compared to the pre-pandemic period. </li>
        <br>
        <li>The waviness of the line is due to the weekday/weekend learning pattern.</li><br>
        <p><em>Mid February is considered as school closures date based on <a href="https://www.edweek.org/leadership/the-coronavirus-spring-the-historic-closing-of-u-s-schools-a-timeline/2020/07">EducationWeek</a></em></p>
    </ul>
</span>

In [None]:
#Prepare current status graph
#Engagement index summed over all districts and products, one value per day
engagement_byday = all_data['engagement_index'].groupby(all_data['time']).sum()

#Important dates for analysis
start_1920 = datetime(2020, 1, 2)    #Restart school year 2019-2020 after New Year break
start_covid = datetime(2020, 2, 17)  #Date of school closures due to covid
end_1920 = datetime(2020, 6, 6)      #End of 2019-2020 school year
start_2021 = datetime(2020, 8, 31)   #Start of 2020-2021 school year
end_2021 = datetime(2020, 12, 21)    #Start of Christmas break

#Function to calculate average over period
def calculate_average_period(df, start_date, end_date):
    """Return the mean of the entries of ``df`` dated within [start_date, end_date).

    ``df`` is indexed by date; ``start_date`` is included, ``end_date`` is not.
    """
    timestamps = df.index
    in_period = (timestamps >= start_date) & (timestamps < end_date)
    return df[in_period].mean()

#Average daily engagement index in periods I, II and IV
avg_1920_before = calculate_average_period(engagement_byday, start_1920, start_covid)
avg_1920_after = calculate_average_period(engagement_byday, start_covid, end_1920)
avg_2021_after = calculate_average_period(engagement_byday, start_2021, end_2021)

#Increase over the pre-pandemic average in whole percent (int() drops the decimals)
delta_1920 = int((avg_1920_after-avg_1920_before)/avg_1920_before*100)
delta_2021 = int((avg_2021_after-avg_1920_before)/avg_1920_before*100)

fig, ax = plt.subplots(figsize=(20,8))
#Daily engagement curve
ax.plot(engagement_byday,color='indigo',linewidth=0.5)

#Horizontal bar at the average of every school period
period_averages = [(start_1920, start_covid, avg_1920_before),
                   (start_covid, end_1920, avg_1920_after),
                   (start_2021, end_2021, avg_2021_after)]
for period_start, period_end, period_avg in period_averages:
    ax.plot([period_start,period_end],[period_avg,period_avg],color='rebeccapurple',linewidth=3)

#Percentage increase: label halfway up a red bar that runs from the old to the new average
increases = [(start_covid, avg_1920_after, delta_1920),
             (start_2021, avg_2021_after, delta_2021)]
for when, new_avg, delta in increases:
    ax.text(when,avg_1920_before + (new_avg-avg_1920_before)/2,s='↑ {}%'.format(delta),fontsize=24,ha='right',va='center',color='firebrick')
for when, new_avg, _ in increases:
    ax.plot([when,when],[avg_1920_before, new_avg],linewidth=3,color='firebrick')

#Dotted verticals at the period boundaries and a dashed pre-pandemic baseline
boundaries = [(start_covid, avg_1920_after+6000000),
              (end_1920, avg_1920_after+1000000),
              (start_2021, avg_2021_after+1000000),
              (end_2021, avg_2021_after+1000000)]
for when, top in boundaries:
    ax.plot([when,when],[0, top],linewidth=1.5,linestyle='dotted',color='grey')
ax.plot([start_covid,start_2021],[avg_1920_before, avg_1920_before],linewidth=1.5,linestyle='dashed',color='grey')

#School-year boxes below the zero line
school_years = [(start_1920, end_1920, 'School year 2019-2020'),
                (start_2021, end_2021, 'School year 2020-2021')]
for year_start, year_end, caption in school_years:
    ax.add_patch(Rectangle([year_start,0], year_end-year_start, -1900000, facecolor='lavender', edgecolor='rebeccapurple'))
    ax.text(year_start+(year_end - year_start)/2, -1900000/2, s=caption, fontsize=10, ha='center', va='center', color='rebeccapurple')

#Closure marker and period numerals, each numeral centred over its period
ax.text(start_covid, 19000000, s='School closures\ndue to COVID-19\n|\nV', fontsize=14, ha='center', va='top', color='firebrick')
numerals = [(start_1920, start_covid, '(I)'),
            (start_covid, end_1920, '(II)'),
            (end_1920, start_2021, '(III)'),
            (start_2021, end_2021, '(IV)')]
for left, right, numeral in numerals:
    ax.text(left+(right-left)/2, 22000000, s=numeral, fontsize=20, ha='center', va='top', color='rebeccapurple')
ax.text(datetime.fromisoformat('2020-07-31'), 24000000, s='Periods', fontsize=20, ha='center', va='top', color='rebeccapurple')

#Format main plot
ax.set_facecolor('ghostwhite')
ax.spines[['top','right','bottom','left']].set_visible(False)

#Format Axes: one tick per month, a few days of margin on both sides
ax.set_title('3.1 Engagement index in selected school districts in 2020', fontsize = 20, color='slategrey')
ax.set_ylabel('Total page-load events per 1000 students', fontsize = 12, color='grey')
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
margin = timedelta(days = 5)
ax.set_xlim(engagement_byday.index.min()-margin, engagement_byday.index.max()+margin)
ax.set_ylim(-2000000, 25000000)
ax.tick_params(colors='grey')
fig.set_facecolor('white')
plt.show()

<span style='font-family:Lucida; color:slategray; font-size: 18px;'>
    <p> Let's look in more detail at what type of online services students used the most (see chart 3.2). There are four categories:</p>
    <ul>
        <li>Learning & Curriculum</li>
        <li>School & District Operations</li>
        <li>Classroom Management</li>
        <li>LC/CM/SDO - combination of the three above categories</li>
    </ul>
</span>

In [None]:
#Calculate data for function category change due to COVID-19
function_categories=['Learning & Curriculum','School & District Operations','Classroom Management','LC/CM/SDO']
function_categories_colors = ['indigo','rebeccapurple','mediumorchid','plum']

#Sum by date
function_evolution = all_data.pivot_table(index=['time', 'Function Category'], values='engagement_index', aggfunc='sum', fill_value=0)
#Reset to turn index into columns
function_evolution.reset_index(inplace=True)
#Add periods. right=False makes each period [start, end), the same convention as
#calculate_average_period, so the first day of a period belongs to that period
function_evolution['period'] = pd.cut(function_evolution['time'],
       [start_1920,start_covid,end_1920,start_2021,end_2021],
       right=False,
       labels=['I', 'II', 'III', 'IV'])
#Average per period (days outside all periods are dropped); observed=False keeps the
#current behaviour of listing every period
function_evolution_pivot = function_evolution[function_evolution['period'].notna()].pivot_table(index='period', columns='Function Category', values='engagement_index', aggfunc='mean', fill_value=0, observed=False)[function_categories]
#Add an all-zero row labelled '': it leaves room on the chart for the category labels
function_evolution_pivot.loc[''] = [0,0,0,0]

#Periods shown on the chart (III, the holiday period, is left out)
shown_periods = ['I','II','IV']

#Transpose dataframe to calculate percentage: one row per category, one column per period
fep_transposed = function_evolution_pivot.loc[shown_periods].transpose()
#Share of every category within each period. The loop runs over the fixed period list
#because the frame gains columns while it runs
for period in shown_periods:
    fep_transposed['perc_{}'.format(period)] = fep_transposed[period] / fep_transposed[period].sum() * 100
#Calculate percentage change between periods
fep_transposed['perc_ItoII'] = (fep_transposed['II'] - fep_transposed['I'])/fep_transposed['I']*100
fep_transposed['perc_IItoIV'] = (fep_transposed['IV'] - fep_transposed['II'])/fep_transposed['II']*100
#Transpose back to get a table of percentages (truncated to integers)
function_evolution_perc = fep_transposed[['perc_I', 'perc_II', 'perc_IV', 'perc_ItoII', 'perc_IItoIV']].transpose().astype('int')

#Calculate positions for text percentage change: halfway between the centres of the
#category's segment in the two neighbouring stacked bars
fe_transposed = function_evolution_pivot.loc[shown_periods].transpose()
#Height at which each category's segment starts = sum of the categories stacked below it
stack_base = fe_transposed.shift(1).cumsum().fillna(0)
fe_transposed['ItoII'] = (fe_transposed['I'] + fe_transposed['II'])/4 + (stack_base['I'] + stack_base['II'])/2
fe_transposed['IItoIV'] = (fe_transposed['II'] + fe_transposed['IV'])/4 + (stack_base['II'] + stack_base['IV'])/2
fe_positions = fe_transposed.transpose()

#Plot data
fig, ax = plt.subplots(figsize=(18,8))

#Stacked bars; the '' row is an empty slot on the left that holds the category labels
ax = function_evolution_pivot.loc[['', 'I','II','IV']].plot.bar(ax=ax,
                                                      stacked=True,
                                                      width=0.5,
                                                      color=function_categories_colors)

#Category labels in the empty slot, placed relative to the segments of the first row
first_row = function_evolution_pivot.values[0]
label_heights = [600000,
                 first_row[0]+500000,
                 first_row[0]+first_row[1]+500000,
                 first_row[0]+first_row[1]+first_row[2]+1000000]
for name, height, colour in zip(function_categories, label_heights, function_categories_colors):
    ax.text(0,height,name, fontsize=16, ha='center', va='top', color=colour)

#Dotted lines joining the top of every segment in neighbouring bars
shown_values = function_evolution_pivot.loc[['I','II','IV']].values
for i in range(2):
    for j in range(4):
        ax.plot([1.25+i,1.75+i],
            [shown_values[i,:j+1].sum(), shown_values[i+1,:j+1].sum()],
            color=function_categories_colors[j],
            linestyle='dotted',
            linewidth=1.5)

#Percentage change between neighbouring bars (rows 3 and 4 of the helper tables)
for i in range(2):
    for j in range(4):
        height = fe_positions.values[i+3,j]
        if i==0 and j==3:
            height = height*1.05  #this one label is nudged upwards
        ax.text(i+1.5,height, '{}%'.format(function_evolution_perc.values[i+3,j]), ha='center', va='center', color=function_categories_colors[j], fontsize=16)

#Add comment
ax.text(0.3,8000000, 'Classroom Management online service\nwere subject to the largest increase\nin use duting COVID-19', fontsize=14, ha='center', color='red', backgroundcolor='white')

#Set Axis
ax.set_title('3.2 Average Engagement Index per Online Platform Category in each Period in 2020', fontsize = 20,color='slategrey')
ax.set_xlabel('Period', fontsize = 14,color='grey')
ax.set_ylabel('Engagement index',fontsize = 14,color='grey')
ax.set_xticklabels(labels=['', 'I - before COVID','II','IV'], fontsize=14, color='grey', rotation='horizontal')
ax.tick_params(colors='grey', bottom=False)
ax.legend().set_visible(False)
ax.set_facecolor('ghostwhite')
ax.spines[['top','right','bottom','left']].set_visible(False)

fig.set_facecolor('white')

<span style='font-family:Lucida; color:slategray; font-size: 18px;'>
    <p> Chart 3.3 shows top 5 products used by students in each function category:</p>
    <ul>Practically all products usage has increased. However, it is worth mentioning a few examples:
        <li><strong>Google Docs</strong> seems to be used most to share the educational material. This may be due to its simplicity of use and accessibility. It is much more popular than Canvas or Seesaw.</li>
        <li><strong>YouTube</strong> was not used for education before COVID-19, but it seems that after the initial online learning stage a lot of material was uploaded to the platform, and it was used heavily in period IV after the school holiday</li>
        <li>Video conferencing tools like <strong>Meet</strong> and <strong>Zoom</strong> gained in popularity. However, their overall use is much lower than that of other platforms. This may indicate that the major part of online education happens through shared teaching material rather than live online classes</li>
        <li><strong>Kahoot!</strong> is the only product whose usage decreased during the pandemic. The service offers educational games, which were popular before COVID-19 as an optional resource. After the switch to online learning, use of this service was replaced by specific school-distributed materials</li>
    </ul>
</span>

In [None]:
#Calculate the average engagement per product and period, then keep the top 5 products of every function category

#Sum by date
product_evolution = all_data.pivot_table(index=['time', 'Function Category','Product Name'], values='engagement_index', aggfunc='sum', fill_value=0)
#Reset to turn index into columns
product_evolution.reset_index(inplace=True)
#Add periods. right=False makes each period [start, end), the same convention as
#calculate_average_period, so the first day of a period belongs to that period
product_evolution['period'] = pd.cut(product_evolution['time'],
       [start_1920,start_covid,end_1920,start_2021,end_2021],
       right=False,
       labels=['I', 'II', 'III', 'IV'])
#Average per period (days outside all periods are dropped); observed=False keeps the
#current behaviour of listing every period
product_evolution_pivot = product_evolution[product_evolution['period'].notna()].pivot_table(index='period', columns=['Function Category','Product Name'], values='engagement_index', aggfunc='mean', fill_value=0, observed=False)[function_categories]

#From here on product_evolution is the per-product table: one row per product,
#one column per shown period
product_evolution = product_evolution_pivot.loc[['I','II','IV']].transpose().reset_index()

#Five products with the highest period IV average in every category.
#.copy() makes the selection an independent frame, so renaming products below does not
#write into a slice of product_evolution (SettingWithCopyWarning)
top5products = product_evolution.sort_values(by=['Function Category','IV'], ascending=False).groupby('Function Category').head(5).copy()
#Shorten long product names
productname_dict = {'Seesaw : The Learning Journal':'Seesaw',
                    'Chrome Web Store':'Chrome',
                    'Loom - Video Recorder: Screen, Webcam and Mic':'Loom',
                    'Google Classroom':'G Classroom',
                    'Google Calendar':'G Calendar',
                    'Google Hangouts':'G Hangouts',
                    'Google Docs':'G Docs',
                    'Google Sites':'G Sites',
                    'Google Forms':'G Forms',
                    'Google Sheets':'G Sheets',
                    'Google Drive':'G Drive'}
top5products['Product Name'] = top5products['Product Name'].replace(productname_dict)

#Plot Graph
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(20,10), sharey=True)

x = np.arange(5)  # the label locations
width = 0.25  # the width of the bars
plt.ylim(-200000, 3700000)

def plot_top5_products(ax, category, comment):
    """Draw the period I / II / IV bars of the top 5 products of one function category.

    Replaces the per-chart copies of the same plotting code.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    category : int
        Position of the category in ``function_categories``; also selects the bar
        colour from ``function_categories_colors``.
    comment : str
        Annotation written on the chart.
    """
    category_products = top5products[top5products['Function Category'] == function_categories[category]]
    colour = function_categories_colors[category]
    #Horizontal offset of each period's bar inside a product group
    period_offsets = {'I': -0.02, 'II': width, 'IV': width*2 + 0.02}
    for period, offset in period_offsets.items():
        ax.bar(x + offset, category_products[period].values, width=width, color=colour)
        #Period name below every bar
        for i in x:
            ax.text(i + offset, -150000, period, ha='center', color='dimgrey', fontsize=12)
    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Engagement index', fontsize = 12, color='grey')
    ax.set_title(function_categories[category], fontsize = 18, color='slategrey')
    ax.set_xticks(x+0.25)
    ax.set_xticklabels(category_products['Product Name'].values, fontsize=14)
    ax.tick_params(colors='grey')
    ax.set_facecolor('ghostwhite')
    ax.spines[['top','right','bottom','left']].set_visible(False)
    ax.text(1,2500000,comment, fontsize=13, color='red', backgroundcolor='white')

#Plot1
category = 0
plot_top5_products(ax1, category, '- Google Docs is the dominant product in this category\n- YouTube became much used after holday break\n- Kahoot! is the only product, which usage decreased')

#Plot2
category = 1
plot_top5_products(ax2, category, '- Google Classroom is the dominant product\nin this categoryfollowed by Schoology.\nOther products are insignificant')

#Plot3
category = 2
# Filter the products of this category a single time. In the transposed
# array, row 0 is the product names and rows 1-3 are columns 'I', 'II', 'IV'.
plot3_rows = top5products[top5products['Function Category'] == function_categories[category]][['Product Name','I','II','IV']].transpose().values
plot3_colour = function_categories_colors[category]
# Bar positions for the three periods, paired with the label printed below them.
plot3_layout = zip((x -0.02, x + width, x + width*2 + 0.02), ('I', 'II', 'IV'))
for plot3_row, (plot3_xs, plot3_period) in enumerate(plot3_layout, start=1):
    ax3.bar(plot3_xs, plot3_rows[plot3_row], width=width, color=plot3_colour)
    for plot3_pos in plot3_xs:
        ax3.text(plot3_pos, -150000, plot3_period, ha='center', color='dimgrey', fontsize=12)
# Axis label, panel title and product names as x tick labels
ax3.set_ylabel('Engagement index', fontsize = 12, color='grey')
ax3.set_title(function_categories[category], fontsize = 18, color='slategrey')
ax3.set_xticks(x+0.25)
ax3.set_xticklabels(plot3_rows[0], fontsize=14)
ax3.tick_params(colors='grey')
ax3.set_facecolor('ghostwhite')
# Hide the frame around the panel
for plot3_side in ('top', 'right', 'bottom', 'left'):
    ax3.spines[plot3_side].set_visible(False)
ax3.text(1,2500000,'- Product in this category are significantly\nless used than two previous.', fontsize=13, color='red', backgroundcolor='white')

#Plot4
category = 3
# Select this category's products once. After transposing, row 0 holds the
# product names and rows 1-3 hold the values of columns 'I', 'II' and 'IV'.
plot4_rows = top5products[top5products['Function Category'] == function_categories[category]][['Product Name','I','II','IV']].transpose().values
# One set of bar positions per period, side by side for every product.
plot4_positions = (x -0.02, x + width, x + width*2 + 0.02)
for plot4_row, (plot4_xs, plot4_period) in enumerate(zip(plot4_positions, ('I', 'II', 'IV')), start=1):
    ax4.bar(plot4_xs, plot4_rows[plot4_row], width=width, color=function_categories_colors[category])
    #Add period text below each bar
    for plot4_pos in plot4_xs:
        ax4.text(plot4_pos, -150000, plot4_period, ha='center', color='dimgrey', fontsize=12)
# Add some text for labels, title and custom x-axis tick labels, etc.
ax4.set_ylabel('Engagement index', fontsize = 12, color='grey')
ax4.set_title(function_categories[category], fontsize = 18, color='slategrey')
ax4.set_xticks(x+0.25)
ax4.set_xticklabels(plot4_rows[0], fontsize=14)
ax4.tick_params(colors='grey')
ax4.set_facecolor('ghostwhite')
ax4.spines[['top','right','bottom','left']].set_visible(False)
# Annotation text: the invalid escape '\c' is replaced by the intended line break.
ax4.text(1,2500000,'- Google Sheets and Drive appear to be\nmost popular products\ncombining all functions', fontsize=13, color='red', backgroundcolor='white')

# Figure-level title and background, then render the finished figure.
fig.suptitle('3.3 Top 5 Products per Function Category in each Period of 2020', fontsize = 20,color='slategrey')
fig.set_facecolor('white')
# plt.show() displays the figure; an argument-less plt.plot() draws nothing
# and only leaves an empty list as the cell output.
plt.show()

<span style='font-family:Lucida; color:slategray; font-size: 18px;'>
    <h2> Socioeconomic status vs online learning</h2>
    <p>Four socioeconomic groups are considered in the analysis. Based on the charts below, a slight correlation can be observed between the groups and the percentage of students who access e-learning products:</p>
    <table style='font-family:Lucida; color:slategray; font-size: 18px;'>
        <tr>
            <th>Socioeconomic group</th>
            <th>Percentage students in group</th>
            <th>Percentage students who accessed e-learning products</th>
        </tr>
        <tr>
            <td>Students identified as Black or Hispanic</td>
            <td style='text-align: center;'><strong>↑</strong></td>
            <td style='text-align: center;'><strong>↓</strong></td>
        </tr>
        <tr>
            <td>Students eligible for free or reduced-price lunch</td>
            <td style='text-align: center;'><strong>↑</strong></td>
            <td style='text-align: center;'><strong>↓</strong></td>
        </tr>
        <tr>
            <td>Per-student expenditure range</td>
            <td style='text-align: center;'><strong>↑</strong></td>
            <td style='text-align: center;'><strong>↑</strong></td>
        </tr>
        <tr>
            <td>Area the students reside in</td>
            <td colspan="2">The highest percentage of students who access e-learning is in rural areas</td>
        </tr>
    </table>
    <p>Note:<br><em>In general, the percentage of students who accessed at least one e-learning product is surprisingly low, in the range of 0-3.5%. This is counterintuitive, considering that the schools were closed but education continued.</em></p>
</span>
    

In [None]:
fig, ((ax1, ax2),(ax3, ax4)) = plt.subplots(2,2, figsize=(20,16))

#Black/hispanic
#Calculation: mean pct_access per district, then the median of those means per percentage range
access_hispanic = all_data.groupby(['district_id','pct_black/hispanic'])['pct_access'].mean().reset_index()
access_hispanic_count = access_hispanic.groupby('pct_black/hispanic')['pct_access'].median()
#Plot
# Sort once so both scatter coordinates come from the same ordered frame.
access_hispanic_sorted = access_hispanic.sort_values('pct_black/hispanic', ascending=True)
ax1.scatter(access_hispanic_sorted['pct_black/hispanic'], access_hispanic_sorted['pct_access'], alpha=0.05, s=200, color='rebeccapurple')
ax1.scatter(access_hispanic_count.index, access_hispanic_count.values, c='red', s=100)
ax1.set_title('4.1.1 Black or Hispanic based students', fontsize = 16, color='dimgrey')
ax1.set_xlabel('Percentage range of Black or Hispanic based students in district', fontsize = 12, color='grey')
ax1.set_ylabel('Percentage of students\nthat accessed educational product', fontsize = 12, color='grey')
ax1.tick_params(colors='grey', axis='x', labelsize=14)
ax1.set_facecolor('ghostwhite')
ax1.spines[['top','right','bottom','left']].set_visible(False)
ax1.text(0.5,2.5, 'The higher the number of Black/Hispanic\nstudents in district, the lower percentage of\nstudents who accessed online learning.\nException:\nDistricts in which Black/Hispanic students\npercentage is close to 100', fontsize=14, ha='left', color='red', backgroundcolor='white')
# Trend line between the medians at positions 0 and 3. .iloc makes the
# positional lookup explicit; plain [] with integers on a label-indexed
# Series is deprecated in pandas.
ax1.plot([0,3],[access_hispanic_count.iloc[0],access_hispanic_count.iloc[3]], linestyle='dashed', color='red', linewidth=2)

#free/reduced
#Calculation: mean pct_access per district, then the median of those means per percentage range
access_reduced = all_data.groupby(['district_id','pct_free/reduced'])['pct_access'].mean().reset_index()
access_reduced_count = access_reduced.groupby('pct_free/reduced')['pct_access'].median()
#Plot
# Sort once so both scatter coordinates come from the same ordered frame.
access_reduced_sorted = access_reduced.sort_values('pct_free/reduced', ascending=True)
ax2.scatter(access_reduced_sorted['pct_free/reduced'], access_reduced_sorted['pct_access'], alpha=0.1, s=200, color='rebeccapurple')
ax2.scatter(access_reduced_count.index, access_reduced_count.values, c='red', s=100)
ax2.set_title('4.1.2 Percentage of students in the districts eligible for free or reduced-price lunch', fontsize = 16, color='dimgrey')
ax2.set_xlabel('Percentage range of subsidized students in district', fontsize = 12, color='grey')
ax2.set_ylabel('Percentage of students\nthat accessed educational product', fontsize = 12, color='grey')
ax2.tick_params(colors='grey', axis='x', labelsize=14)
ax2.set_facecolor('ghostwhite')
ax2.spines[['top','right','bottom','left']].set_visible(False)
ax2.text(0.5,2.9, 'The higher the number of student in need of help in district,\nthe lower percentage of students who accessed\nonline learning.\nException:\ndistricts in which almost 100% students is subsidized', fontsize=14, ha='left', color='red', backgroundcolor='white')
# Trend line between the medians at positions 0 and 3. The same positions
# drive both the x and the y end-points, so the line ends on the median it
# refers to (previously x ended at 4 while y used position 3).
# NOTE(review): positions 0 and 3 mirror panel 4.1.1 and leave out the last
# range named as the exception in the annotation - confirm this is the intent.
reduced_trend_ends = [0, 3]
ax2.plot(reduced_trend_ends, access_reduced_count.iloc[reduced_trend_ends].values, linestyle='dashed', color='red', linewidth=2)

#pp_total_raw
#Calculation: mean pct_access per district, then the median of those means per expenditure range
access_expenditure = all_data.groupby(['district_id','pp_total_raw'])['pct_access'].mean().reset_index()
access_expenditure_count = access_expenditure.groupby('pp_total_raw')['pct_access'].median()
#Sort by range: the text before the first '-' of each label is the numeric sort key,
#with the label 'NA' mapped to 1000
expenditure_sort_key = access_expenditure['pp_total_raw'].str.split('-', expand=True)[0].replace({'NA':1000}).astype('int')
access_expenditure = access_expenditure.assign(sort=expenditure_sort_key).sort_values(by='sort')
#Plot
ax3.scatter(access_expenditure['pp_total_raw'], access_expenditure['pct_access'], alpha=0.1, s=200, color='rebeccapurple')
ax3.scatter(access_expenditure_count.index, access_expenditure_count.values, c='red', s=100)
ax3.set_title('4.1.3 Per-pupil local and federal expenditure', fontsize = 16, color='dimgrey')
ax3.set_xlabel('Per-pupil total expenditure range', fontsize = 12, color='grey')
ax3.set_ylabel('Percentage of students\nthat accessed educational product', fontsize = 12, color='grey')
ax3.tick_params(colors='grey', axis='x', labelsize=12)
ax3.set_facecolor('ghostwhite')
ax3.spines[['top','right','bottom','left']].set_visible(False)
ax3.text(0.5,2.9, 'The higher expenditure per student, the higher access rate', fontsize=14, ha='left', color='red', backgroundcolor='white')
# Trend line through the medians at positions 2..9 of the sorted ranges.
# The x positions are derived from the slice, so the two sequences always
# have the same length even when fewer than ten ranges are present.
expenditure_trend = access_expenditure.groupby('sort')['pct_access'].median().iloc[2:10]
expenditure_trend_x = list(range(2, 2 + len(expenditure_trend)))
ax3.plot(expenditure_trend_x, expenditure_trend.values, linestyle='dashed', color='red', linewidth=2)

#locale
#Calculation: mean pct_access per district, then the median of those means per locale
access_locale = all_data.groupby(['district_id','locale'])['pct_access'].mean().reset_index()
access_locale_count = access_locale.groupby('locale')['pct_access'].median()
#Sort by locale: fixed display order City, Suburb, Town, Rural, NaN
locale_sort = {'Suburb':2, 'NaN':5, 'Rural':4, 'City':1, 'Town':3}
# map() performs a plain dictionary lookup of the sort key; a locale missing
# from locale_sort gets a NaN key and is therefore sorted last.
access_locale = access_locale.assign(sort=access_locale['locale'].map(locale_sort)).sort_values(by='sort')
#Plot
ax4.scatter(access_locale['locale'], access_locale['pct_access'], alpha=0.1, s=200, color='rebeccapurple')
ax4.scatter(access_locale_count.index, access_locale_count.values, c='red', s=100)
# Title and x-label describe the locale data plotted in this panel (they
# previously repeated the expenditure / free-lunch wording of other panels).
ax4.set_title('4.1.4 Locale of the district', fontsize = 16, color='dimgrey')
ax4.set_xlabel('Locale (area the students reside in)', fontsize = 12, color='grey')
ax4.set_ylabel('Percentage of students\nthat accessed educational product', fontsize = 12, color='grey')
ax4.tick_params(colors='grey', axis='x', labelsize=14)
ax4.set_facecolor('ghostwhite')
ax4.spines[['top','right','bottom','left']].set_visible(False)
ax4.text(0.4,2.9, 'Highest percentage of students that access online learning\nlives in rural area, followed by students living in suburbs', fontsize=14, ha='left', color='red', backgroundcolor='white')

# Figure-level styling: white background and overall title (tight_layout is not applied).
fig.set_facecolor('white')
fig.suptitle(
    '4.1 Socioeconomic status and online learning relationship ',
    color='slategrey',
    fontsize=20,
)

<span style='font-family:Lucida; color:slategray; font-size: 18px;'>
    <h2> Conclusions</h2>
    <ul>
        <li>The use of e-learning has increased roughly three times during COVID-19. The future trend depends on how the pandemic develops; however, considering the very low percentage of students (0-3.5%) who access online education, there is big potential for further growth.</li>
        <li>US households have good digital connectivity to meet the needs of online learning</li>
        <li>There is a need to further understand the reasons for the differences in student engagement between socioeconomic groups and to equalize it</li>
        <li>The most used products for digital learning are of general use (Google Docs, YouTube, Google Forms). Are they appropriate for this purpose or is there a need for dedicated products?</li>
        <li>The use of video conferencing for e-learning seems very low. This may indicate the teaching material was distributed to students for self-study at home. Should the e-learning change to include more 'live' connections?</li>
    </ul>
</span> 
