In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import plotly.express as px
from collections import OrderedDict
# !pip install chart_studio
!pip install openpyxl
# import chart_studio.plotly as pl
import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
# init_notebook_mode(connected=True)
%matplotlib inline

**Resources used other than kaggle datasets**

https://data.unicef.org/resources/children-and-young-people-internet-access-at-home-during-covid19/

https://en.unesco.org/covid19/educationresponse

https://www.weforum.org/agenda/2020/04/coronavirus-education-global-covid19-online-digital-learning/

# COVID-19 Impact on Digital Learning
As humanity progresses rapidly, we tend to ignore nature, which provides us with everything we need - oxygen, food, natural resources and more - and still we do not give it the respect it deserves. Nature has also been stringent, like a mother, giving us occasional punishments in some parts of the world: an earthquake in Indonesia, a volcano in Spain. But the impact of an earthquake in Indonesia is not felt in India. This time nature had other plans and united the whole of humanity under one name: COVID-19. It was totally unexpected and has taken 1-2 years of our lives. It has not reached its end yet, but its vigour has reduced, thanks to science, scientists and vaccines. Leaving aside the impact of COVID on our lives, we should be thankful to God that we are at least alive, as this pandemic has claimed 4.5 million lives.

The pandemic has completely disrupted one of the most important systems, one that is crucial for an individual's growth and thereby for a society's progress and the welfare of a country: EDUCATION. **UNESCO - One year into the COVID-19 pandemic, close to half the world’s students are still affected by partial or full school closures, and over 100 million additional children will fall below the minimum proficiency level in reading as a result of the health crisis.** Such is the profound impact of the COVID-19 pandemic. There has been an enormous rise in online learning platforms, and colleges now provide opportunities to earn a degree via these platforms. Even before the pandemic, online education was gaining steam. **In 2019 a study named "Online Education Market & Global Forecast, by End User, Learning Mode (Self-Paced, Instructor Led), Technology, Country, Company" reported that the value of the Online Education Market will reach 350 Billion US$ by 2025.**

Even though we have started making the transition to digital learning, the important questions to be asked are "Is equity in education being maintained? Is digital learning accessible and feasible for every student in the world?". **For example, whilst 95% of students in Switzerland, Norway, and Austria have a computer to use for their schoolwork, only 34% in Indonesia do, according to OECD data.** According to Global internet network research, **just 24% of households had a consistent internet connection**, while other remote areas remained unreachable.

# **Please upvote if you find it useful**

In [None]:
# Engagement file 1000.csv plus the district and product metadata tables.
df1000 = pd.read_csv(
    '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/1000.csv'
)
dfDistrict = pd.read_csv(
    '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
)
# The product key is renamed on load so that it carries the same name ('lp_id')
# as the key column of the engagement data.
dfProducts = pd.read_csv(
    '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
).rename(columns={'LP ID': 'lp_id'})

In [None]:
# Index labels of the rows of df1000 whose 'lp_id' value is missing.
# (Rows with a null in ANY column would be: df1000[df1000.isnull().any(axis=1)])
df1000.loc[df1000['lp_id'].isna()].index

In [None]:
##Loads and preprocesses one district-level engagement data file
def load(link):
    """Read one district-level engagement CSV and clean it.

    Parameters
    ----------
    link : str or path-like
        Location of the CSV file; it must contain an ``lp_id`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of the file with every row lacking an ``lp_id`` removed and
        all remaining missing values replaced by ``0.0``. The original row
        labels are kept, so the index may contain gaps.
    """
    df = pd.read_csv(link)
    # Remove all rows without a product id in one vectorised pass; dropping
    # them one label at a time copies the frame once per removed row.
    df = df.dropna(subset=['lp_id'])
    return df.fillna(0.0)

In [None]:
# data_slider = []
# for date in countryWiseSchoolsStatusDF['Date'].unique()[:100]:

#     df_date = countryWiseSchoolsStatusDF[countryWiseSchoolsStatusDF['Date'] == date]

#     for col in df_date.columns:
#         df_date[col] = df_date[col].astype(str)

#     dataPerDay = dict( type='choropleth',
#                         locations = df_date['Country'],
#                         z=df_date['Status'].astype(float),
#                         locationmode='country names',
#                         colorscale = "Viridis",
#                         )

#     data_slider.append(dataPerDay)
    
# steps = []

# for i in range(len(data_slider)):
#     step = dict(method='restyle',
#                 args=['visible', [False] * len(data_slider)])
#     step['args'][1][i] = True
#     steps.append(step)

# sliders = [dict(active=0, pad={"t": 1}, steps=steps)] 

# layout = dict(geo=dict(scope='world',
#                        showcountries = True,
#                        projection={'type': 'mercator'}),
#               sliders=sliders)

# fig = dict(data=data_slider, layout=layout)
# iplot(fig, show_link = True)

The below section of plots gives a small understanding on closure of schools and internet access among students and how class differences play a significant role here. Datasets for these plots have been taken from UNESCO and UNICEF sites

# The plots below show how long schools have been closed in countries, classified by the economic level of the countries.

In [None]:
# School-closure workbook; header=1 takes the column names from the second row of the sheet.
df = pd.read_excel('../input/school-closures-unicef/School-Closures.xlsx', header=1)
# Two countries don't have an income group, so they are filled with the respective group.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on the
# df['Income Group'] selection can operate on a temporary copy and leave df unchanged.
df['Income Group'] = df['Income Group'].fillna('Lower middle income (LM)')

In [None]:
# One bar trace per education level, grouped by income group; hovering shows the country.
educationLevels = ['Pre-primary', 'Primary', 'Lower Secondary', 'Upper Secondary']
fig = go.Figure(
    data=[
        go.Bar(name=level, x=df['Income Group'], y=df[level], hovertext=df['UNICEF Country'])
        for level in educationLevels
    ]
)
fig.update_traces(textposition=['outside'])
# Grouped (side-by-side) bars instead of the default bar mode
fig.update_layout(
    barmode='group',
    title_text='No of Students who have been affected by School Closure',
)
fig.show()

> **From above plot it is evident that students in high income regions have not been affected much by school closure. Primary children have been significantly affected in all areas. But students in middle income groups have suffered a lot.**

In [None]:
# One bar trace per school status, grouped by income group; hovering shows the country.
dayColumns = [
    'Days: Academic break',
    'Days: Fully closed',
    'Days: Fully open',
    'Days:  Partially closed',
]
fig = go.Figure(
    data=[
        go.Bar(name=column, x=df['Income Group'], y=df[column], hovertext=df['UNICEF Country'])
        for column in dayColumns
    ]
)
fig.update_traces(textposition=['outside'])
# Grouped (side-by-side) bars instead of the default bar mode
fig.update_layout(
    barmode='group',
    title_text='No of Days Schools have been closed across regions(As of March 2021)',
)
fig.show()

> **The above plot corresponds to the information presented by the previous plot. The no of days schools were fully open increases across income levels(low to high). The no of days schools were partially open or fully open is higher in high income countries.**

**From the plots we can observe that middle income countries have more students affected due to school closure and schools have also been closed for long periods of time. Some of the countries in this region are India, Bangladesh, Indonesia. The highly dense population could have made the Government to not completely reopen schools.**

**Whereas the high income countries have vaccinated their citizens more widely and could maintain better Covid safety norms due to their wealth; these countries also have smaller populations compared to low and middle income countries.**

# The below plot shows how internet access to students varies across countries based on their income levels

**2.2 billion children and students(2 out of 3) aged 25 years or less – do not have internet access at home. Across the world only 33% of children and students have internet access at home and out of which students having internet access in high income countries is 87% whereas in low income countries it is 6%.**

In [None]:
# 'Primary' sheet of the digital-connectivity workbook
PrimaryDF = pd.read_excel(
    '../input/digital-connectivity/School-Age-Digital-Connectivity.xlsx',
    header=0,
    sheet_name='Primary',
)
# One bar trace per population segment, grouped by income group; hovering shows the country.
fig = go.Figure(
    data=[
        go.Bar(
            name=segment,
            x=PrimaryDF['Income Group'],
            y=PrimaryDF[segment],
            hovertext=PrimaryDF['Country'],
        )
        for segment in ['Rural', 'Urban', 'Poorest', 'Richest']
    ]
)
fig.update_traces(textposition=['outside'])
# Grouped (side-by-side) bars instead of the default bar mode
fig.update_layout(
    barmode='group',
    title_text='Percentage of Primary children with internet connectivity at home',
)
fig.show()

In [None]:
# 'Upper Secondary' sheet of the digital-connectivity workbook
SecondaryDF = pd.read_excel(
    '../input/digital-connectivity/School-Age-Digital-Connectivity.xlsx',
    header=0,
    sheet_name='Upper Secondary',
)
# One bar trace per population segment, grouped by income group; hovering shows the country.
fig = go.Figure(
    data=[
        go.Bar(
            name=segment,
            x=SecondaryDF['Income Group'],
            y=SecondaryDF[segment],
            hovertext=SecondaryDF['Country'],
        )
        for segment in ['Rural', 'Urban', 'Poorest', 'Richest']
    ]
)
fig.update_traces(textposition=['outside'])
# Grouped (side-by-side) bars instead of the default bar mode
fig.update_layout(
    barmode='group',
    title_text='Percentage of Higher Secondary students with internet connectivity at home',
)
fig.show()

> **From the above plots we could see that in every region for both primary children and teenagers the urban people have higher internet access compared to rural people and also richer people have good internet access compared to poor people. Hence Access to the internet varies widely depending on country income group, the rural-urban divide as well as household wealth.**

The below set of plots will try to comprehensively analyse the products used across districts based on their engagement index,usage and percentage of students accessing it, the product providers and how engagement index varied during covid times etc.

# The below section depicts the most used products based on the no of days they have been used across all districts
The products have been categorized by the number of districts in which they are used, i.e. 50 or more districts, between 10 and 50 districts, and 10 or fewer districts. The number of days a product has been used is summed across all districts and then averaged. Only the top 20 products in each district have been taken into consideration.

In [None]:
##Names of the products accessed the most times (top 20 per district). Product ids that
##do not appear in dfProducts have no name, so they are skipped.

districtDataSummary=OrderedDict() ##product name -> no of 'time' entries (days) summed over the districts
noOfDistricts=OrderedDict() ##product name -> no of districts in which the product is among the top 20

##Average no of days per district (districtDataSummary / noOfDistricts), split by spread:
productsUsedAlongWithDaysTop=OrderedDict() ##products counted in 50 or more districts
productsUsedAlongWithDaysMiddle=OrderedDict() ##products counted in 11 to 49 districts
productsUsedAlongWithDaysBottom=OrderedDict() ##products counted in 10 or fewer districts

n=233 ##No of districts to be considered. In total there are 233 districts

##lp_id -> product name, built once instead of filtering dfProducts for every product of
##every district. setdefault keeps the first name if an lp_id were listed more than once.
productNameById={}
for knownLpId,knownName in zip(dfProducts['lp_id'],dfProducts['Product Name']):
    productNameById.setdefault(knownLpId,knownName)

for folder,_,fileNames in os.walk('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'):
    for fileName in fileNames[:n]:
        districtDF=load(os.path.join(folder,fileName))
        ##No of 'time' entries per product, most used product first
        lpIdCountDF=(districtDF.groupby('lp_id').count()
                     .sort_values('time',ascending=False)[['time']]
                     .rename(columns={'time':'Count'})
                     .reset_index())
        ##head(20) also copes with districts that logged fewer than 20 products,
        ##where indexing rows 0..19 by label would raise a KeyError.
        topProducts=lpIdCountDF.head(20)
        for lpId,count in zip(topProducts['lp_id'],topProducts['Count']):
            if lpId not in productNameById:
                continue
            productName=productNameById[lpId]
            districtDataSummary[productName]=districtDataSummary.get(productName,0)+count
            noOfDistricts[productName]=noOfDistricts.get(productName,0)+1

for productName,totalDays in districtDataSummary.items():
    districtCount=noOfDistricts[productName]
    averageDays=totalDays/districtCount
    if districtCount>=50:
        productsUsedAlongWithDaysTop[productName]=averageDays
    elif districtCount>10:
        productsUsedAlongWithDaysMiddle[productName]=averageDays
    else:
        productsUsedAlongWithDaysBottom[productName]=averageDays

In [None]:
##Converts the dictionaries to dataframes
def _toDaysFrame(averageDaysByProduct):
    """Turn a {product name: average days} mapping into a ranked DataFrame.

    The result has the product names in column 'index' and the values in
    column 'No of Days', is sorted by 'No of Days' in descending order and
    carries a fresh 0..n-1 row index.
    """
    frame=pd.DataFrame(averageDaysByProduct,index=[0]).transpose().reset_index()
    frame=frame.rename(columns={0:'No of Days'})
    frame=frame.sort_values(by='No of Days',ascending=False)
    return frame.reset_index(drop=True)

productsDaysDFTop=_toDaysFrame(productsUsedAlongWithDaysTop)
productsDaysDFMiddle=_toDaysFrame(productsUsedAlongWithDaysMiddle)
productsDaysDFBottom=_toDaysFrame(productsUsedAlongWithDaysBottom)

In [None]:
##Horizontal bars: average no of days for the (up to) 20 most used widely spread products
fig,ax=plt.subplots(figsize=(10,10))
for side in ('top','right','bottom','left'):
    ax.spines[side].set_visible(False)
ax.text(-20,-1.5,'The products used most days across all districts(Product used in above 50 districts)',
        fontfamily='sans-serif',fontsize=20,fontweight='bold',color='#323232')
ax.text(-5,-0.8,'Google products have been used across most districts',
        fontfamily='sans-serif',fontsize=16,fontweight='medium',color='#323232')
topRows=productsDaysDFTop.head(20)
img=sns.barplot(y='index',x='No of Days',data=topRows,ax=ax)
##Annotate exactly the bars that were drawn; a fixed range(20) raises a KeyError
##whenever this category holds fewer than 20 products.
for position,noOfDays in enumerate(topRows['No of Days']):
    ax.text(30,position,str(int(noOfDays))+' Days',fontfamily='sans-serif',
            fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel='Products',xticklabels=[],xlabel=None)
ax.text(-20,21,'The no of days a particular product has been used across all districts is averaged',
        fontfamily='sans-serif',fontsize=15,fontweight='light',color='#323232')
img.tick_params(left=False,bottom=False)

In [None]:
##Horizontal bars: average no of days for the (up to) 20 most used products of the middle category
fig,ax=plt.subplots(figsize=(10,10))
for side in ('top','right','bottom','left'):
    ax.spines[side].set_visible(False)
ax.text(-20,-1.5,'The products used most days across all districts(Product used in between 10 and 50 districts)',
        fontfamily='sans-serif',fontsize=20,fontweight='bold',color='#323232')
middleRows=productsDaysDFMiddle.head(20)
img=sns.barplot(y='index',x='No of Days',data=middleRows,ax=ax)
##Annotate exactly the bars that were drawn; a fixed range(20) raises a KeyError
##whenever this category holds fewer than 20 products.
for position,noOfDays in enumerate(middleRows['No of Days']):
    ax.text(30,position,str(int(noOfDays))+' Days',fontfamily='sans-serif',
            fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel='Products',xticklabels=[],xlabel=None)
img.tick_params(left=False,bottom=False)

In [None]:
##Horizontal bars: average no of days for the (up to) 20 most used products of the bottom category
fig,ax=plt.subplots(figsize=(10,10))
for side in ('top','right','bottom','left'):
    ax.spines[side].set_visible(False)
ax.text(-20,-1.5,'The products used most days across all districts(Product used in less than 10 districts)',
        fontfamily='sans-serif',fontsize=20,fontweight='bold',color='#323232')
bottomRows=productsDaysDFBottom.head(20)
img=sns.barplot(y='index',x='No of Days',data=bottomRows,ax=ax)
##Annotate exactly the bars that were drawn; a fixed range(20) raises a KeyError
##whenever this category holds fewer than 20 products.
for position,noOfDays in enumerate(bottomRows['No of Days']):
    ax.text(30,position,str(int(noOfDays))+' Days',fontfamily='sans-serif',
            fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel='Products',xticklabels=[],xlabel=None)
img.tick_params(left=False,bottom=False)

# The following section presents the most used products in all districts.
Each product is considered together with the number of districts in which it has been used. The top 20 products are plotted.

In [None]:
##One row per product: the no of districts it was counted in (largest first),
##joined with the name of its provider from dfProducts.
noOfDistrictsDF=(
    pd.DataFrame(noOfDistricts,index=[0])
    .transpose()
    .rename(columns={0:'No of Districts'})
    .sort_values(by='No of Districts',ascending=False)
    .reset_index()
    .rename(columns={'index':'Product Name'})
    .merge(dfProducts[['Product Name','Provider/Company Name']],on='Product Name',how='inner')
)

In [None]:
##Horizontal bars: the (up to) 20 products counted in the most districts
fig,ax=plt.subplots(figsize=(10,10))
for side in ('top','right','bottom','left'):
    ax.spines[side].set_visible(False)
ax.text(-20,-1.5,'The products used in most districts',fontfamily='sans-serif',
        fontsize=20,fontweight='bold',color='#323232')
mostSpreadRows=noOfDistrictsDF.head(20)
img=sns.barplot(y='Product Name',x='No of Districts',data=mostSpreadRows,ax=ax)
##Annotate exactly the bars that were drawn; a fixed range(20) raises a KeyError
##whenever fewer than 20 products are available.
for position,districtCount in enumerate(mostSpreadRows['No of Districts']):
    ax.text(30,position,str(int(districtCount))+' Districts',fontfamily='sans-serif',
            fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel='Products',xticklabels=[],xlabel=None)
img.tick_params(left=False,bottom=False)

# Though Google products dominate in most districts, some of them are used in far fewer districts. The plot below shows the Google products used in fewer than 50 districts

In [None]:
##Google products that were counted in fewer than 50 districts.
##na=False treats a missing provider name as "not Google" instead of putting NaN into the mask.
isGoogleProduct=noOfDistrictsDF['Provider/Company Name'].str.contains('Google',na=False)
inFewDistricts=noOfDistrictsDF['No of Districts']<50
##reset_index() returns a new frame, so the filtered slice is never modified in place.
googleProductsInDistrictsDF=noOfDistrictsDF[isGoogleProduct & inFewDistricts].reset_index()

fig,ax=plt.subplots(figsize=(10,10))
for side in ('top','right','bottom','left'):
    ax.spines[side].set_visible(False)
ax.text(-10,-1.5,'Google products being accessed in fewer districts',fontfamily='sans-serif',
        fontsize=20,fontweight='bold',color='#323232')
img=sns.barplot(y='Product Name',x='No of Districts',data=googleProductsInDistrictsDF,ax=ax)
##One label per bar with the no of districts
for position,districtCount in enumerate(googleProductsInDistrictsDF['No of Districts']):
    ax.text(5,position,str(districtCount)+' Districts',fontfamily='sans-serif',
            fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel='Products',xticklabels=[],xlabel=None)
img.tick_params(left=False,bottom=False)
        

# The code below helps to identify the products used for the most days (sector wise).
The no of days a product has been used for in all districts has been averaged and then grouped by primary essential function(sectors). Top 3 most used products based on days in each sector has been plotted. Only the sectors with more than 5 products has been considered for plotting.

In [None]:
districtDataSummary2=OrderedDict() ##product name -> no of 'time' entries (days) summed over the districts
noOfDistricts2=OrderedDict() ##product name -> no of districts in which the product appears

##Average no of days per district (districtDataSummary2 / noOfDistricts2), split by spread:
productsUsedAlongWithDaysTop2=OrderedDict() ##products appearing in 50 or more districts
productsUsedAlongWithDaysMiddle2=OrderedDict() ##products appearing in 11 to 49 districts
productsUsedAlongWithDaysBottom2=OrderedDict() ##products appearing in 10 or fewer districts

n=233 ##No of districts to be considered. In total there are 233 districts

##lp_id -> product name, built once instead of filtering dfProducts for every product of
##every district. setdefault keeps the first name if an lp_id were listed more than once.
productNameById2={}
for knownLpId,knownName in zip(dfProducts['lp_id'],dfProducts['Product Name']):
    productNameById2.setdefault(knownLpId,knownName)

for folder,_,fileNames in os.walk('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'):
    for fileName in fileNames[:n]:
        districtDF=load(os.path.join(folder,fileName))
        ##No of 'time' entries per product, most used product first
        lpIdCountDF2=(districtDF.groupby('lp_id').count()
                      .sort_values('time',ascending=False)[['time']]
                      .rename(columns={'time':'Count'})
                      .reset_index())
        ##Unlike the top-20 analysis, every product of the district is taken into account here
        for lpId,count in zip(lpIdCountDF2['lp_id'],lpIdCountDF2['Count']):
            if lpId not in productNameById2:
                continue
            productName=productNameById2[lpId]
            districtDataSummary2[productName]=districtDataSummary2.get(productName,0)+count
            noOfDistricts2[productName]=noOfDistricts2.get(productName,0)+1

for productName,totalDays in districtDataSummary2.items():
    districtCount=noOfDistricts2[productName]
    averageDays=totalDays/districtCount
    if districtCount>=50:
        productsUsedAlongWithDaysTop2[productName]=averageDays
    elif districtCount>10:
        productsUsedAlongWithDaysMiddle2[productName]=averageDays
    else:
        productsUsedAlongWithDaysBottom2[productName]=averageDays

In [None]:
##Average no of days per product (products appearing in 50 or more districts), most used first
sectorWiseDF=(
    pd.DataFrame(productsUsedAlongWithDaysTop2,index=[0])
    .transpose()
    .reset_index()
    .rename(columns={0:'No of Days','index':'Product Name'})
    .sort_values(by='No of Days',ascending=False)
    .reset_index(drop=True)
)
##The below step is performed to add the primary essential function to the dataframe.
##how='outer' also keeps the products of dfProducts that have no 'No of Days' value (NaN).
sectorWiseDF=sectorWiseDF.merge(dfProducts[['Product Name','Primary Essential Function']],on='Product Name',how='outer')
##An outer merge may order its result by the join key, so the ranking is restored explicitly.
##mergesort is stable: rows that are already in descending order keep their relative position,
##and products without a value end up last.
sectorWiseDF=(
    sectorWiseDF
    .sort_values(by='No of Days',ascending=False,kind='mergesort',na_position='last')
    .reset_index(drop=True)
)

In [None]:
##Function to split longer titles
def splitTitle(title):
    """Break a long title into two lines of roughly equal length.

    The break is made at the space closest to the middle of ``title`` so that
    no word is cut in half; that space is replaced by the newline. If two
    spaces are equally close, the left one is used. A title without any space
    is cut exactly in the middle (``len(title)//2``), keeping all characters.
    """
    mid=len(title)//2
    spacePositions=[pos for pos,char in enumerate(title) if char==' ']
    if not spacePositions:
        return title[:mid]+'\n'+title[mid:]
    cut=min(spacePositions,key=lambda pos:abs(pos-mid))
    return title[:cut]+'\n'+title[cut+1:]

##No of products per primary essential function, sector with the most products first
temp=sectorWiseDF.groupby('Primary Essential Function').count().sort_values(by='Product Name',ascending=False)
fig=plt.figure(figsize=(30,20))
gs=fig.add_gridspec(5,5)
sectors=temp.index[:16] ##The (up to) 16 primary essential functions with the most products

##The panels fill a 4 x 4 block of the grid row by row. Enumerating the sectors that
##actually exist avoids an IndexError when there are fewer than 16 of them.
for panel,sector in enumerate(sectors):
    gridRow,gridCol=divmod(panel,4)
    ax=fig.add_subplot(gs[gridRow,gridCol])

    for side in ('top','right'):
        ax.spines[side].set_visible(False)

    ##For long titles we split with the function defined above.
    title=splitTitle(sector) if len(sector)>40 else sector
    ax.set_title(title,fontweight='bold',fontsize=15,va='center',ha='center')

    ##Top 3 products of the sector by no of days. Products without a value are left out and
    ##the ranking is computed here, so the plot does not depend on the row order of sectorWiseDF.
    sectorTop=(
        sectorWiseDF[sectorWiseDF['Primary Essential Function']==sector]
        .dropna(subset=['No of Days'])
        .sort_values(by='No of Days',ascending=False,kind='mergesort')
        .head(3)
    )
    if sectorTop.empty:
        ##Nothing to draw for this sector; keep the titled, empty panel.
        continue
    img=sns.barplot(y='Product Name',x='No of Days',data=sectorTop,ax=ax)

    ##To annotate the plot with no of days (one label per drawn bar)
    for position,noOfDays in enumerate(sectorTop['No of Days']):
        ax.text(0.5,position,str(int(noOfDays))+' Days',fontfamily='sans-serif',
                fontsize=15,fontweight='bold',color='#323232')

    img.set(ylabel=None,xlabel=None,xticklabels=[])
    ##Re-apply the y tick labels of this subplot with a larger font
    img.set_yticklabels(labels=ax.get_yticklabels(),fontsize=15,fontweight='medium')
    img.tick_params(left=False,bottom=False)

##To improve spacing
fig.tight_layout()

# The below section depicts the products with the highest percentage access(pct_Access).
The products have been partitioned into three categories(products used in >50 districts, >10 and <50 districts, <10 districts). The pctAccess for a product for all the districts have been added and averaged.

In [None]:
districtDataSummary3=OrderedDict() ##product name -> sum over districts of the district's mean pct_access
noOfDistricts2=OrderedDict() ##product name -> no of districts in which the product is among the top 20

##Mean pct_access per district (districtDataSummary3 / noOfDistricts2), split by spread:
productsUsedAlongWithPctAccessTop=OrderedDict() ##products counted in 50 or more districts
productsUsedAlongWithPctAccessMiddle=OrderedDict() ##products counted in 11 to 49 districts
productsUsedAlongWithPctAccessBottom=OrderedDict() ##products counted in 10 or fewer districts

n=233 ##No of districts to be considered. In total there are 233 districts

productInfo=dfProducts[['lp_id','Primary Essential Function','Product Name']]

for folder,_,fileNames in os.walk('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'):
    for fileName in fileNames[:n]:
        districtDF=load(os.path.join(folder,fileName))
        ##Mean pct_access per product, highest first. Only the pct_access column is averaged:
        ##averaging the whole frame would include the non-numeric 'time' column, which
        ##raises a TypeError on recent pandas versions.
        pctAccessDF=(districtDF.groupby('lp_id')['pct_access'].mean()
                     .sort_values(ascending=False)
                     .reset_index())
        ##The inner merge keeps only products listed in dfProducts and preserves the ranking
        pctAccessDF=pctAccessDF.merge(productInfo,on='lp_id',how='inner')
        ##head(20) also copes with districts that have fewer than 20 known products,
        ##where indexing rows 0..19 by label would raise a KeyError.
        topProducts=pctAccessDF.head(20)
        for productName,pctAccess in zip(topProducts['Product Name'],topProducts['pct_access']):
            districtDataSummary3[productName]=districtDataSummary3.get(productName,0)+pctAccess
            noOfDistricts2[productName]=noOfDistricts2.get(productName,0)+1

for productName,pctAccessSum in districtDataSummary3.items():
    districtCount=noOfDistricts2[productName]
    meanPctAccess=pctAccessSum/districtCount
    if districtCount>=50:
        productsUsedAlongWithPctAccessTop[productName]=meanPctAccess
    elif districtCount>10:
        productsUsedAlongWithPctAccessMiddle[productName]=meanPctAccess
    else:
        productsUsedAlongWithPctAccessBottom[productName]=meanPctAccess

In [None]:
##Converts the pct_access dictionaries to dataframes
def _toPctAccessFrame(meanPctAccessByProduct):
    """Turn a {product name: mean pct_access} mapping into a ranked DataFrame.

    The result has the product names in column 'index' and the values in
    column 'PctAccess', is sorted by 'PctAccess' in descending order and
    carries a fresh 0..n-1 row index.
    """
    frame=pd.DataFrame(meanPctAccessByProduct,index=[0]).transpose().reset_index()
    frame=frame.rename(columns={0:'PctAccess'})
    frame=frame.sort_values(by='PctAccess',ascending=False)
    return frame.reset_index(drop=True)

productsUsedAlongWithPctAccessDFTop=_toPctAccessFrame(productsUsedAlongWithPctAccessTop)
productsUsedAlongWithPctAccessDFMiddle=_toPctAccessFrame(productsUsedAlongWithPctAccessMiddle)
productsUsedAlongWithPctAccessDFBottom=_toPctAccessFrame(productsUsedAlongWithPctAccessBottom)

In [None]:
##Horizontal bars: the (up to) 20 widely spread products with the highest mean pct_access
fig,ax=plt.subplots(figsize=(15,10))
for side in ('top','right','bottom','left'):
    ax.spines[side].set_visible(False)
ax.text(0,-1.5,'The products with highest percentage access across all districts(Used in above 50 districts)',
        fontfamily='sans-serif',ha='center',fontsize=25,fontweight='bold',color='#323232')

topRows=productsUsedAlongWithPctAccessDFTop.head(20)
img=sns.barplot(y='index',x='PctAccess',data=topRows,ax=ax)
##Annotate exactly the bars that were drawn; a fixed range(20) raises a KeyError
##whenever this category holds fewer than 20 products.
for position,meanPctAccess in enumerate(topRows['PctAccess']):
    ax.text(0,position,str(round(meanPctAccess,2))+'%',fontfamily='sans-serif',
            fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel=None,xticklabels=[],xlabel=None)
##Caption corrected: this plot shows averaged percentage access, not days of use.
ax.text(0,21,'The percentage access of a particular product is averaged across all districts',
        fontfamily='sans-serif',fontsize=15,fontweight='light',color='#323232')
ax.set_yticklabels(ax.get_yticklabels(),fontsize=15,fontweight='medium')
img.tick_params(left=False,bottom=False)

In [None]:
##Horizontal bars: the (up to) 20 products of the middle category with the highest mean pct_access
fig,ax=plt.subplots(figsize=(15,10))
for side in ('top','right','bottom','left'):
    ax.spines[side].set_visible(False)
ax.text(0,-1.5,'The products with highest percentage access across all districts(Used in between 10 and 50 districts)',
        fontfamily='sans-serif',ha='center',fontsize=20,fontweight='bold',color='#323232')

middleRows=productsUsedAlongWithPctAccessDFMiddle.head(20)
img=sns.barplot(y='index',x='PctAccess',data=middleRows,ax=ax)
##Annotate exactly the bars that were drawn; a fixed range(20) raises a KeyError
##whenever this category holds fewer than 20 products.
for position,meanPctAccess in enumerate(middleRows['PctAccess']):
    ax.text(0,position,str(round(meanPctAccess,2))+'%',fontfamily='sans-serif',
            fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel=None,xticklabels=[],xlabel=None)
ax.set_yticklabels(ax.get_yticklabels(),fontsize=15,fontweight='medium')
img.tick_params(left=False,bottom=False)

In [None]:
##Horizontal bars: the (up to) 20 products of the bottom category with the highest mean pct_access
fig,ax=plt.subplots(figsize=(15,10))
for side in ('top','right','bottom','left'):
    ax.spines[side].set_visible(False)
ax.text(0,-1.5,'The products with highest percentage access across all districts(Used in less than 10 districts)',
        fontfamily='sans-serif',ha='center',fontsize=20,fontweight='bold',color='#323232')

bottomRows=productsUsedAlongWithPctAccessDFBottom.head(20)
img=sns.barplot(y='index',x='PctAccess',data=bottomRows,ax=ax)
##Annotate exactly the bars that were drawn; a fixed range(20) raises a KeyError
##whenever this category holds fewer than 20 products.
for position,meanPctAccess in enumerate(bottomRows['PctAccess']):
    ax.text(0,position,str(round(meanPctAccess,2))+'%',fontfamily='sans-serif',
            fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel=None,xticklabels=[],xlabel=None)
ax.set_yticklabels(ax.get_yticklabels(),fontsize=15,fontweight='medium')
img.tick_params(left=False,bottom=False)

# The section below shows the products with the highest engagement index.
The products have been partitioned into three categories (products used in 50 or more districts, in more than 10 but fewer than 50 districts, and in 10 or fewer districts). For each product, the engagement index values from all contributing districts have been summed and averaged. Only the top 20 products (based on engagement index) from each district have been considered.

In [None]:
# For every district file, take the (up to) 20 products with the highest mean
# engagement index, add each product's district mean to a running total and count
# the districts it appeared in. The totals are then averaged and split into three
# buckets by the number of districts a product appeared in.
districtDataSummary4=OrderedDict() ##This dictionary holds the products and the sum of their engagement index(Summation of enagagement index)

productsUsedAlongWithEngagementIndexTop=OrderedDict() ##Takes the value from the previous dictionary and averages it. So that we can 
                                    ##get the mean engagement index across all districts.
productsUsedAlongWithEngagementIndexMiddle=OrderedDict()
productsUsedAlongWithEngagementIndexBottom=OrderedDict()

noOfDistricts3=OrderedDict() ##Number of districts in which a product made the per-district top 20

n=233 ##No of distrcts to be considered. In total there are 233 districts

# 'dirPath' instead of 'dir' so the builtin dir() is not shadowed in the notebook namespace.
for dirPath,_,fil in os.walk('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'):
    for f in fil[:n]:
        link=os.path.join(dirPath,f)
        df=load(link)
        # Mean engagement index per product in this district, highest first. Selecting
        # the column before .mean() avoids averaging (or failing on) unrelated columns.
        engagementIndexDF=(df.groupby('lp_id')['engagement_index'].mean()
                             .sort_values(ascending=False)
                             .reset_index())
        # The inner merge keeps the left (sorted) row order and drops products that
        # are missing from dfProducts.
        engagementIndexDF=pd.merge(engagementIndexDF,dfProducts[['lp_id','Primary Essential Function','Product Name']],on='lp_id',how='inner')
        # head(20) copes with districts that have fewer than 20 matched products,
        # where a fixed range(20) with .loc[i] would raise KeyError.
        topProducts=engagementIndexDF.head(20)
        for productName,engagement in zip(topProducts['Product Name'],topProducts['engagement_index']):
            districtDataSummary4[productName]=districtDataSummary4.get(productName,0)+engagement
            noOfDistricts3[productName]=noOfDistricts3.get(productName,0)+1

# Average each product's total over the districts it appeared in and bucket it:
# >=50 districts -> Top, 11..49 -> Middle, <=10 -> Bottom.
for productName,engagementSum in districtDataSummary4.items():
    districtCount=noOfDistricts3[productName]
    meanEngagement=engagementSum/districtCount
    if districtCount>=50:
        productsUsedAlongWithEngagementIndexTop[productName]=meanEngagement
    elif districtCount>10:
        productsUsedAlongWithEngagementIndexMiddle[productName]=meanEngagement
    else:
        productsUsedAlongWithEngagementIndexBottom[productName]=meanEngagement

In [None]:
def _engagementIndexFrame(meanEngagementByProduct):
    """Turn a {product name: mean engagement index} mapping into a sorted frame.

    Parameters
    ----------
    meanEngagementByProduct : mapping
        Product name -> mean engagement index.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'index' (the product name) and 'EngagementIndex', sorted by
        'EngagementIndex' in descending order with a fresh 0..n-1 row index.
        An empty mapping yields an empty frame with the same two columns, so
        downstream code does not fail on a missing 'EngagementIndex' column.
    """
    frame=pd.DataFrame({'index':list(meanEngagementByProduct.keys()),
                        'EngagementIndex':list(meanEngagementByProduct.values())})
    frame=frame.astype({'EngagementIndex':'float'})
    return frame.sort_values(by='EngagementIndex',ascending=False).reset_index(drop=True)

# One frame per bucket; the same construction is shared instead of being copy-pasted three times.
productsUsedAlongWithEngagementIndexDFTop=_engagementIndexFrame(productsUsedAlongWithEngagementIndexTop)
productsUsedAlongWithEngagementIndexDFMiddle=_engagementIndexFrame(productsUsedAlongWithEngagementIndexMiddle)
productsUsedAlongWithEngagementIndexDFBottom=_engagementIndexFrame(productsUsedAlongWithEngagementIndexBottom)

In [None]:
# Horizontal bar chart of the first (up to) 20 rows of
# productsUsedAlongWithEngagementIndexDFTop, each bar labelled with its 'EngagementIndex' value.
fig=plt.figure(figsize=(15,10))
gs=fig.add_gridspec(1,1)
ax=fig.add_subplot(gs[0,0])
for s in ['top','right','bottom','left']:
    ax.spines[s].set_visible(False)
ax.text(0,-1.5,'Products with the highest engagement index access across all districts(Used in above 50 districts)',fontfamily='sans-serif',
                     ha='center',fontsize=25,fontweight='bold',color='#323232')

# Slice once so that the bars and their value labels are taken from the same rows.
topEngagementRows=productsUsedAlongWithEngagementIndexDFTop[:20]
img=sns.barplot(y='index',x='EngagementIndex',data=topEngagementRows,ax=ax)
# Label the rows that actually exist instead of a fixed range(20): a hard-coded
# count raises KeyError when this bucket holds fewer than 20 products.
# NOTE(review): the value is rendered with a '%' suffix - confirm that the
# engagement index really is a percentage.
for i,value in enumerate(topEngagementRows['EngagementIndex']):
    engagementIndex=str(round(value,2))+'%'
    ax.text(0,i,engagementIndex,fontfamily='sans-serif',
                      fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel=None,xticklabels=[],xlabel=None)
# NOTE(review): this caption talks about "no of days" although the chart shows the
# engagement index; it looks copied from another plot - confirm the intended wording.
ax.text(0,21,'The no of days a particular product has been used across all districts is averaged',
        fontfamily='sans-serif',fontsize=15,fontweight='light',color='#323232')
ax.set_yticklabels(ax.get_yticklabels(),fontsize=15,fontweight='medium')
img.tick_params(left=False,bottom=False)

In [None]:
# Horizontal bar chart of the first (up to) 20 rows of
# productsUsedAlongWithEngagementIndexDFMiddle, each bar labelled with its 'EngagementIndex' value.
fig=plt.figure(figsize=(15,10))
gs=fig.add_gridspec(1,1)
ax=fig.add_subplot(gs[0,0])
for s in ['top','right','bottom','left']:
    ax.spines[s].set_visible(False)
ax.text(0,-1.5,'The products with highest engagement index across all districts(Used in between 10 and 50 districts)',fontfamily='sans-serif',
                     ha='center',fontsize=20,fontweight='bold',color='#323232')

# Slice once so that the bars and their value labels are taken from the same rows.
middleEngagementRows=productsUsedAlongWithEngagementIndexDFMiddle[:20]
img=sns.barplot(y='index',x='EngagementIndex',data=middleEngagementRows,ax=ax)
# Label the rows that actually exist instead of a fixed range(20): a hard-coded
# count raises KeyError when this bucket holds fewer than 20 products.
# NOTE(review): the value is rendered with a '%' suffix - confirm that the
# engagement index really is a percentage.
for i,value in enumerate(middleEngagementRows['EngagementIndex']):
    engagementIndex=str(round(value,2))+'%'
    ax.text(0,i,engagementIndex,fontfamily='sans-serif',
                      fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel=None,xticklabels=[],xlabel=None)
ax.set_yticklabels(ax.get_yticklabels(),fontsize=15,fontweight='medium')
img.tick_params(left=False,bottom=False)

In [None]:
# Horizontal bar chart of the first (up to) 20 rows of
# productsUsedAlongWithEngagementIndexDFBottom, each bar labelled with its 'EngagementIndex' value.
fig=plt.figure(figsize=(15,10))
gs=fig.add_gridspec(1,1)
ax=fig.add_subplot(gs[0,0])
for s in ['top','right','bottom','left']:
    ax.spines[s].set_visible(False)
ax.text(0,-1.5,'The products with highest engagement index across all districts(Used in less than 10 districts)',fontfamily='sans-serif',
                     ha='center',fontsize=20,fontweight='bold',color='#323232')

# Slice once so that the bars and their value labels are taken from the same rows.
bottomEngagementRows=productsUsedAlongWithEngagementIndexDFBottom[:20]
img=sns.barplot(y='index',x='EngagementIndex',data=bottomEngagementRows,ax=ax)
# Label the rows that actually exist instead of a fixed range(20): a hard-coded
# count raises KeyError when this bucket holds fewer than 20 products.
# NOTE(review): the value is rendered with a '%' suffix - confirm that the
# engagement index really is a percentage.
for i,value in enumerate(bottomEngagementRows['EngagementIndex']):
    engagementIndex=str(round(value,2))+'%'
    ax.text(0,i,engagementIndex,fontfamily='sans-serif',
                      fontsize=15,fontweight='bold',color='#323232',va='center')
img.set(ylabel=None,xticklabels=[],xlabel=None)
ax.set_yticklabels(ax.get_yticklabels(),fontsize=15,fontweight='medium')
img.tick_params(left=False,bottom=False)

# The plots below show a summary of features (engagement index, pct access, ...) for the locales (Suburb, Rural, City, Town)

In [None]:
# Per-locale summary: the mean of the district-level mean engagement index and
# pct_access, the number of districts, and the summed / average count of distinct
# products per district. Each district's engagement file is read through load().
localeData=OrderedDict()
for locale in ('Suburb','Rural','City','Town'):
    districtIds=dfDistrict.loc[dfDistrict['locale']==locale,'district_id']
    productCounts=[]
    engagementMeans=[]
    accessMeans=[]
    for districtId in districtIds:
        df=load('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'+str(districtId)+'.csv')
        productCounts.append(df['lp_id'].nunique())
        engagementMeans.append(df['engagement_index'].mean())
        accessMeans.append(df['pct_access'].mean())
    districtCount=len(productCounts)
    productTotal=sum(productCounts)
    localeData[locale]={
        'engagement_index':round(sum(engagementMeans)/districtCount,2),
        'pct_access':round(sum(accessMeans)/districtCount,2),
        'No of Districts':int(districtCount),
        'No of Products used':productTotal,
        'No of Products used per district(Avg)':int(productTotal/districtCount),
    }
# One row per locale; the locale name ends up in the 'index' column.
localeDataDF=pd.DataFrame(localeData).transpose().reset_index()

In [None]:
# 2x2 grid of horizontal bar charts, one per summary column of localeDataDF,
# with the value written next to every bar.
fig=plt.figure(figsize=(10,7))
gs=fig.add_gridspec(2,2)
ax1=fig.add_subplot(gs[0,0])
ax2=fig.add_subplot(gs[0,1])
ax3=fig.add_subplot(gs[1,0])
ax4=fig.add_subplot(gs[1,1])

# (axes, column plotted, panel title, label formatter).
# The two count columns are cast to int before formatting: localeDataDF is built
# from mixed int/float values, so counts would otherwise print as e.g. '104.0 districts'.
localePanels=[
    (ax1,'No of Districts','No of Districts',lambda v:str(int(v))+' districts'),
    (ax2,'engagement_index','Engagement Index',lambda v:str(v)+'%'),
    (ax3,'pct_access','Percentage Access',lambda v:str(v)+'%'),
    (ax4,'No of Products used per district(Avg)','No of Products used per district(Avg)',lambda v:str(int(v))+' products'),
]

for panelAx,column,panelTitle,formatValue in localePanels:
    sns.barplot(y='index',x=column,data=localeDataDF,ax=panelAx,palette='Blues')
    for s in ['top','right']:
        panelAx.spines[s].set_visible(False)
    panelAx.tick_params(left=False,bottom=False)
    panelAx.set(xticklabels=[],xlabel=None,ylabel=None)
    panelAx.set_title(panelTitle,fontweight='bold',fontsize=15,va='center',ha='center')
    # Bars are drawn in row order, so the row position is the bar's y coordinate.
    for i,value in enumerate(localeDataDF[column]):
        panelAx.text(0,i,formatValue(value),fontfamily='sans-serif',fontsize=15,fontweight='light',color='#323232')

fig.tight_layout()

In [None]:
# Disabled cell, kept for reference: plotly pie chart of the
# 'No of Products used per district(Avg)' column of localeDataDF, one slice per locale.
# pieData=localeDataDF['No of Products used per district(Avg)'].reset_index()
# pieData['Locale']=localeDataDF['index'].values
# pieData
# fig = px.pie(pieData, values = 'No of Products used per district(Avg)', 
#              names = 'Locale', width = 500, height = 500,title='The no of Products used in <br> each district of a region <br> in average')
# fig.update_traces(textposition = 'inside', 
#                   textinfo = 'percent + label', 
#                   marker = dict(colors = ['lightcyan','cyan','royalblue','darkblue'], line = dict(color = 'white', width = 2)))

# fig.update_layout(font=dict(
#         family="Courier New, monospace",
#         size=18,
#         color="RebeccaPurple"
#     ),showlegend = False)
# fig.show()

In [None]:
# Disabled cell, kept for reference: plotly pie chart of the number of districts
# per locale, taken from dfDistrict['locale'].value_counts().
# fig = px.pie(dfDistrict['locale'].value_counts().reset_index().rename(columns = {'locale': 'count'}), values = 'count', 
#              names = 'index', width = 500, height = 500,title='The no of Districts <br>in each region')

# fig.update_traces(textposition = 'inside', 
#                   textinfo = 'percent + label', 
#                   marker = dict(colors = ['lightcyan','cyan','royalblue','darkblue'], line = dict(color = 'white', width = 2)))

# fig.update_layout(font=dict(
#         family="Courier New, monospace",
#         size=18,
#         color="RebeccaPurple"
#     ),showlegend = False)
                  
# fig.show()

# The section below plots the engagement index and percentage access data for all days of 2020 for the different locales.

In the plot below, both percentage access and engagement index start out at higher levels. After the onset of the first COVID-19 wave (March 2020), however, there is a significant reduction in usage across all locales. During June 2020 - August 2020 there is very low access because of the summer vacation. After September the situation is restored, and all regions are back to their pre-pandemic levels of percentage access and engagement.

Another point to note is that the City-based engagement index and percentage access are lower than the Rural-based engagement index and percentage access.

In terms of percentage access of students, the Suburb region has seen an increase in levels after the pandemic, whereas the percentage access has gone down slightly in Rural areas.

There is an increase in engagement index for all regions except Town, where it remains in the same range.
(The Suburb region has a higher pct access and engagement index due to its higher number of districts.)


In [None]:
# Daily engagement index and pct_access per locale.
# For every district the rows are first averaged per day; the locale value for a
# day is then the mean of those district-level daily means over ALL districts of
# the locale that have data for that day.
# (Previously the per-date accumulator was re-created for every district, so each
# locale's series was taken from its last district only.)
index=0 ##For creating keys in timeAndLocaleWiseDat dictionary
timeAndLocaleWiseData=OrderedDict()
for locale in ['Suburb','Rural','City','Town']:
    districtDailyMeans=[]
    for i in dfDistrict[dfDistrict['locale']==locale]['district_id']:
        df=load('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'+str(i)+'.csv')
        # Only the two plotted measures are averaged; the result is indexed by 'time'.
        districtDailyMeans.append(df.groupby('time')[['engagement_index','pct_access']].mean())
    if not districtDailyMeans:
        continue ##No district in this locale - nothing to add
    # Stack the districts and average per date; groupby returns the dates in sorted
    # order and mean() ignores days for which a district has no value.
    localeDailyMeans=pd.concat(districtDailyMeans).groupby(level=0).mean()
    for m,row in localeDailyMeans.iterrows():
        timeAndLocaleWiseData.update({index:{'date':m,'locale':locale,
                                             'engagement_index':row['engagement_index'],
                                             'pct_access':row['pct_access']}})
        index=index+1
timeAndLocaleWiseDataDF=pd.DataFrame(timeAndLocaleWiseData).transpose()
# The transposed frame has object columns; restore proper dtypes for plotting.
timeAndLocaleWiseDataDF['date']=pd.to_datetime(timeAndLocaleWiseDataDF['date'])
timeAndLocaleWiseDataDF['engagement_index']=timeAndLocaleWiseDataDF['engagement_index'].astype('float')
timeAndLocaleWiseDataDF['pct_access']=timeAndLocaleWiseDataDF['pct_access'].astype('float')

In [None]:
# Two stacked line charts over time, one line per locale: engagement index on
# top, percentage access below.
fig=plt.figure(figsize=(15,10))
gs=fig.add_gridspec(2,1)
ax1=fig.add_subplot(gs[0,0])
ax2=fig.add_subplot(gs[1,0])

timePanels=[(ax1,'engagement_index','Engagement Index'),
            (ax2,'pct_access','Percentage Access')]

for panelAx,column,yLabel in timePanels:
    for side in ('top','right'):
        panelAx.spines[side].set_visible(False)
    sns.lineplot(x='date',y=column,data=timeAndLocaleWiseDataDF,hue='locale',palette='bright',ax=panelAx)
    panelAx.tick_params(left=False,bottom=False)
    # Bold axis labels replace the column names seaborn would show by default.
    panelAx.set_xlabel('Date',fontsize=15,fontweight='bold')
    panelAx.set_ylabel(yLabel,fontsize=15,fontweight='bold')

# The section below plots the percentage access and engagement index state-wise
Only states having more than 2 districts have been considered. Districts having fewer than 10000 records have also been left out.

In [None]:
##Some disricts have less data available. So I removed them from consideration
##Some disricts have less data available. So I removed them from consideration
# Work on a real copy: a plain assignment only creates a second name for the same
# frame, so dropping rows in place would silently remove them from dfDistrict too.
dfDistrictDuplicate=dfDistrict.copy()
# Districts whose engagement file holds fewer than 10000 records.
sparseDistrictIds=[
    i for i in dfDistrictDuplicate['district_id'].unique()
    if len(load('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'+str(i)+'.csv'))<10000
]
# Keep the remaining districts; the original row labels are preserved.
dfDistrictDuplicate=dfDistrictDuplicate[~dfDistrictDuplicate['district_id'].isin(sparseDistrictIds)]

In [None]:
# Unique state names of the remaining districts, in order of first appearance.
# Missing values are dropped explicitly instead of deleting whatever happens to
# sit at position 1 of the unique() array.
states=dfDistrictDuplicate['state'].dropna().unique()

In [None]:
# Per-state summary: the mean of the district-level mean engagement index and
# pct_access, plus the number of districts in the state.
stateWiseData=OrderedDict()
for state in states:
    district_ids=dfDistrictDuplicate[dfDistrictDuplicate['state']==state]['district_id'].values
    # Accumulate over all districts of the state. (Previously each district
    # overwrote the value, so the last district's mean was divided by the count.)
    engagementTotal=0
    pctAccessTotal=0
    for i in district_ids:
        df=load('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'+str(i)+'.csv')
        pctAccessTotal=pctAccessTotal+df['pct_access'].mean()
        engagementTotal=engagementTotal+df['engagement_index'].mean()
    stateWiseData.update({state:{'engagement_index':round(engagementTotal/len(district_ids),2),
                                 'pct_access':round(pctAccessTotal/len(district_ids),2),
                                'No of Districts': int(len(district_ids))}})

# One row per state; the state name is moved from the index into a 'State' column.
stateWiseDataDF=pd.DataFrame(stateWiseData).transpose().reset_index()
stateWiseDataDF=stateWiseDataDF.rename(columns={'index':'State'})


In [None]:
# Side-by-side horizontal bar charts (engagement index, percentage access) for
# the states with more than 2 districts, with the values written on the bars.
fig=plt.figure(figsize=(10,7))
gs=fig.add_gridspec(1,2)
ax1=fig.add_subplot(gs[0,0])
ax2=fig.add_subplot(gs[0,1])
for s in ['top','right','left']:
    ax1.spines[s].set_visible(False)
    ax2.spines[s].set_visible(False)

# Filter once and renumber the rows so that row position == bar position.
# (Previously the labels were read from rows 0..13 of the unfiltered frame and
# therefore did not belong to the bars they were drawn on.)
statesToPlot=stateWiseDataDF[stateWiseDataDF['No of Districts']>2].reset_index(drop=True)
sns.barplot(y='State',x='engagement_index',data=statesToPlot,ax=ax1)
sns.barplot(y='State',x='pct_access',data=statesToPlot,ax=ax2)

ax1.tick_params(left=False,bottom=False)
ax1.set(xlabel='Engagement Index',xticklabels=[])
ax1.set_xlabel(ax1.get_xlabel(),fontsize=15,fontweight='bold')
ax1.set_ylabel(ax1.get_ylabel(),fontsize=15,fontweight='bold')

ax2.tick_params(left=False,bottom=False)
ax2.set(xlabel='Percentage Access',ylabel=None,xticklabels=[])
ax2.set_xlabel(ax2.get_xlabel(),fontsize=15,fontweight='bold')

for i,row in statesToPlot.iterrows():
    engagementIndex=str(row['engagement_index'])+'%'
    pctAccess=str(row['pct_access'])+'%'
    districts=str(int(row['No of Districts']))+' districts'
    ax1.text(0,i,engagementIndex,fontfamily='sans-serif',
                      fontsize=15,fontweight='bold',color='#323232',va='center')
    ax2.text(0,i,pctAccess,fontfamily='sans-serif',
                      fontsize=15,fontweight='bold',color='#323232',va='center')
    ax2.text(0.2,i,districts,fontfamily='sans-serif',
                      fontsize=10,fontweight='medium',color='#323232',va='center')

fig.tight_layout()

# The set of plots below shows the top product offerings by Google and Microsoft specifically, and also the products of other providers, with their engagement index, pct access and the number of districts they are used in.

Google and Microsoft have the largest number of product offerings, so these two providers have been given special coverage.

In [None]:
# Summary of every Google product across the district files: mean engagement
# index, mean pct_access and the number of districts in which the product appears.
# Both means are plain averages of the per-district means. (Previously a running
# (old+new)/2 was used, which is not a mean and weights later districts much
# more heavily than earlier ones.)
googleProductsSummary=OrderedDict()
n=233 ##No of distrcts to be considered. In total there are 233 districts

# product name -> per-district means, averaged once every district has been read
googleDistrictMeans=OrderedDict()
# 'dirPath' instead of 'dir' so the builtin dir() is not shadowed in the notebook namespace.
for dirPath,_,fil in os.walk('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'):
    for f in fil[:n]:
        link=os.path.join(dirPath,f)
        df=load(link)
        df=pd.merge(df,dfProducts[['lp_id','Provider/Company Name','Product Name']],on='lp_id',how='inner')
        products=df[df['Provider/Company Name']=='Google LLC']['Product Name'].unique()
        # One grouped pass over the district instead of re-filtering the frame per product.
        districtMeans=(df[df['Product Name'].isin(products)]
                       .groupby('Product Name',sort=False)[['engagement_index','pct_access']].mean())
        for i,means in districtMeans.iterrows():
            perProduct=googleDistrictMeans.setdefault(i,{'engagement_index':[],'pct_access':[]})
            perProduct['engagement_index'].append(means['engagement_index'])
            perProduct['pct_access'].append(means['pct_access'])

for i,perProduct in googleDistrictMeans.items():
    # Series.mean() skips districts whose mean is NaN.
    googleProductsSummary[i]={'engagement_index':pd.Series(perProduct['engagement_index']).mean(),
                              'pct_access':pd.Series(perProduct['pct_access']).mean(),
                              'count':len(perProduct['engagement_index'])}

In [None]:
# Three horizontal bar charts of googleProductsSummary, each sorted by its own
# measure: engagement index, percentage access and number of districts.
fig=plt.figure(figsize=(20,10))
gs=fig.add_gridspec(1,3)
ax1=fig.add_subplot(gs[0,0])
ax2=fig.add_subplot(gs[0,1])
ax3=fig.add_subplot(gs[0,2])

ax2.text(0,-1,'Summary of Google Products across all districts',fontweight='bold',fontsize=20,fontfamily='sans-serif',ha='center')

# One row per product; built once and re-sorted for each panel.
googleSummaryDF=pd.DataFrame(googleProductsSummary).transpose()

googleEngagementIndexDF=googleSummaryDF.sort_values(by='engagement_index',ascending=False).reset_index()
sns.barplot(y='index',x='engagement_index',data=googleEngagementIndexDF,ax=ax1)
ax1.tick_params(left=None,bottom=None)
ax1.set(xticklabels=[])
ax1.set_ylabel('Products',fontsize=15,fontweight='bold')
ax1.set_xlabel('Engagement Index',fontsize=15,fontweight='bold')
ax1.set_xticklabels(ax1.get_xticklabels(),fontsize=15,fontweight='medium')

googlePctAccessDF=googleSummaryDF.sort_values(by='pct_access',ascending=False).reset_index()
sns.barplot(y='index',x='pct_access',data=googlePctAccessDF,ax=ax2)
ax2.tick_params(left=None,bottom=None)
ax2.set(xticklabels=[],ylabel=None)
ax2.set_xlabel('Percentage Access among Students',fontsize=15,fontweight='bold')

googleDistrictCountDF=googleSummaryDF.sort_values(by='count',ascending=False).reset_index()
sns.barplot(y='index',x='count',data=googleDistrictCountDF,ax=ax3)
ax3.tick_params(left=None,bottom=None)
ax3.set(xticklabels=[],ylabel=None)
ax3.set_xlabel('No of districts using the product',fontsize=15,fontweight='bold')

for s in ['top','right','left']:
    ax1.spines[s].set_visible(False)
    ax2.spines[s].set_visible(False)
    ax3.spines[s].set_visible(False)

# Label every product that is present instead of assuming exactly 29 of them.
for i in range(len(googleSummaryDF)):
    ax1.text(10,i,str(round(googleEngagementIndexDF.loc[i]['engagement_index'],2))+'%',fontsize=15,fontweight='medium',fontfamily='sans-serif',va='center')
    ax2.text(1,i,str(round(googlePctAccessDF.loc[i]['pct_access'],2))+'%',fontsize=15,fontweight='medium',fontfamily='sans-serif',va='center')
    ax3.text(5,i,str(int(googleDistrictCountDF.loc[i]['count']))+' districts',fontsize=15,fontweight='medium',fontfamily='sans-serif',va='center')

fig.tight_layout()

> **Google products are used in almost all districts, but only the top 6-7 products have a good engagement index and percentage access.**

In [None]:
# Summary of every Microsoft product across the district files: mean engagement
# index, mean pct_access and the number of districts in which the product appears.
# Both means are plain averages of the per-district means. (Previously a running
# (old+new)/2 was used, which is not a mean and weights later districts much
# more heavily than earlier ones.)
microsoftProductsSummary=OrderedDict()
n=233 ##No of distrcts to be considered. In total there are 233 districts

# product name -> per-district means, averaged once every district has been read
microsoftDistrictMeans=OrderedDict()
# 'dirPath' instead of 'dir' so the builtin dir() is not shadowed in the notebook namespace.
for dirPath,_,fil in os.walk('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'):
    for f in fil[:n]:
        link=os.path.join(dirPath,f)
        df=load(link)
        df=pd.merge(df,dfProducts[['lp_id','Provider/Company Name','Product Name']],on='lp_id',how='inner')
        products=df[df['Provider/Company Name']=='Microsoft']['Product Name'].unique()
        # One grouped pass over the district instead of re-filtering the frame per product.
        districtMeans=(df[df['Product Name'].isin(products)]
                       .groupby('Product Name',sort=False)[['engagement_index','pct_access']].mean())
        for i,means in districtMeans.iterrows():
            perProduct=microsoftDistrictMeans.setdefault(i,{'engagement_index':[],'pct_access':[]})
            perProduct['engagement_index'].append(means['engagement_index'])
            perProduct['pct_access'].append(means['pct_access'])

for i,perProduct in microsoftDistrictMeans.items():
    # Series.mean() skips districts whose mean is NaN.
    microsoftProductsSummary[i]={'engagement_index':pd.Series(perProduct['engagement_index']).mean(),
                                 'pct_access':pd.Series(perProduct['pct_access']).mean(),
                                 'count':len(perProduct['engagement_index'])}

In [None]:
# Three horizontal bar charts of microsoftProductsSummary, each sorted by its own
# measure: engagement index, percentage access and number of districts.
fig=plt.figure(figsize=(20,10))
gs=fig.add_gridspec(1,3)
ax1=fig.add_subplot(gs[0,0])
ax2=fig.add_subplot(gs[0,1])
ax3=fig.add_subplot(gs[0,2])

ax2.text(0,-1,'Summary of Microsft Products across all districts',fontweight='bold',fontsize=23,fontfamily='sans-serif',ha='center')

# One row per product; built once and re-sorted for each panel.
microsoftSummaryDF=pd.DataFrame(microsoftProductsSummary).transpose()

microsoftEngagementIndexDF=microsoftSummaryDF.sort_values(by='engagement_index',ascending=False).reset_index()
sns.barplot(y='index',x='engagement_index',data=microsoftEngagementIndexDF,ax=ax1)
ax1.tick_params(left=None,bottom=None)
ax1.set(xticklabels=[],ylabel=None)
ax1.set_xlabel('Engagement Index',fontsize=15,fontweight='bold')
ax1.set_yticklabels(ax1.get_yticklabels(),fontsize=15,fontweight='medium')

microsoftPctAccessDF=microsoftSummaryDF.sort_values(by='pct_access',ascending=False).reset_index()
sns.barplot(y='index',x='pct_access',data=microsoftPctAccessDF,ax=ax2)
ax2.tick_params(left=None,bottom=None)
ax2.set(xticklabels=[],ylabel=None)
ax2.set_xlabel('Percentage Access among Students',fontsize=15,fontweight='bold')
ax2.set_yticklabels(ax2.get_yticklabels(),fontsize=15,fontweight='medium')

microsoftDistrictCountDF=microsoftSummaryDF.sort_values(by='count',ascending=False).reset_index()
sns.barplot(y='index',x='count',data=microsoftDistrictCountDF,ax=ax3)
ax3.tick_params(left=None,bottom=None)
ax3.set(xticklabels=[],ylabel=None)
ax3.set_xlabel('No of districts using the product',fontsize=15,fontweight='bold')
ax3.set_yticklabels(ax3.get_yticklabels(),fontsize=15,fontweight='medium')

for s in ['top','right']:
    ax1.spines[s].set_visible(False)
    ax2.spines[s].set_visible(False)
    ax3.spines[s].set_visible(False)

# Label every product that is present instead of assuming exactly 6 of them.
for i in range(len(microsoftSummaryDF)):
    ax1.text(0,i,str(round(microsoftEngagementIndexDF.loc[i]['engagement_index'],2))+'%',fontsize=15,fontweight='medium',fontfamily='sans-serif',va='center')
    ax2.text(0,i,str(round(microsoftPctAccessDF.loc[i]['pct_access'],2))+'%',fontsize=15,fontweight='medium',fontfamily='sans-serif',va='center')
    ax3.text(0,i,str(int(microsoftDistrictCountDF.loc[i]['count']))+' districts',fontsize=15,fontweight='medium',fontfamily='sans-serif',va='center')

#ax1.text(0,32,'Google products are used in almost all districts but only top 6-7 products have good engagement index and percentage access',fontweight='medium',fontsize=15,fontfamily='sans-serif')
fig.tight_layout()

In [None]:
# Build one record per (engagement file, product) for every product whose provider is
# neither Google nor Microsoft: the product's mean engagement_index and pct_access in
# that file, plus the provider name. Records are keyed by a running integer.
providerProductsSummary=OrderedDict()
n=233 ##No of districts to be considered. In total there are 233 districts
iter_val=0
# First listed provider for each product name, computed once instead of re-scanning
# dfProducts for every product of every file.
providerByProduct=dfProducts.drop_duplicates('Product Name').set_index('Product Name')['Provider/Company Name']
# `root`/`files` replace `dir`/`fil`: binding `dir` at notebook level hides the builtin
# dir() for the rest of the kernel session.
for root,_,files in os.walk('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'):
    for f in files[:n]:
        link=os.path.join(root,f)
        df=load(link)
        df=pd.merge(df,dfProducts[['lp_id','Provider/Company Name','Product Name']],on='lp_id',how='inner')
        # na=False keeps the mask strictly boolean: a missing provider name is treated as
        # "not Google/Microsoft" instead of producing NaN, which cannot be inverted with ~.
        isGoogleOrMicrosoft=df['Provider/Company Name'].str.contains('Google LLC|Microsoft',na=False)
        products=df[~isGoogleOrMicrosoft]['Product Name'].unique()
        for i in products:
            # Select the product's rows once and reuse them for both metrics.
            productRows=df[df['Product Name']==i]
            engagement_index=productRows['engagement_index'].mean()
            pct_access=productRows['pct_access'].mean()
            # Fall back to the product name itself when no provider is known for it.
            provider=providerByProduct.get(i,i)
            providerProductsSummary[iter_val]={'Product':i,'engagement_index':engagement_index,
                                               'pct_access':pct_access,'Provider':provider}
            iter_val=iter_val+1

In [None]:
# One row per collected record; the two metric columns are cast to float explicitly
# so they can be aggregated numerically later on.
providerProductsSummaryDF=(
    pd.DataFrame(providerProductsSummary)
    .transpose()
    .astype({'engagement_index':'float','pct_access':'float'})
)

In [None]:
# Two side-by-side bar charts of the top providers (Google and Microsoft excluded):
# left panel ranked by mean engagement index, right panel by mean percentage access.
fig=plt.figure(figsize=(20,10))
gs=fig.add_gridspec(1,2)
ax1=fig.add_subplot(gs[0,0])
ax2=fig.add_subplot(gs[0,1])

ax2.text(0,-1,'Summary of Product Providers other than Google and Microsoft',fontweight='bold',fontsize=20,fontfamily='sans-serif',ha='center')

# Average only the two metric columns per provider. A bare groupby(...).mean() would
# also try to average the text 'Product' column, which raises TypeError on pandas >= 2.0.
providerMeans=providerProductsSummaryDF.groupby('Provider')[['engagement_index','pct_access']].mean()
providerProductsSummaryEngIndexDF=providerMeans.sort_values(by='engagement_index',ascending=False).reset_index()
providerProductsSummaryPctAccessDF=providerMeans.sort_values(by='pct_access',ascending=False).reset_index()

topN=10  # number of providers shown per panel
panels=[(ax1,providerProductsSummaryEngIndexDF.head(topN),'engagement_index','Engagement Index'),
        (ax2,providerProductsSummaryPctAccessDF.head(topN),'pct_access','Percentage Access among Students')]
for ax,topDF,metric,label in panels:
    sns.barplot(y='Provider',x=metric,data=topDF,ax=ax,palette='rainbow')
    ax.tick_params(left=None,bottom=None)
    ax.set(xticklabels=[],ylabel=None)
    ax.set_xlabel(label,fontsize=15,fontweight='bold')
    ax.set_yticklabels(ax.get_yticklabels(),fontsize=16,fontweight='medium')
    for s in ['top','right','left']:
        ax.spines[s].set_visible(False)
    # Label exactly the bars that were drawn. The previous fixed range(10) with .loc[i]
    # raised KeyError whenever fewer than ten providers were available.
    # NOTE(review): ' %' is appended to the engagement index too - confirm its unit.
    for i,value in enumerate(topDF[metric]):
        ax.text(0,i,str(round(value,2))+' %',fontsize=15,fontweight='medium')

# The below section plots the most used products in each state based on engagement index and percentage access
For each product, the engagement index and percentage access are first averaged within every district and then averaged over the districts (within a state) in which that product has been used.

In [None]:
# Collect, for every state -> district -> product, the product's mean pct_access and
# mean engagement_index in that district's engagement file (both rounded to 2 decimals).
# Records are keyed by a running integer n.
stateWiseProductData=OrderedDict()
n=0
engagementDir='../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'
for state in states:
    inState=dfDistrictDuplicate['state']==state
    district_ids=dfDistrictDuplicate[inState]['district_id'].values
    for districtId in district_ids:
        df=load(engagementDir+str(districtId)+'.csv')
        df=pd.merge(df,dfProducts[['Product Name','lp_id']],on='lp_id',how='inner')
        for productName in df['Product Name'].unique():
            # Rows of this product in the current district file, reused for both metrics.
            productRows=df[df['Product Name']==productName]
            pct_access=productRows['pct_access'].mean()
            engagement_index=productRows['engagement_index'].mean()
            stateWiseProductData[n]={
                'state':state,
                'district':districtId,
                'Product':productName,
                'engagement_index':round(engagement_index,2),
                'pct_access':round(pct_access,2),
            }
            n+=1

# One row per record, with the two metric columns cast to float.
stateWiseProductDataDF=(
    pd.DataFrame(stateWiseProductData)
    .transpose()
    .astype({'engagement_index':'float','pct_access':'float'})
)

# Products with highest engagement index in each state

In [None]:
# One panel per state: the five products with the highest mean engagement index,
# each bar annotated with its value and the number of districts using that product.
fig=plt.figure(figsize=(35,35))
gs=fig.add_gridspec(6,6)
nCols=5       # panels per row; the 6x6 grid is kept so panel sizes stay as before
maxPanels=22  # at most this many states are drawn
# Slicing instead of indexing states[n] avoids an IndexError when fewer states exist.
for n,state in enumerate(states[:maxPanels]):
    i,j=divmod(n,nCols)
    ax=fig.add_subplot(gs[i,j])
    DF=stateWiseProductDataDF[stateWiseProductDataDF['state']==state]
    # Aggregate the metric column only: a bare groupby(...).mean() also hits the text
    # columns of DF, which raises TypeError on pandas >= 2.0.
    DF1=DF.groupby('Product')['engagement_index'].mean().sort_values(ascending=False).reset_index().head(5)
    # Number of district records per product within this state.
    districtCounts=DF.groupby('Product')['district'].count()
    sns.barplot(y='Product',x='engagement_index',data=DF1,ax=ax,palette='rainbow')
    for k in range(len(DF1)):
        # Bug fix: use the product of bar k. The old code indexed DF1 with the grid-row
        # variable i, so every bar in a panel showed the same (wrong) district count and
        # a KeyError was raised when a state had fewer products than the row index.
        noOfDistricts=districtCounts.loc[DF1.loc[k,'Product']]
        ax.text(0,k,str(round(DF1.loc[k,'engagement_index'],2))+' ('+str(noOfDistricts)+' districts)',fontsize=20)
    for s in ['top','right','left']:
        ax.spines[s].set_visible(False)
    ax.set_title(state,fontsize=30,fontweight='bold')
    ax.tick_params(left=None,bottom=None)
    ax.set(xticklabels=[],ylabel=None,xlabel=None)
    ax.set_yticklabels(ax.get_yticklabels(),fontsize=20,fontweight='medium')

fig.tight_layout()

# Products with highest percentage access in each state

In [None]:
# One panel per state: the five products with the highest mean percentage access,
# each bar annotated with its value and the number of districts using that product.
fig=plt.figure(figsize=(35,35))
gs=fig.add_gridspec(6,6)
nCols=5       # panels per row; the 6x6 grid is kept so panel sizes stay as before
maxPanels=22  # at most this many states are drawn
# Slicing instead of indexing states[n] avoids an IndexError when fewer states exist.
for n,state in enumerate(states[:maxPanels]):
    i,j=divmod(n,nCols)
    ax=fig.add_subplot(gs[i,j])
    DF=stateWiseProductDataDF[stateWiseProductDataDF['state']==state]
    # Aggregate the metric column only: a bare groupby(...).mean() also hits the text
    # columns of DF, which raises TypeError on pandas >= 2.0.
    DF1=DF.groupby('Product')['pct_access'].mean().sort_values(ascending=False).reset_index().head(5)
    # Number of district records per product within this state.
    districtCounts=DF.groupby('Product')['district'].count()
    sns.barplot(y='Product',x='pct_access',data=DF1,ax=ax,palette='rainbow')
    for k in range(len(DF1)):
        # Bug fix: use the product of bar k. The old code indexed DF1 with the grid-row
        # variable i, so every bar in a panel showed the same (wrong) district count and
        # a KeyError was raised when a state had fewer products than the row index.
        noOfDistricts=districtCounts.loc[DF1.loc[k,'Product']]
        ax.text(0,k,str(round(DF1.loc[k,'pct_access'],2))+' ('+str(noOfDistricts)+' districts)',fontsize=20)
    for s in ['top','right','left']:
        ax.spines[s].set_visible(False)
    ax.set_title(state,fontsize=25,fontweight='bold')
    ax.tick_params(left=None,bottom=None)
    ax.set(xticklabels=[],ylabel=None,xlabel=None)
    ax.set_yticklabels(ax.get_yticklabels(),fontsize=20,fontweight='medium')
fig.tight_layout()


> **From the above two plots we can see the monopoly that Google and its products enjoy. Google products are the most frequently seen names in these plots, which reflects their quality and the ecosystem Google has created for them to thrive in. Other names that can be seen are Schoology, Kahoot and Zoom.**