In [None]:
# bokeh imports
from bokeh.io import output_notebook, show, reset_output
import bokeh
from bokeh.plotting import figure

# other imports
import numpy as np


# Data Cleaning

In [None]:
import pandas as pd

'''Engagement Data'''
# importing os module 
import os
  
# # Get the list of all files and directories in the root directory
# path = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data"
# dir_list = os.listdir(path)

#Read every per-district engagement CSV and tag its rows with the district id.
#The district id is only available from the file name (e.g. "1000.csv" -> "1000"),
#so it is written into a new column before the frames are concatenated.
#`Files` collects the tagged frames; they are concatenated once at the end.
Files=[]

for dirname, _, filenames in os.walk('/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'):
    #Sorted so the row order of the concatenated frame is reproducible across runs
    for filename in sorted(filenames):
        stem, extension=os.path.splitext(filename)
        if extension.lower()!='.csv':
            #Skip anything that is not a CSV instead of failing inside read_csv
            continue
        file_=pd.read_csv(os.path.join(dirname, filename))
        file_['district_id']=stem.strip()
        Files.append(file_)

#Fail with a clear message instead of pd.concat's generic "No objects to concatenate"
if not Files:
    raise FileNotFoundError('No engagement CSV files were found in the engagement_data directory')

#Concatenating all Dfs
df=pd.concat(Files,ignore_index=True)

#Dropping nulls
#Keep only rows that have all three engagement fields. dropna() returns a new
#frame and .copy() makes the independence explicit, so the district_id assignment
#below does not write into a slice of `df` (SettingWithCopyWarning).
df_1=df.dropna(subset=['pct_access','engagement_index','lp_id']).copy()

#Importing districts info
dis=pd.read_csv(r"../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
#district_id was taken from file names, so it is text; cast it to int before merging
#(assumes districts_info.csv stores district_id as integers - TODO confirm)
df_1['district_id']=df_1['district_id'].astype(str).astype(int)
df_2=df_1.merge(dis,how='left',on='district_id')

#Importing products info
prod=pd.read_csv(r"../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")
#Align the product key name with the engagement data so both frames share 'lp_id'
prod=prod.rename(columns={'LP ID':'lp_id'})
df_3=df_2.merge(prod,how='left',on='lp_id')


#Dropping nulls
#Rows without a state, a pct_free/reduced value or a sector are discarded.
#rename() returns a new frame, so nothing is modified through a slice of df_3.
df_4=(df_3.rename(columns={'Sector(s)':'Sectors'})
          .dropna(subset=['state','pct_free/reduced','Sectors']))

#Changing datatype
#reset_index() returns a new frame, so df_5 is independent of df_4 (previously
#df_5 was just an alias of df_4, and the reset_index result was thrown away).
df_5=df_4.reset_index(drop=True)
df_5['time']=pd.to_datetime(df_5['time'])

#Separating main and sub categories
import re
#'Primary Essential Function' is treated as "<main code> - <sub category>"
#(the previous slicing skipped exactly the code plus 3 separator characters).
#Partitioning on the first " - " keeps the whole main code whatever its length;
#the old x[:2] slice cut every code other than 'LC/CM/SDO' to two characters,
#so a standalone three-letter code such as 'SDO' would have become 'SD'.
#Missing values stay NaN instead of raising inside re.match.
_function_parts=df_5['Primary Essential Function'].str.partition(' - ')
df_5['Primary Essential Function-Main Category']=_function_parts[0].str.strip()
df_5['Primary Essential Function-Sub Category']=_function_parts[2].str.strip()

#Rearranging columns
#.copy() so that the columns added to df_6 later are written to an independent
#frame rather than to a slice of df_5
df_6=df_5[['time', 'lp_id', 'pct_access', 'engagement_index', 'district_id',
       'state', 'locale', 'pct_black/hispanic', 'pct_free/reduced',
       'county_connections_ratio', 'pp_total_raw', 'Product Name',
       'Provider/Company Name', 'Sectors',
       'Primary Essential Function-Main Category','Primary Essential Function-Sub Category']].copy()

# Cleaning pct_ columns
# The bucket columns are text ranges; '[' is the only bracket character the code
# strips (presumably values look like "[0.2, 0.4[" - confirm against the raw data).
# regex=False makes '[' a literal instead of relying on the version-dependent
# regex default of Series.str.replace.
for _bucket_col in ['pct_black/hispanic','pct_free/reduced','pp_total_raw']:
    df_6[_bucket_col]=df_6[_bucket_col].str.replace('[','',regex=False)

#hispanic/black--getting mean value from mentioned percentage class
# .str.split keeps missing buckets as NaN (apply + x.split raised on NaN)
_pct_bounds=df_6['pct_black/hispanic'].str.split(',',expand=True)
df_6['pct_black/hispanic_lower']=_pct_bounds[0].astype('float')
df_6['pct_black/hispanic_higher']=_pct_bounds[1].astype('float')

# Midpoint of the bucket, converted from a fraction to a percentage
df_6['pct_black/hispanic']=df_6[['pct_black/hispanic_lower','pct_black/hispanic_higher']].mean(axis=1)
df_6['pct_black/hispanic']=df_6['pct_black/hispanic']*100

#expenses by company--getting mean value from mentioned percentage class
# Rows without an expenditure bucket end up with NaN in all three pp_total_raw columns
_pp_bounds=df_6['pp_total_raw'].str.split(',',expand=True)
df_6['pp_total_raw_lower']=_pp_bounds[0].astype('float')
df_6['pp_total_raw_higher']=_pp_bounds[1].astype('float')

df_6['pp_total_raw']=df_6[['pp_total_raw_lower','pp_total_raw_higher']].mean(axis=1)

#We will drop county_connections_ratio because it has one value only to describe internet connectivity by state and 
#also we'll ignore all the columns not required in analysis


In this report, the analysis is organised around a set of questions about children's behaviour in digital learning; answering them gives additional insight into the data.
<br>**Contents of report:**
<br>**1.What is the picture of digital connectivity and engagement in 2020
<br>2.What is the effect of COVID-19 pandemic on online learning
<br>3.How did students engage with different education Technologies
<br>4.Companies having maximum engagement
<br>5.Demographic analysis of pct_access and Engagement Index 
<br>6.Socio Economic Impact on Education
<br>7.Financial Analysis
<br>Summary
<br>Possible Next Steps
<br> Web Links of external data used in this notebook**

# 1.	What is the picture of digital connectivity and engagement in 2020? 

In [None]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
#Converting data type
#Normalise `time` to timezone-naive UTC timestamps. Parsing as UTC and dropping
#the timezone with .dt.tz_localize(None) gives the same values as the previous
#`.astype('datetime64[ns]')`, which pandas 2.x rejects for timezone-aware data.
#`infer_datetime_format` is no longer passed because it is deprecated.
df_6['time']=pd.to_datetime(df_6['time'],utc=True).dt.tz_localize(None)

#Adding month column
df_6['month']=df_6['time'].dt.month

#Group by month
monthly_df=df_6.groupby('month')[['engagement_index','pct_access']].mean().reset_index()

from bokeh.models import HoverTool
from bokeh.models import LinearAxis, Range1d

# line plot - multiple lines
output_notebook()

# data
x = monthly_df.month
y1 = monthly_df.engagement_index
y2 = monthly_df.pct_access

# plot 
plot = figure(width=800,height=500)
plot.line(x, y1, color='red',legend_label='Engagement Index')
# scatter() draws circle markers by default; circle() is deprecated for this use in recent Bokeh
plot.scatter(x, y1, color='red')
plot.yaxis.axis_label = 'Engagement Index'

#Adding second y axis
#pct_access is on a much smaller scale than the engagement index, so it gets its own range
plot.extra_y_ranges = {"y2": Range1d(start = 0.02, end = 3)}
plot.add_layout(LinearAxis(y_range_name = "y2",axis_label='Digital Connectivity (Pct Access)'), 'right')
plot.line(x, y2, color = "blue", y_range_name = "y2",legend_label = 'Digital Connectivity (Pct Access)')
plot.scatter(x, y2, color = "blue", y_range_name = "y2")

#Adding labels
plot.xaxis.axis_label = 'Months'
plot.xaxis.ticker = list(range(1, 13))

#Adding hover tool
plot.add_tools(HoverTool(tooltips=[("Month", "$x"),("Y Value", "@y")]))

plot.legend.location = "top_center"
show(plot)


**Observations**
<br>1. There is a drop from june to august which can be explained by ***vacation period***
<br>2. **Digital Connectivity** = There is a **slight dip in March** in the average pct_access, which is defined as the 'Percentage of students at ***district*** level having at least one page-load event of any product and on a day'. This dip can be explained by the fact that different states announced lockdowns on different days in March, causing some uncertainty in the education sector.
<br>3. **Engagement** = The Engagement Index kept increasing from January to April. Because it is a metric measured per 1000 students, this means that the students who were accessing the platform engaged more with the products while lockdowns, school shutdowns and exams were happening.
<br>4. Looking at the graph it looks like there is a correlation among these two metrics. Let's find out....

In [None]:
#Row-level correlation matrix of the two engagement metrics
engagement_metrics=['pct_access','engagement_index']
df_6[engagement_metrics].corr()

Quite naturally, if more kids start accessing the platform it will result in high engagement with the products.

# 2. What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?

In [None]:
pip install openpyxl

In [None]:
#COVID case counts from the 'america' sheet; only the date and the new-case count are used
cases_df=pd.read_excel("../input/covid-cases/owid-covid-data.xlsx",sheet_name='america')
#.copy() so the 'month' column added in the next cell is written to an independent
#frame instead of a slice of the full sheet (avoids SettingWithCopyWarning)
cases_df=cases_df[['date','new_cases']].copy()

In [None]:
from bokeh.plotting import figure
#Adding month column
cases_df['month']=pd.to_datetime(cases_df['date']).dt.month

#Group by month
#NOTE(review): grouping by calendar month pools every year present in the sheet -
#confirm the 'america' sheet only covers 2020 before comparing with the 2020 engagement data
cases_monthly_df=cases_df[['month','new_cases']].groupby(['month']).mean().reset_index()

#Merging cases_monthly_df and monthly_df on 'month' column
q2_df=monthly_df.merge(cases_monthly_df,how='left',on='month')

# data for first plot
x = q2_df.month
y1 = q2_df.engagement_index
y2 = q2_df.new_cases

# plot 
plot = figure(width=800,height=500)
plot.line(x, y1, color='red',legend_label='Engagement Index')
# scatter() draws circle markers by default; circle() is deprecated for this use in recent Bokeh
plot.scatter(x, y1, color='red')
plot.yaxis.axis_label = 'Engagement Index'
plot.y_range = Range1d(0, 400)

#Adding second y axis
#Case counts are orders of magnitude larger than the engagement index, so they get their own range
plot.extra_y_ranges = {"y2": Range1d(start = 0, end = 300000)}
plot.add_layout(LinearAxis(y_range_name = "y2",axis_label='Daily Cases'), 'right')
plot.line(x, y2, color = "blue", y_range_name = "y2",legend_label = 'Daily Cases')
plot.scatter(x, y2, color = "blue", y_range_name = "y2")

#Adding labels
plot.xaxis.axis_label = 'Months'
plot.xaxis.ticker = list(range(1, 13))

#Adding hover tool
plot.add_tools(HoverTool(tooltips=[("Month", "$x"),("Y Value", "@y")]))

plot.legend.location = "top_center"
show(plot)

Even if we ignore the vacation period there is no particular trend or correlation among the COVID cases rising and engagement index which makes sense as engagement index can be drastically affected only under two conditions which are schools opening or lockdown restricting kids to home. That increase is evident for January, February, March and April. Once the lockdown happened and students were at home, that resulted in high and constant engagement index when compared to kids going to school (again ignore the vacation period). Same reasoning can be given for Pct Access. Lets check their correlation once.

In [None]:
q2_df[['engagement_index','pct_access','new_cases']].corr()

***Fluctuations in COVID cases do not play any role in student's engagement or digital connectivity***

#  3. How did student engage with different types of education technology ?

There are two major types of education technology that students majorly interact with. Those are:
<br>**1. Linear Learning** - This kind of learning refers to independent learning activities in which students complete modules at their own pace.
<br>**2. Collaborative Learning** - This kind of learning involves social and human interaction, where people get together in teams to solve a problem. It feels more like a real classroom, with students and teachers interacting.

#### 3.a Lets see which sub category under Linear Learning has highest engagement index and what all products fall in those categories

In [None]:
#Mean engagement index for every (main category, sub category) pair,
#then only the pairs whose main category is 'LC'
category_keys=['Primary Essential Function-Main Category','Primary Essential Function-Sub Category']
ed_tech_df=(
    df_6[category_keys+['engagement_index']]
    .groupby(category_keys)
    .mean()
    .reset_index()
)
is_lc=ed_tech_df['Primary Essential Function-Main Category']=='LC'
ed_tech_df1=ed_tech_df.loc[is_lc]

In [None]:
from bokeh.models import ColumnDataSource, ranges, LabelSet

#Data
#One bar per sub category; engagement is truncated to int so the bar labels stay short
x=ed_tech_df1['Primary Essential Function-Sub Category'].tolist()
y=ed_tech_df1.engagement_index.astype('int').tolist()
source = ColumnDataSource(dict(x=x,y=y))

#Creating figure
graph = figure(x_range=x,title = "Student highly engage in Technical Skills Development")
graph.vbar(x=x,top =y,width = 0.5)

#Labelling
graph.xaxis.major_label_orientation = 'vertical'
graph.yaxis.axis_label = 'Engagement Index'
graph.y_range = Range1d(0, 1300)
#render_mode is no longer passed: 'canvas' was its default and Bokeh 3.0 removed the argument
labels = LabelSet(x='x', y='y', text='y', level='glyph',
        x_offset=-13.5, y_offset=0,source=source)
graph.add_layout(labels)

#Plotting
show(graph)


**Observations:**
<br> **Students are highly engaged in Technical Skills Development courses.** Students are also highly engaged in Content Creation and Streaming Services. Let's see which products have high engagement in these categories.

In [None]:
# Products engagement in Skill development courses, Content creation and Streaming Services
_lc_sub_categories=['Sites, Resources & Reference - Streaming Services',
                    'Content Creation & Curation',
                    'Online Course Providers & Technical Skills Development']
temp=df_6.loc[df_6['Primary Essential Function-Sub Category'].isin(_lc_sub_categories),
              ['Primary Essential Function-Sub Category','Product Name','engagement_index']]
# Only the numeric column is averaged: `temp` also holds the text sub-category
# column, and a bare .mean() over it raises TypeError on pandas >= 2.0
ed_tech_df1=temp.groupby(['Product Name'])[['engagement_index']].mean().reset_index()
temp1=temp.groupby(['Primary Essential Function-Sub Category','Product Name'])[['engagement_index']].mean().reset_index()
# Products without any engagement value cannot be ranked, so they are dropped
temp2=temp1.dropna(subset=['engagement_index']).reset_index(drop=True)

In [None]:
import numpy as np
from bokeh.plotting import figure
from bokeh.io import curdoc, show
from bokeh.models import ColumnDataSource, Grid, HBar, LinearAxis, Plot

from bokeh.layouts import row

def _top5_products(sub_category):
    """Return the 5 products of `sub_category` in `temp2` with the highest mean engagement index."""
    in_category=temp2['Primary Essential Function-Sub Category']==sub_category
    return temp2.loc[in_category].sort_values(by='engagement_index',ascending=False).head(5).reset_index(drop=False)

def _product_hbar(chart_df, y_label, x_max, label_x_offset):
    """Build a horizontal bar chart of engagement index per product.

    chart_df: frame with 'Product Name' and 'engagement_index' columns.
    y_label: axis label for the product axis.
    x_max: upper bound of the engagement axis.
    label_x_offset: horizontal pixel offset of the value labels.
    Returns the bokeh figure (not shown).
    """
    names=chart_df['Product Name'].tolist()
    values=chart_df.engagement_index.astype('int').tolist()
    source=ColumnDataSource(dict(x=values,y=names))
    # width/height replace plot_width/plot_height, which Bokeh 3.0 removed
    chart=figure(width=500, height=300, y_range=names, toolbar_location=None)
    chart.hbar(y='y', right='x', source=source, height=0.5, line_color='white', color="#9ecae1")
    chart.xaxis.axis_label="Engagement Index"
    chart.yaxis.axis_label=y_label
    chart.yaxis.major_label_orientation=0
    chart.x_range=Range1d(0, x_max)
    # render_mode is no longer passed: 'canvas' was its default and Bokeh 3.0 removed it
    chart.add_layout(LabelSet(x='x', y='y', text='x', level='glyph',
                              x_offset=label_x_offset, y_offset=0, source=source))
    return chart

#First chart on Content Creation and Curation
chart1=_top5_products('Content Creation & Curation')
p=_product_hbar(chart1, "Content Creation Products", 12500, 0)

#Second chart on Technical Skills Development
chart2=_top5_products('Online Course Providers & Technical Skills Development')
q=_product_hbar(chart2, "Technical Skills Development Products", 3500, 5)

#Third chart on Streaming Services
chart3=_top5_products('Sites, Resources & Reference - Streaming Services')
r=_product_hbar(chart3, "Streaming Services Products", 3800, 0)

show(row(p,q,r))

**Observations**
<br>1. **Google Docs is the most used product among Content Creation products.** The second most engaged product is Seesaw, which
is a student-driven digital portfolio that empowers students to independently document what they are learning at school.
<br>2. There are two mostly used products for technical skill development - **EdOptions Academy and Canvas**.Canvas is a web-based learning management system, or LMS used by learning institutions, educators, and students to access and manage online course learning materials and communicate about skill development and learning achievement. 
<br>3. **Youtube** is highly used among streaming service products.

#### 3.b Lets see which sub category under Collaborative Learning has highest engagement index and what all products fall in those categories

In [None]:
#Mean engagement index for every (main category, sub category) pair,
#then only the pairs whose main category is 'CM'
category_keys=['Primary Essential Function-Main Category','Primary Essential Function-Sub Category']
ed_tech_df=(
    df_6[category_keys+['engagement_index']]
    .groupby(category_keys)
    .mean()
    .reset_index()
)
is_cm=ed_tech_df['Primary Essential Function-Main Category']=='CM'
ed_tech_df1=ed_tech_df.loc[is_cm]

In [None]:
from bokeh.models import ColumnDataSource, ranges, LabelSet
from bokeh.plotting import figure
#Data
#One bar per sub category; engagement is truncated to int so the bar labels stay short
x=ed_tech_df1['Primary Essential Function-Sub Category'].tolist()
y=ed_tech_df1.engagement_index.astype('int').tolist()
source = ColumnDataSource(dict(x=x,y=y))

#Creating figure
graph = figure(x_range=x,title = "Students highly engage in Classroom Interaction")
graph.vbar(x=x,top =y,width = 0.5)

#Labelling
graph.xaxis.major_label_orientation = 'vertical'
graph.yaxis.axis_label = 'Engagement Index'
graph.y_range = Range1d(0, 800)
#render_mode is no longer passed: 'canvas' was its default and Bokeh 3.0 removed the argument
labels = LabelSet(x='x', y='y', text='y', level='glyph',
        x_offset=-13.5, y_offset=0,source=source)
graph.add_layout(labels)

#Plotting
show(graph)


**Observations:**
<br>**Students are highly engaged when they have to respond in class or share their screen while communicating.** Let's see which products have high engagement in these categories.

In [None]:
# Products engagement in collaborative Learning sub categories
_cm_sub_categories=['Classroom Engagement & Instruction - Assessment & Classroom Response',
                    'Virtual Classroom - Video Conferencing & Screen Sharing']
temp=df_6.loc[df_6['Primary Essential Function-Sub Category'].isin(_cm_sub_categories),
              ['Primary Essential Function-Sub Category','Product Name','engagement_index']]
# Only the numeric column is averaged: `temp` also holds the text sub-category
# column, and a bare .mean() over it raises TypeError on pandas >= 2.0
ed_tech_df1=temp.groupby(['Product Name'])[['engagement_index']].mean().reset_index()
temp1=temp.groupby(['Primary Essential Function-Sub Category','Product Name'])[['engagement_index']].mean().reset_index()
# Products without any engagement value cannot be ranked, so they are dropped
temp2=temp1.dropna(subset=['engagement_index']).reset_index(drop=True)

In [None]:
import numpy as np
from bokeh.plotting import figure
from bokeh.io import curdoc, show
from bokeh.models import ColumnDataSource, Grid, HBar, LinearAxis, Plot

#One entry per chart: (sub category, product-axis label, engagement-axis upper bound, label x offset)
#First chart on Classroom Response, second chart on Virtual Classrooms
_cm_chart_specs=[
    ('Classroom Engagement & Instruction - Assessment & Classroom Response',
     "Assessment & Classroom Response Products", 1100, 0),
    ('Virtual Classroom - Video Conferencing & Screen Sharing',
     "Virtual Classroom - Video Conferencing & Screen Sharing", 2500, 5),
]

_cm_figures=[]
for _sub_category, _y_label, _x_max, _label_x_offset in _cm_chart_specs:
    #Top 5 products of the sub category by mean engagement index
    chart=temp2.loc[temp2['Primary Essential Function-Sub Category']==_sub_category].sort_values(by='engagement_index',ascending=False).head(5).reset_index(drop=False)
    y =chart['Product Name'].tolist()
    x = chart.engagement_index.astype('int').tolist()
    source=ColumnDataSource(dict(x=x,y=y))

    #Creating Figure
    #width/height replace plot_width/plot_height, which Bokeh 3.0 removed
    _chart_figure = figure(width=500, height=400, y_range = y,toolbar_location=None)
    _chart_figure.hbar(y='y', right='x', source=source, height=0.5, line_color='white', color ="#9ecae1")
    #Labelling
    _chart_figure.xaxis.axis_label="Engagement Index"
    _chart_figure.yaxis.axis_label=_y_label
    _chart_figure.yaxis.major_label_orientation = 0
    _chart_figure.x_range = Range1d(0, _x_max)
    #render_mode is no longer passed: 'canvas' was its default and Bokeh 3.0 removed it
    labels = LabelSet(x='x', y='y', text='x', level='glyph',
            x_offset=_label_x_offset, y_offset=0,source=source)
    _chart_figure.add_layout(labels)
    _cm_figures.append(_chart_figure)

p,q=_cm_figures
show(row(p,q))

**Observations**
<br>1. **Google Forms** is the most used product in Assessment and response products.
<br>2.**Zoom and Meet** are the most preferred choice for Virtual Classrooms.

### 4. Companies having maximum product engagement 

In [None]:
#Highest single engagement_index value recorded for each provider; top 10 providers
company_df=df_6[['Provider/Company Name','engagement_index']].groupby('Provider/Company Name').max().sort_values(by='engagement_index',ascending=False).head(10).reset_index()

#data
y =company_df['Provider/Company Name'].tolist()
x = company_df.engagement_index.astype('int').tolist()
source=ColumnDataSource(dict(x=x,y=y))

#Creating Figure
#width/height replace plot_width/plot_height, which Bokeh 3.0 removed
p = figure(width=500, height=400, y_range = y,toolbar_location=None)
p.hbar(y='y', right='x', source=source, height=0.5, line_color='white', color ="#9ecae1")

#Labelling
p.xaxis.axis_label="Engagement Index"
p.yaxis.axis_label="Company Name"
p.yaxis.major_label_orientation = 0
p.x_range = Range1d(0, 200000)
#render_mode is no longer passed: 'canvas' was its default and Bokeh 3.0 removed it
labels = LabelSet(x='x', y='y', text='x', level='glyph',
        x_offset=0, y_offset=0,source=source)
p.add_layout(labels)
show(p)

**Observations:**
<br> 1. **Google LLC provides services - Google Forms, YouTube, Meet** and they had high engagement as we saw in previous section
<br> 2. **Instructure ltd is providing Canvas** which is highly used product for Technical Skill development
<br> 3. Students use Zoom for online classroom interactions which are provided by **Zoom Video Communications**, Inc 

# 5.Demographic analysis of pct_access and Engagement Index 

### 5.a State Level Engagement Index 

In [None]:
from matplotlib import figure
from bokeh.plotting import figure

#CHART 1 SHOWING STATEWISE ENGAGEMENT INDEX
#Mean engagement per state, sorted ascending so the highest state is drawn at the top
state_df=df_6[['state','engagement_index']].groupby('state').mean().sort_values(by='engagement_index').reset_index()

#data for engagement_index
y =state_df['state'].tolist()
x = state_df.engagement_index.astype('int').tolist()
source=ColumnDataSource(dict(x=x,y=y))

#Creating Figure
#width/height replace plot_width/plot_height, which Bokeh 3.0 removed
p = figure(width=500, height=900, y_range = y,toolbar_location=None)
p.hbar(y='y', right='x', source=source, height=0.5, line_color='white', color ="#FDE724")

#Labelling
p.xaxis.axis_label="Engagement Index"
p.yaxis.axis_label="State"
p.yaxis.major_label_orientation = 0
p.x_range = Range1d(0, 1000)
#render_mode is no longer passed: 'canvas' was its default and Bokeh 3.0 removed it
labels = LabelSet(x='x', y='y', text='x', level='glyph',
        x_offset=0, y_offset=-10,source=source)
p.add_layout(labels)

show(p)

**Observations:**
<br> **New York, North Dakota and New Hampshire** are the highest performing states 

### 5b. Districts having high pct_access 

In [None]:
from matplotlib import figure
from bokeh.plotting import figure

#CHART 1 Showing District wise pct access
#Top 10 districts by mean pct_access, re-sorted ascending so the highest is drawn at the top
dis_df=df_6[['district_id','pct_access']].groupby('district_id').mean().sort_values(by='pct_access',ascending=False).reset_index().head(10)
dis_df=dis_df.sort_values(by='pct_access')

#data for pct_access
#district ids are converted to text because they are used as categorical axis factors
y =dis_df['district_id'].astype('str').tolist()
x = dis_df.pct_access.astype('int').tolist()
source=ColumnDataSource(dict(x=x,y=y))

#Creating Figure
#width/height replace plot_width/plot_height, which Bokeh 3.0 removed
p = figure(width=500, height=700, y_range = y,toolbar_location=None)
p.hbar(y='y', right='x', source=source, height=0.5, line_color='white', color ="#FDE724")

#Labelling
p.xaxis.axis_label="Pct Access"
p.yaxis.axis_label="District"
p.yaxis.major_label_orientation = 0
p.x_range = Range1d(0, 8)
#render_mode is no longer passed: 'canvas' was its default and Bokeh 3.0 removed it
labels = LabelSet(x='x', y='y', text='x', level='glyph',
        x_offset=0, y_offset=-10,source=source)
p.add_layout(labels)

show(p)

**Observations:**
<br> Students in **district 3692** have the highest pct access to the products

### 5c.Locale Trend on Engagement Index 

In [None]:
from matplotlib import pyplot as plt
import numpy as np

#Label factory for the pie charts: each wedge shows its percentage and the
#absolute value that percentage corresponds to
def autopct_func(values):
    """Return a callback for matplotlib's `autopct` argument.

    values: the numbers drawn in the pie; their sum converts a wedge
    percentage back into an absolute amount (rounded to an integer).
    The callback takes the wedge percentage and returns 'pct%  (value)'.
    """
    def _wedge_label(pct):
        absolute = int(round(pct * sum(values) / 100.0))
        return '{p:.2f}%  ({v:d})'.format(v=absolute, p=pct)
    return _wedge_label

#Mean engagement index per locale, sorted ascending
locale_df=df_6[['locale','engagement_index']].groupby('locale').mean().sort_values(by='engagement_index').reset_index()

locale_label = locale_df.locale.tolist()
engagement_values = locale_df.engagement_index.tolist()

#Named `fig`, not `figure`, so bokeh's `figure` function imported earlier is not
#overwritten by the matplotlib Figure object
fig,ax1=plt.subplots(1,1)
#No `explode` argument: the previous all-zero tuple was a no-op and raised
#whenever the number of locales was not exactly four
ax1.pie(engagement_values, radius=2
        ,labels = locale_label,autopct=autopct_func(engagement_values),colors=['#7CDDDD','#FF2E7E','#FF7300','#FFEC00'])

**Observations:**
<br>**Rural engagement is highest** followed by **suburb and town kids at almost same level** and city kids are little behind. Could it be because city kids have more distractions or our dataset is missing locale among states?? Lets dig in...

### 5d. Locale percentages in top 3 states

In [None]:
def autopct_func(values):
    """Build the `autopct` callback used by the pie charts.

    values: the numbers drawn in the pie. The returned callback receives a
    wedge percentage and formats it together with the matching absolute
    value (percentage of sum(values), rounded to an integer).
    """
    label_template = '{p:.2f}%  ({v:d})'

    def autopct_func2(pct):
        #convert the percentage back to the value it represents
        return label_template.format(p=pct, v=int(round(pct * sum(values) / 100.0)))

    return autopct_func2


#Mean engagement index per (state, locale); computed once and reused for all three pies
_state_locale_means=df_6[['state','locale','engagement_index']].groupby(['state','locale']).mean().reset_index()

#CHART 1 - New York, CHART 2 - North Dakota, CHART 3 - New Hampshire
#Each entry is (state, wedge colours); the colour lists are the ones used per state before
_top_state_pies=[
    ('New York',['#7CDDDD','#FFEC00','#FF7300']),
    ('North Dakota',['#FFEC00']),
    ('New Hampshire',['#FFEC00']),
]

#Named `fig`, not `figure`, so bokeh's `figure` function is not overwritten
fig,(ax1,ax2,ax3)=plt.subplots(1, 3)
for _ax,(_state_name,_colors) in zip((ax1,ax2,ax3),_top_state_pies):
    c=_state_locale_means.loc[_state_locale_means.state==_state_name]
    c=c.dropna(axis=0)

    locale_label = c.locale.tolist()
    engagement_values = c.engagement_index.tolist()

    _ax.axis('equal')
    _ax.set_title(_state_name,loc='left')
    #No `explode`: the all-zero tuple was a no-op and failed if a state did not have exactly three locales
    _ax.pie(engagement_values, radius=2
            ,labels = locale_label,autopct=autopct_func(engagement_values),colors=_colors)

#Plot Adjustments
plt.subplots_adjust(left=-0.5,
                    bottom=0.1, 
                    right=2, 
                    top=0.5, 
                    wspace=0.4, 
                    hspace=0.4)
plt.show()

**Observations:**
<br>So it seems like **our dataset does not have values for locale of North Dakota and New Hampshire**. So nothing can be said about the analysis for sure

### 5e. Locale Percentages in bottom 3 states

In [None]:
#Mean engagement index per (state, locale); computed once and reused for all three pies
_state_locale_means=df_6[['state','locale','engagement_index']].groupby(['state','locale']).mean().reset_index()

#CHART 1 - Florida, CHART 2 - North Carolina, CHART 3 - Washington
#Each entry is (state, wedge colours); the colour lists are the ones used per state before
_bottom_state_pies=[
    ('Florida',['#FF7300']),
    ('North Carolina',['#7CDDDD','#FFEC00','#FF7300']),
    ('Washington',['#7CDDDD','#FF7300']),
]

#Named `fig`, not `figure`, so bokeh's `figure` function is not overwritten
fig,(ax1,ax2,ax3)=plt.subplots(1, 3)
for _ax,(_state_name,_colors) in zip((ax1,ax2,ax3),_bottom_state_pies):
    c=_state_locale_means.loc[_state_locale_means.state==_state_name]
    c=c.dropna(axis=0)

    locale_label = c.locale.tolist()
    engagement_values = c.engagement_index.tolist()

    _ax.axis('equal')
    _ax.set_title(_state_name,loc='left')
    _ax.pie(engagement_values, radius=2
            ,labels = locale_label,autopct=autopct_func(engagement_values),colors=_colors)

#Plot Adjustments
plt.subplots_adjust(left=-0.5,
                    bottom=0.1, 
                    right=2, 
                    top=0.5, 
                    wspace=0.4, 
                    hspace=0.4)
plt.show()

# 6. Socio Economic status Impact on Education

### 6a. Percentage of children living in poverty and pct_access

In [None]:
#Importing and preparing data of children living in poverty
#Column labels are cast to text so the 2018 column can be selected as '2018'
poverty_path=r"../input/poverty-stats/percentage of children living in poverty.xlsx"
pov_df=pd.read_excel(poverty_path,skiprows=3)
pov_df.columns = pov_df.columns.astype(str)
pov_df=pov_df[['region','2018']]

#Merging with main df
#Each engagement row receives the 2018 poverty value of its state (left join on state name)
temp=(
    df_6[['state','pct_access']]
    .merge(pov_df,how='left',left_on='state',right_on='region')
    [['state','pct_access','2018']]
)
state_means=temp.groupby('state').mean().reset_index()
temp_1=state_means.sort_values(by='pct_access',ascending=False)
temp_1.reset_index(drop=True)

In [None]:
#States left out of the scatter plot and of the correlation that follows
excluded_states=['New York','Utah','Minnesota','Washington']
temp_2=temp_1.loc[~temp_1['state'].isin(excluded_states)]
# data 
y1 = temp_2['pct_access']
y2 = temp_2['2018']
plt.scatter(y1,y2)
#setting axes labels
plt.xlabel("Pct_Access")
plt.ylabel("Percentage of children living in poverty")

In [None]:
#after dropping 4 states
#Only the numeric columns are correlated: temp_2 also holds the text column
#'state', and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
temp_2[['pct_access','2018']].corr()

**Observations:**
<br> **If we drop 4 states - New York, Utah, Minnesota and Washington - we get a negative correlation between Pct Access and the percentage of children in poverty**. Pct Access describes district-level engagement, and the graph above shows that where the percentage of children living in poverty is high, engagement in learning is low.

### 6b. Social class impact on education 

In [None]:
#State-level means of the black/hispanic share and of pct_access, highest share first
state_share=(
    df_6[['state','pct_black/hispanic','pct_access']]
    .groupby('state')
    .mean()
    .sort_values(by='pct_black/hispanic',ascending=False)
    .reset_index()
)

#Filtering to see impact of higher percentage of social class presence
above_20_pct=state_share['pct_black/hispanic']>20
temp=state_share.loc[above_20_pct]

In [None]:
y=temp['pct_black/hispanic']
x1=temp['pct_access']

#Creating figure
#Named `fig`, not `figure`, so bokeh's `figure` function is not overwritten
fig,ax1=plt.subplots(1, 1)
ax1.scatter(x1,y)
#Setting axes labels
ax1.set_xlabel("Pct_Access")
ax1.set_ylabel("Percentage of black/hispanic kids")

#Only the numeric columns are correlated: temp also holds the text column
#'state', and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
temp[['pct_black/hispanic','pct_access']].corr()

**Observations:**
<br> If we look at **states with more than 20% black/hispanic students, we see that as the percentage of black/hispanic students increases, district-level access to digital learning decreases.** This may point to serious racial inequality in educational opportunities, although a correlation alone cannot establish the cause.

# 7. Financial Analysis

This analysis is performed on the mean value of the expenditure buckets provided in the dataset. The means for these buckets were calculated in the data cleaning section. The hypothesis is that the higher the expenditure, the higher the engagement with products and the higher the pct access.

In [None]:
#Checking correlation
#Rows without an expenditure value are left out; the rest is averaged per expenditure level
has_expenditure=df_6['pp_total_raw'].notnull()
temp=df_6[['district_id','pp_total_raw','pct_access','engagement_index']].loc[has_expenditure]
temp_1=(
    temp[['pp_total_raw','pct_access','engagement_index']]
    .groupby('pp_total_raw')
    .mean()
    .sort_values(by='pp_total_raw')
    .reset_index()
)
temp_1.corr()

In [None]:
temp_1

Something looks odd about the 5000 expenditure bucket. We'll look at all the districts with this level of expenditure to understand whether it is an outlier or not.

In [None]:
#Per-district means for the rows whose expenditure value is 5000
in_5000_bucket=temp.pp_total_raw==5000.0
temp.loc[in_5000_bucket].groupby('district_id').mean()

Lets investigate further which product is performing exceptionally in district_id 6762

In [None]:
#Rows of district 6762 in the 5000 expenditure bucket.
#The mask is built from df_6 only: the previous version mixed in a mask from
#`temp` (a subset of df_6) and depended on implicit index alignment. Comparing
#with 5000.0 is already False for missing values, so no separate null check is needed.
_in_district_6762=(df_6['pp_total_raw']==5000.0)&(df_6['district_id']==6762)
v=df_6.loc[_in_district_6762,['Product Name','district_id','pp_total_raw','pct_access','engagement_index']]
#Mean metrics per product, highest pct_access first
u=v.groupby('Product Name').mean().reset_index().sort_values(by='pct_access',ascending=False)
u.loc[u['pct_access'].isnull()==False]

Google provides docs, drive and classroom services for free. So low expenditure districts are using it heavily thus resulting in high engagement and pct_access. So, we'll drop this 5000 expenditure bucket for our analysis

In [None]:
#Correlation matrix with the 5000 expenditure level left out
not_5000=temp_1['pp_total_raw'].ne(5000.0)
temp_1[not_5000].corr()

Correlation increases after dropping 5000 expenditure case. So **if company is spending more on schools it is resulting in more engagement of students**

In [None]:
#Remove the 5000 expenditure level from temp_1 for the charts below
keep_rows=~temp_1['pp_total_raw'].eq(5000.0)
temp_1=temp_1[keep_rows]

In [None]:
from bokeh.plotting import figure

#First chart on engagement_index and Expenditures
#Expenditure levels are converted to text because they are used as categorical axis factors
y =temp_1['pp_total_raw'].astype('str').tolist()
x = temp_1.engagement_index.astype('int').tolist()
source=ColumnDataSource(dict(x=x,y=y))

#Creating Figure
#width/height replace plot_width/plot_height, which Bokeh 3.0 removed
p = figure(width=500, height=400, y_range = y,toolbar_location=None)
p.hbar(y='y', right='x', source=source, height=0.5, line_color='white', color ="#9ecae1")

#Labelling
p.xaxis.axis_label="Engagement Index"
p.yaxis.axis_label="Expenditures"
p.yaxis.major_label_orientation = 0
p.x_range = Range1d(0, 1100)
#render_mode is no longer passed: 'canvas' was its default and Bokeh 3.0 removed it
labels = LabelSet(x='x', y='y', text='x', level='glyph',
        x_offset=0, y_offset=0,source=source)
p.add_layout(labels)

#Second chart on pct_access and Expenditures
y =temp_1['pp_total_raw'].astype('str').tolist()
x = temp_1.pct_access.tolist()
#Bar lengths must stay numeric; the two-decimal text goes into its own column
#and is used only for the labels (previously the strings replaced the bar values)
x_text = [ '%.2f' % elem for elem in x]
source=ColumnDataSource(dict(x=x,y=y,x_text=x_text))

#Creating Figure
q = figure(width=500, height=400, y_range = y,toolbar_location=None)
q.hbar(y='y', right='x', source=source, height=0.5, line_color='white', color ="#9ecae1")

#Labelling
q.xaxis.axis_label="Pct Access"
q.yaxis.axis_label="Expenditures"
q.yaxis.major_label_orientation = 0
q.x_range = Range1d(0, 4)
labels = LabelSet(x='x', y='y', text='x_text', level='glyph',
        x_offset=5, y_offset=0,source=source)
q.add_layout(labels)

show(row(p,q))

**Observations**
<br> Higher expenditure goes along with higher student engagement in digital learning. **Spending appears to be working to get students involved in digital learning.** 

# Summary

In [None]:
from IPython.display import Image
Image("../input/summary/digital_learning.png")

# Possible Next Steps

<br> 1. This analysis includes only the external factors responsible for students' engagement with digital learning; it does not include the psychological factors that motivate students to engage with products. These could be incorporated by surveying students for feedback on the products.
<br> 2. A robust time series model can be built once enough data is available to forecast engagement, which can help to optimise expenditure in the future.

 # Web Links of Data used in this Notebook

COVID Cases - https://github.com/owid/covid-19-data/tree/master/public/data/
<br>Kids Percentage living in poverty - https://www.aecf.org/resources/2020-kids-count-data-book/auxiliary-materials


## **If you found this notebook insightful then do not forget to upvote it. Thank you for reading till here.**