In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Render floating-point values with three decimal places in displayed tables.
pd.options.display.precision = 3

# District metadata; joined onto the engagement rows via district_id further down.
districts_info = pd.read_csv(
    '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
)
# Product metadata; joined onto the engagement rows via its "LP ID" column further down.
products_info = pd.read_csv(
    '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
)

In [None]:
## Combine every per-district CSV in the "engagement_data" folder into one frame.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'

# glob returns matches in arbitrary order; sort so the row order of the
# combined frame is the same on every run.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" error further down.
    raise FileNotFoundError(f"No engagement CSV files found under {path!r}")

lst = []

for filename in all_files:
    district_df = pd.read_csv(filename, index_col=None, header=0)
    # The district id is taken from the file name. Use the whole stem rather
    # than a fixed 4-character slice so ids of any length are parsed correctly.
    # NOTE(review): assumes files are named "<district_id>.csv" -- confirm.
    file_stem = os.path.splitext(os.path.basename(filename))[0]
    district_df['district_id'] = int(file_stem)
    lst.append(district_df)

# Stack all districts vertically with a fresh 0..n-1 index.
frame = pd.concat(lst, axis=0, ignore_index=True)

In [None]:
## Consolidate engagement rows, district metadata and product metadata into a
## single table so that all the data needed for the analysis is in one place.

# Give the product key the same column name the engagement data uses.
products_info = products_info.rename(columns={"LP ID": "lp_id"})

# Attach district attributes first, then product attributes
# (both use the default join type of DataFrame.merge).
result = frame.merge(districts_info, on='district_id')
final_csv = result.merge(products_info, on='lp_id')

In [None]:
# Parse the raw 'time' column into datetimes. pd.to_datetime is used because
# casting to the unit-less 'datetime64' dtype is rejected by pandas 2.x
# (an explicit unit such as 'datetime64[ns]' is required there).
final_csv['time'] = pd.to_datetime(final_csv['time'])
# Calendar month number of each record; the month-wise aggregations below group on it.
final_csv['month'] = final_csv['time'].dt.month

In [None]:
# Mean engagement_index for every (product, month) pair, with the group keys
# turned back into ordinary columns.
monthly_engagement = final_csv.groupby(['Product Name', 'month'], sort=True)['engagement_index'].mean()
df = monthly_engagement.reset_index()

#Now that our data is ready, we will start exploring to answer the questions asked

In [None]:
# Display the per-product, per-month mean engagement table.
df

In [None]:
# Collect the distinct products together with the number of monthly rows each
# one has in df; this decides which products get a month-on-month chart.
rows_per_product = df['Product Name'].value_counts()
unique_prods = dict(rows_per_product)

#How does student engagement with different types of education technology change over the course of the pandemic? ------------ Out of 369 products, 346 have engagement data for at least 10 months of the year and 322 have engagement throughout the year. For simplicity, I have plotted only the products that have engagement data for at least 10 months. The engagement pattern differs from product to product: some had more engagement at the start of the year, while others gained popularity in the later part. For most products, engagement was lowest in Jul'20.

In [None]:
# Draw one bar chart of mean engagement per month for every product that has
# at least 10 monthly rows; products with fewer rows are skipped.
for product_name, month_count in unique_prods.items():
    if month_count < 10:
        continue
    product_rows = df[df['Product Name'] == product_name]
    product_rows.plot(kind='bar', x='month', y='engagement_index', title=product_name)

## What is the picture of digital connectivity and engagement in 2020?
#df1 is a filtered DataFrame that keeps only the rows where there was at least one page-load event for a given product on a given day. Here it is also visible that July'20 had the lowest count of students (i.e. the count of students who had at least one page-load event).

In [None]:
# Keep only the records whose pct_access value is positive.
has_access = final_csv['pct_access'] > 0
df1 = final_csv[has_access]

# Line chart of how many of those records fall in each month.
records_per_month = df1.groupby(['month'], sort=True)['lp_id'].count().reset_index()
records_per_month.plot(x='month', y='lp_id', title='StudentCount')

#How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?

In [None]:
# Number of records in df1 for each (month, locale) combination.
locale_groups = df1.groupby(['month', 'locale'], sort=True)
df2 = locale_groups['lp_id'].count().reset_index()

In [None]:
# Display the month-by-locale record counts.
df2

In [None]:
# The graph below shows that most engagement comes from "Suburbs", followed by
# "City" as a distant second.
# The "Town" area has a very low engagement rate.
locale_by_month = df2.pivot(index='month', columns='locale', values='lp_id')
locale_by_month.plot(title='StudentCount by Locale')

In [None]:
# The following graph shows engagement by sector.
# Courses offered for Pre-12 are the most popular.
# Products offered to school/college students and corporate employees have a large number of takers.
sector_groups = df1.groupby(['month', 'Sector(s)'], sort=True)
df3 = sector_groups['lp_id'].count().reset_index()

# One line per sector, months on the x-axis.
sector_by_month = df3.pivot(index='month', columns='Sector(s)', values='lp_id')
sector_by_month.plot(figsize=(20, 10), title='StudentCount by Sector(s)')

In [None]:
# Record count in df1 for each county_connections_ratio value (displayed as the cell output).
df1.groupby(['county_connections_ratio'], sort=True)['lp_id'].count().reset_index()

#The majority of records fall into a single broadband (county_connections_ratio) category, so this may not give any insight into high/low engagement.

In [None]:
# Distribution of records across the pct_black/hispanic values (ethnicity).
ethinic_dist = df1.groupby(['pct_black/hispanic'], sort=True)['lp_id'].count().reset_index()

# Express each count as a fraction of all counted records.
total_records = ethinic_dist['lp_id'].sum()
ethinic_dist['%lp_id'] = ethinic_dist['lp_id'] / total_records

ethinic_dist.plot.barh(x='pct_black/hispanic', y='%lp_id', title='EthnicityDistribution')

In [None]:
# Monthly record counts split by pct_black/hispanic value, one line per value.
ethnicity_groups = df1.groupby(['month', 'pct_black/hispanic'], sort=True)
ethinic_dist_monthly = ethnicity_groups['lp_id'].count().reset_index()
ethnicity_by_month = ethinic_dist_monthly.pivot(index='month', columns='pct_black/hispanic', values='lp_id')
ethnicity_by_month.plot(ylabel='lp_id', title='StudentCount by pct_black/hispanic')

# Now we will see some State wise stats

In [None]:

# Total record count in df1 per state, drawn as horizontal bars sorted by count.
state_wise = df1.groupby(['state'])['lp_id'].count().reset_index()
state_wise_sorted = state_wise.sort_values('lp_id', ascending=True)
state_wise_sorted.plot.barh(x='state', y='lp_id', title='state wise')

In [None]:
# Record counts in df1 for every (pp_total_raw, state) pair.
expenditure_groups = df1.groupby(['pp_total_raw', 'state'])
state_wise_distribution = expenditure_groups['lp_id'].count().reset_index()

In [None]:
# States as rows, pp_total_raw values as columns; combinations with no records become 0.
expenditure_counts = state_wise_distribution.pivot(index='state', columns='pp_total_raw', values='lp_id')
state_expenditure = expenditure_counts.fillna(value=0)

# Stacked horizontal bars, one bar per state, sorted by state in descending order.
states_descending = state_expenditure.sort_values('state', ascending=False)
states_descending.plot(kind='barh', stacked=True, figsize=(20, 10), title='Per-pupil total expenditure category')

In [None]:
#State/Federal Expenditure in Different Category
# Display the table of record counts per state (rows) and pp_total_raw value (columns).
state_expenditure

#If we look at the table and the two graphs above: ##1. We don't have state expenditure data for the state "Connecticut", which has the highest engagement. ##2. "Utah", the state with the 2nd highest engagement, has most of its engagement in the [10000,12000[ and [6000,8000[ categories. ##3. "Illinois" and "Massachusetts" have high engagement in expenditure category. ##4. "New York", "New Jersey" and "District of Columbia" have expenditure in the upper categories.

#Now we will compare total products with engagement, i.e. for each state and month, the number of all product records versus the number of product records where engagement is seen.

In [None]:

# Record counts per (month, state): once for the filtered frame df1 and once
# for the full frame final_csv.
month_state = ['month', 'state']
non_zero_engagement = df1.groupby(month_state)['lp_id'].count().reset_index()  # Products where engagement is seen
all_engagement = final_csv.groupby(month_state)['lp_id'].count().reset_index()  # All products

In [None]:
# Line up the engaged and overall counts for each (month, state) pair.
# After the merge, lp_id_x holds the engaged count and lp_id_y the overall count.
merged_df = non_zero_engagement.merge(all_engagement, on=['month', 'state'])

# Fraction of records that show engagement.
merged_df['%engagement'] = merged_df['lp_id_x'] / merged_df['lp_id_y']

# States as rows, months as columns.
merged_df_pivot = merged_df.pivot(index='state', columns='month', values='%engagement')

In [None]:
# Display the state-by-month table of %engagement values.
merged_df_pivot

In [None]:
# Stacked horizontal bars: one bar per state, one segment per month.
merged_df_pivot.plot.barh(stacked=True, figsize=(20, 10), title='MoM %engagement')

#States like 'North Dakota' and 'Minnesota' have engagement for only a few months. #States where state and federal expenditure is in a higher bucket (refer to the "state_expenditure" table above) have a high conversion ratio (i.e. more page-load events seen month on month).