In [None]:
# importing useful packages 
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import glob 
import os


# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

**Merging all engagement_data files into one data frame**

In [None]:
# Directory containing the engagement_data CSV files that are merged below.
file_dir = '/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'
# IPython shell escape: list the contents of that directory.
!ls $file_dir

In [None]:
# Collect the path of every CSV file located directly inside file_dir.
csv_pattern = os.path.join(file_dir, "*.csv")
files = glob.glob(csv_pattern)

In [None]:
# Read every engagement CSV and tag its rows with a district_id taken from the
# file name without its extension, then stack all files into one frame.
# os.path.splitext is used instead of str.strip(".csv"): strip() removes any
# leading/trailing '.', 'c', 's' or 'v' characters rather than the literal
# ".csv" suffix, so it would mangle names starting or ending with those letters.
engagement_data = [
    pd.read_csv(file).assign(
        district_id=os.path.splitext(os.path.basename(file))[0]
    )
    for file in files
]
engagementDF = pd.concat(engagement_data, ignore_index=True)
engagementDF.head()

# **Handling missing data**

In [None]:
# Load the districts table and preview its first rows.
districts_path = '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
districtsDF = pd.read_csv(districts_path)
districtsDF.head()

In [None]:
# Load the products table and preview its first rows.
products_path = '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
productsDF = pd.read_csv(products_path)
productsDF.head()

**Engagement data missing values**

In [None]:
# Number of missing values in each column of the merged engagement data.
engagementDF.isna().sum()

In [None]:
# Keep only the rows that have a value in "engagement_index".
has_engagement = engagementDF['engagement_index'].notna()
engagementDF = engagementDF[has_engagement]
engagementDF

In [None]:
# Re-check the missing-value counts after removing rows without an engagement_index.
engagementDF.isna().sum()

In [None]:
# Remove every row that still has a missing value in any column.
engagementDF = engagementDF.dropna(axis=0, how='any')
engagementDF

In [None]:
# Confirm that no missing values remain in the engagement data.
engagementDF.isna().sum()

**Districts data missing values**

In [None]:
# Number of missing values in each column of the districts table.
districtsDF.isna().sum()

In [None]:
# About half of the 'pp_total_raw' values are missing, so the whole column is removed.
districtsDF = districtsDF.drop(columns='pp_total_raw')
districtsDF.shape

In [None]:
# Drop the districts for which 'state', 'locale' and 'pct_black/hispanic' are
# all missing; how='all' keeps a row if at least one of the three is present.
key_columns = ['state', 'locale', 'pct_black/hispanic']
districtsDF = districtsDF.dropna(subset=key_columns, how='all')
districtsDF.head(30)

In [None]:
# Inspect the Massachusetts districts: most of the remaining nulls come from this state.
districtsDF[districtsDF['state'].eq('Massachusetts')]

In [None]:
# Exclude every Massachusetts district from the table.
not_massachusetts = districtsDF['state'].ne('Massachusetts')
districtsDF = districtsDF.loc[not_massachusetts]
districtsDF

In [None]:
# Missing-value counts per column after removing Massachusetts.
districtsDF.isna().sum()

In [None]:
# Remove the few district rows that still contain a missing value in any column.
districtsDF = districtsDF.dropna(axis=0, how='any')
districtsDF

In [None]:
# Confirm that no missing values remain in the districts table.
districtsDF.isna().sum()

**Products data missing values**

In [None]:
# Number of missing values in each column of the products table.
productsDF.isna().sum()

In [None]:
# Drop the products for which both 'Sector(s)' and 'Primary Essential Function'
# are missing; how='all' keeps a row if at least one of the two is present.
product_columns = ['Sector(s)', 'Primary Essential Function']
productsDF = productsDF.dropna(subset=product_columns, how='all')
productsDF

In [None]:
# Missing-value counts per column after cleaning the products table.
productsDF.isna().sum()

**Merging cleaned data frames**

In [None]:
# Inner-join engagement rows with product details: engagement 'lp_id' matches products 'LP ID'.
dataframe = engagementDF.merge(productsDF, left_on='lp_id', right_on='LP ID')

In [None]:
# district_id was derived from file names, so cast it to int before joining on it.
dataframe = dataframe.astype({'district_id': int})
dataframe.info()

In [None]:
# Attach the district attributes to every engagement/product row (inner join on district_id).
df = dataframe.merge(districtsDF, on='district_id')
df

In [None]:
# Parse the "time" column into pandas datetime values.
df = df.assign(time=pd.to_datetime(df['time']))

# **Correlations**

In [None]:
# Integer-encode every column with pd.factorize, then compute the pairwise
# Pearson correlation of those codes.
# NOTE(review): factorize numbers values in order of first appearance, so the
# coefficients for non-ordinal columns are hard to interpret — confirm this is
# the intended measure.
def _as_codes(column):
    """Return the integer codes pd.factorize assigns to the values of `column`."""
    return pd.factorize(column)[0]


encoded = df.apply(_as_codes)
correlation = encoded.corr(method='pearson', min_periods=1)
correlation

In [None]:
# Annotated heatmap of the correlation matrix on a large figure.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(correlation, cmap="YlGnBu", annot=True, ax=ax)

there's a slight visible correlation between:
* pct_free/reduced and state
* pct_free/reduced and locale
* pct_black/hispanic and pct_free/reduced

# **The Covid-19 impact on digital learning in 2020**

**How does the engagement change over time?**

In [None]:
# Add the calendar month number of each record as a 'month' column.
df = df.assign(month=df['time'].dt.month)
df.info()

In [None]:
# Scatter of every engagement_index value against its date; alpha reveals point density.
fig, ax = plt.subplots()
ax.scatter(df['time'], df['engagement_index'], alpha=0.2)
ax.set_title('engagement index')
plt.show()

* The sharpest rise in engagement happened in March, due to the first lockdowns.

In [None]:
# Line plot of the median engagement_index for each calendar month.
monthly_median = df.groupby('month')['engagement_index'].median()
monthly_median.plot(title='median engagement index')

* There's a sharp rise in engagement in March.
* Engagement with digital learning drops during summer holidays and rises again when the school year starts which means engagement is closely connected to the course of the school year

**Importing additional data**

Data from The New York Times, based on reports from state and local health agencies.
http://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html

In [None]:
# Covid-19 case data for US states, (c) The New York Times.
cases_path = '../input/us-states/us-states.csv'
casesDF = pd.read_csv(cases_path)
casesDF.head()

In [None]:
# Column names, dtypes and non-null counts of the cases table.
casesDF.info()

In [None]:
# Parse the 'date' column into pandas datetime values and re-check the dtypes.
casesDF = casesDF.assign(date=pd.to_datetime(casesDF['date']))
casesDF.info()

In [None]:
# Restrict the case data to the year(s) covered by the engagement data before
# grouping by calendar month. Grouping on the month number alone would pool
# e.g. January of one year with January of the next whenever the cases file
# spans more than one year, which distorts the month-by-month comparison with
# the engagement plots.
engagement_years = df['time'].dt.year.unique()
casesDF = casesDF[casesDF['date'].dt.year.isin(engagement_years)].copy()
casesDF['month'] = casesDF['date'].dt.month
# NOTE(review): if `cases` is a running (cumulative) total per state, summing it
# over days overstates the monthly volume — confirm against the data source.
casesDF.groupby(['month']).cases.sum().plot(title='sum of Covid-19 cases in each month')

* The number of Covid-19 cases grows until August and then drops dramatically. In contrast to this, on the previous plots we can see that engagement index grew in August after a fall between April and July. It shows that the pandemic made digital learning more popular but later on the engagement correlates more with the course of the school year.

In [None]:
# First ten rows of the per-month sum of the 'cases' column.
monthly_cases = casesDF.groupby(['month'])['cases'].sum()
monthly_cases.iloc[:10]

**What does engagement with digital learning look like in different states?**

In [None]:
# Horizontal bars: one bar per state, aggregated with seaborn's default estimator.
sns.barplot(x="engagement_index", y="state", data=df)

* New York, North Dakota and New Hampshire have the highest engagement index.

In [None]:
# Number of merged records per state.
df['state'].value_counts()

In [None]:
# Median engagement index for every (state, month) pair.
# The median is used instead of the mean because it is less sensitive to extreme scores.
pvt_states = df.pivot_table(
    index=["state", "month"],
    values="engagement_index",
    aggfunc='median',
)
pvt_states

In [None]:
# Distinct state names present in the cases data frame.
casesDF['state'].unique()

In [None]:
# Monthly sum of the 'cases' column for every (state, month) pair.
pvt_cases = casesDF.pivot_table(
    index=["state", "month"],
    values="cases",
    aggfunc='sum',
)
pvt_cases

In [None]:
# Plot the monthly sum of cases and the monthly median engagement index, one
# line per state. A single colour map is shared by both figures so that a
# state keeps the same colour in each of them.
STATE_COLORS = {
    'Connecticut': '#800000',
    'Utah': '#e6194B',
    'Illinois': '#fabed4',
    'California': '#9A6324',
    'Ohio': '#f58231',
    'Missouri': '#ffd8b1',
    'Washington': '#808000',
    'Indiana': '#ffe119',
    'North Carolina': '#fffac8',
    'Virginia': '#bfef45',
    'New York': '#3cb44b',
    'New Jersey': '#aaffc3',
    'Wisconsin': '#469990',
    'Florida': '#42d4f4',
    'Michigan': '#000075',
    'Texas': '#4363d8',
    'New Hampshire': '#911eb4',
    'Minnesota': '#dcbeff',
    'North Dakota': '#f032e6',
}


def plot_state_lines(pivot, title):
    """Draw one line per state from a (state, month)-indexed pivot table.

    Parameters
    ----------
    pivot : DataFrame whose first index level is the state name.
    title : str, title shown above the figure.

    Raises
    ------
    KeyError if a state listed in STATE_COLORS is absent from `pivot`.
    """
    plt.figure(figsize=(8, 5))
    plt.subplot()
    for state, color in STATE_COLORS.items():
        plt.plot(pivot.loc[state], label=state, color=color)
    plt.title(title)
    # Place the legend outside the axes so it does not cover the lines.
    plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
    plt.show()


plot_state_lines(pvt_cases, 'Covid-19 cases in each state')
plot_state_lines(pvt_states, 'Median engagement index in each state')

* New York and New Hampshire have the highest engagement scores.
* North Dakota seems to have a different tendency than the other states.
* There's no visible connection between the number of Covid-19 cases and engagement index. The states with the most Covid-19 cases are Connecticut, Texas and Florida while the highest engagement was observed in New York, New Hampshire and North Dakota.

In [None]:
# Zoom in on North Dakota's monthly median engagement.
north_dakota = pvt_states.loc['North Dakota']
plt.plot(north_dakota)
plt.title('engagement index in North Dakota (January - March)')
plt.show()

In [None]:
# Count the North Dakota records per month to see whether sparse data explains the pattern.
north_dakota_rows = df.loc[df['state'] == 'North Dakota']
north_dakota_rows.groupby(['state', 'month'])['month'].count()

* There seems to be a fall in engagement after January 2020 but it might be due to the lack of records in later months. 

In [None]:
# Number of records for every (state, month) pair.
df.groupby(['state', 'month'])['month'].count()

* North Dakota has fewer records than the other states, but its engagement index in January 2020 remains the highest among the states. I think it's an interesting case to analyse, but unfortunately there's no data for other months.

# **Products analysis**

**Functions**

In [None]:
# General function of each product: the text before the first '-' in
# 'Primary Essential Function'. The text is kept as-is, including any trailing
# space, which later lookups such as pvt.loc['LC '] depend on.
df['function'] = df['Primary Essential Function'].str.split('-').str.get(0)

In [None]:
# Preview the newly created 'function' column.
df['function']

In [None]:
# Left panel: record count per function. Right panel: median engagement per function.
fig, (ax_counts, ax_median) = plt.subplots(1, 2, figsize=(10, 5))
function_counts = df['function'].value_counts()
function_counts.plot(kind='bar', title='number of records', ax=ax_counts)

function_median = df.groupby(['function'])['engagement_index'].median()
function_median.plot(kind='bar', title='median engagement', ax=ax_median)

* The most frequently occurring function is "Learning & Curriculum"
* The function with the highest median engagement index is "School & District Operations"

In [None]:
# Median engagement index for every (function, month) pair.
pvt = df.pivot_table(
    index=["function", "month"],
    values="engagement_index",
    aggfunc='median',
)
pvt

In [None]:
# Show the distinct values of each index level (function, month) of the pivot table.
pvt.index.levels

In [None]:
# One line per product function. The keys keep the trailing space that the
# 'function' column values carry.
function_labels = {
    'CM ': "Classroom Management",
    'LC ': "Learning & Curriculum",
    'LC/CM/SDO ': "Other",
    'SDO ': "School & District Operations",
}
for function_key, legend_label in function_labels.items():
    plt.plot(pvt.loc[function_key], label=legend_label)
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.show()

* The sharpest rise can be seen in the "School & District Operations" function. It's probably due to schools having to quickly adjust to new ways of teaching and administration.

**Engagement in different sectors**

In [None]:
# Left panel: record count per sector. Right panel: median engagement per sector.
fig, (ax_counts, ax_median) = plt.subplots(1, 2, figsize=(10, 5))
sector_counts = df['Sector(s)'].value_counts()
sector_counts.plot(kind='bar', title='number of records', ax=ax_counts)

sector_median = df.groupby(['Sector(s)'])['engagement_index'].median()
sector_median.plot(kind='bar', title='median engagement', ax=ax_median)

* The most frequently occurring sector is PreK-12
* The sector with the highest engagement index is the Corporate sector

**Companies and products**

In [None]:
# The 10 most frequently occurring companies, by number of records.
provider_counts = df['Provider/Company Name'].value_counts().sort_values(ascending=False)
provider_counts.iloc[:10]

In [None]:
# Bar chart of the 10 providers with the most records.
top_providers = df['Provider/Company Name'].value_counts().sort_values(ascending=False).iloc[:10]
top_providers.plot(kind='bar', title='top 10 providers')

In [None]:
# Bar chart of the 20 products with the most records.
top_products = df['Product Name'].value_counts().sort_values(ascending=False).iloc[:20]
top_products.plot(kind='bar', title='20 most popular products')

* Google's products are most popular

**How does student engagement with different types of education technology change over the course of the pandemic?**

In [None]:
# Median engagement index for every (provider, month) pair.
pvt2 = df.pivot_table(
    index=["Provider/Company Name", "month"],
    values="engagement_index",
    aggfunc='median',
)
pvt2

In [None]:
# Monthly median engagement for ten providers, one line each, in a fixed order.
providers_to_plot = [
    'Google LLC',
    'PBS',
    'IXL Learning',
    'Microsoft',
    'The Wikimedia Foundation',
    'Dictionary.com',
    'Houghton Mifflin Harcourt',
    'The College Board',
    'Teaching.com',
    'McGraw-Hill PreK-12',
]
for provider in providers_to_plot:
    plt.plot(pvt2.loc[provider], label=provider)
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.title('median engagement for 10 most frequently occuring companies')
plt.show()

* Engagement with Google's products was the highest beginning in March.
* Teaching.com was the first one to experience a sharp rise in engagement.
* McGraw-Hill PreK-12's products had the highest median engagement index before the Covid-19 outbreak in the US.

# **Engagement index and different district characteristics**

In [None]:
# Smaller data set: the engagement index plus the district characteristics analysed below.
selected_columns = [
    'engagement_index',
    'state',
    'locale',
    'pct_black/hispanic',
    'pct_free/reduced',
    'county_connections_ratio',
]
data = df[selected_columns]
data

In [None]:
# Column names, dtypes and non-null counts of the reduced data set.
data.info()

**Locale**

In [None]:
# Left panel: number of records per locale. Right panel: median engagement per locale.
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
data['locale'].value_counts().plot(kind='bar', title='number of records')

plt.subplot(1,2,2)
# sns.barplot aggregates with the mean by default; the median is requested
# explicitly so that the bars match the 'median engagement' title.
sns.barplot(data=data, x="locale", y="engagement_index", estimator=np.median).set(title='median engagement')

* Most of the districts are located in suburbs.
* Rural districts definitely have the highest median engagement index which is rather surprising compared to what we might initially guess.

**Percentage of students identifying as Black or Hispanic**

In [None]:
# Left panel: number of records per 'pct_black/hispanic' bracket.
# Right panel: median engagement per bracket.
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
data['pct_black/hispanic'].value_counts().plot(kind='bar', title='number of records')

plt.subplot(1,2,2)
# Sort the brackets so the bars appear in increasing order, as is done for the
# 'pct_free/reduced' chart.
order = sorted(data['pct_black/hispanic'].unique())
# sns.barplot aggregates with the mean by default; the median is requested
# explicitly so that the bars match the 'median engagement' title.
sns.barplot(data=data, x="pct_black/hispanic", y="engagement_index", order=order, estimator=np.median).set(title='median engagement')

* What's interesting is that engagement with digital learning is the highest in districts where there's a majority of Black and/or Hispanic students, even though there's the least number of districts in this category. We can observe how the bars get smaller with the rise of percentage of Black or Hispanic students but when it reaches 80-100% level the bar becomes much larger.

**Percentage of students eligible for free or reduced-price lunch**

In [None]:
# Left panel: number of records per 'pct_free/reduced' bracket.
# Right panel: median engagement per bracket, with the brackets in sorted order.
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
data['pct_free/reduced'].value_counts().plot(kind='bar', title='number of records')

plt.subplot(1,2,2)
order = sorted(data['pct_free/reduced'].unique())
# sns.barplot aggregates with the mean by default; the median is requested
# explicitly so that the bars match the 'median engagement' title.
sns.barplot(data=data, x="pct_free/reduced", y="engagement_index", order=order, estimator=np.median).set(title='median engagement')

* The engagement with digital learning is the highest in districts where there's a majority of students eligible for free or reduced-price lunch. The tendency is similar to the previous chart. It might be surprising because we would initially think that the poorer areas would display lower engagement due to difficulties with access to the digital learning technologies. In this case the 80-100% bar is almost twice as high as the 0-20% bar which is the second highest one. 

In [None]:
# Left panel: number of records per 'county_connections_ratio' bracket.
# Right panel: median engagement per bracket.
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
data['county_connections_ratio'].value_counts().plot(kind='bar', title='number of records')

plt.subplot(1,2,2)
# sns.barplot aggregates with the mean by default; the median is requested
# explicitly so that the bars match the 'median engagement' title.
sns.barplot(data=data, x="county_connections_ratio", y="engagement_index", estimator=np.median).set(title='median engagement')

In [None]:
# Number of records in each 'county_connections_ratio' bracket.
data['county_connections_ratio'].value_counts()

* The engagement is higher in areas with higher connection ratio, however there are very few records from districts with connection ratio above 1. 

# **Conclusion**

The Covid-19 impact on digital learning is mostly visible in the first three months of the year. Later on, the engagement is determined more by the course of the school year, as it drops when the holidays start and rises again when the school year starts. The pandemic seems to have made online learning more popular. As we can see looking at the last few months of 2020, the engagement remains high even though there were significantly fewer Covid-19 cases in the United States. It seems that the interest in digital learning won't drop much in the near future.

An interesting thing to notice is that the engagement is the highest in rural areas and districts that are poorer and dominated by Black and/or Hispanic students.