In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

## Fetch Data

In [None]:
# Root folder of the competition dataset. The trailing slash matters: file and
# folder names are appended to this string by plain concatenation.
path = '/kaggle/input/learnplatform-covid19-impact-on-digital-learning/'

### District information data

The district file `districts_info.csv` includes information about the characteristics of school districts, including data from [NCES](https://nces.ed.gov/) (2018-19), [FCC](https://www.fcc.gov/) (Dec 2018), and [Edunomics Lab](https://edunomicslab.org/). In this data set, we removed the identifiable information about the school districts. We also used an open source tool [ARX](https://arx.deidentifier.org/) [(Prasser et al. 2020)](https://onlinelibrary.wiley.com/doi/full/10.1002/spe.2812) to transform several data fields and reduce the risks of re-identification. For data generalization purposes some data points are released with a range where the actual value falls under. Additionally, there are many missing data marked as 'NaN' indicating that the data was suppressed to maximize anonymization of the dataset. 

| Name | Description |
| :--- | :----------- |
| district_id | The unique identifier of the school district |
| state | The state in which the district is located |
| locale | NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See [Locale Boundaries User's Manual](https://eric.ed.gov/?id=ED577162) for more information. |
| pct_black/hispanic | Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data |
| pct_free/reduced | Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data |
| county_connections_ratio | `ratio` (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version). See [FCC data](https://www.fcc.gov/form-477-county-data-internet-access-services) for more information. |
| pp_total_raw | Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district. |

In [None]:
# Load the per-district characteristics table and preview its first rows.
districts_csv = path + 'districts_info.csv'
districts_info = pd.read_csv(districts_csv)
districts_info.head()

### Product information data
The product file `products_info.csv` includes information about the characteristics of the top 372 products with the most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by our team. Some products may not have labels because they are duplicates, lack an accurate URL, or for other reasons.

| Name | Description |
| :--- | :----------- |
| LP ID| The unique identifier of the product |
| URL | Web Link to the specific product |
| Product Name | Name of the specific product |
| Provider/Company Name | Name of the product provider |
| Sector(s) | Sector of education where the product is used |
| Primary Essential Function | The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories has multiple sub-categories with which the products were labeled |



In [None]:
# Load the product catalogue table and preview its first rows.
products_csv = path + 'products_info.csv'
products_info = pd.read_csv(products_csv)
products_info.head()

### Engagement data
The engagement data are aggregated at school district level, and each file in the folder `engagement_data` represents data from one school district. The 4-digit file name represents `district_id` which can be used to link to district information in `districts_info.csv`. The `lp_id` can be used to link to product information in `products_info.csv`.

| Name | Description |
| :--- | :----------- |
| time | date in "YYYY-MM-DD" |
| lp_id | The unique identifier of the product |
| pct_access | Percentage of students in the district who have at least one page-load event of a given product on a given day |
| engagement_index | Total page-load events per one thousand students of a given product and on a given day |


In [None]:
def load_dataframes(path):
    """Read every CSV file in a folder into its own DataFrame.

    Parameters
    ----------
    path : str
        Folder containing the per-district CSV files. A trailing separator is
        accepted but not required.

    Returns
    -------
    dict[str, pd.DataFrame]
        Maps the part of each file name before its first dot (the district id
        for files such as ``1000.csv``) to the loaded frame. Every frame gets an
        extra ``district`` column holding that same identifier. Keys are inserted
        in sorted file-name order so that "the first district" is reproducible.
    """
    dataframes = {}
    # os.listdir returns entries in arbitrary order; sort for run-to-run stability.
    for file in sorted(os.listdir(path)):
        full_path = os.path.join(path, file)
        # Skip sub-directories and stray non-CSV files instead of failing on them.
        if not os.path.isfile(full_path) or not file.lower().endswith('.csv'):
            continue
        district = file.split('.')[0]
        csv = pd.read_csv(full_path)
        # Scalar assignment broadcasts to every row (and is a no-op on an empty frame).
        csv['district'] = district
        dataframes[district] = csv
    return dataframes
# One DataFrame per district file in the engagement_data folder, keyed by the
# part of the file name before its first dot.
data = load_dataframes(path + 'engagement_data/')

### Missing Data

#### Missing Districts Info Data

In [None]:
# Per-column NaN counts as a two-column frame: 'index' holds the column name,
# 0 holds the count.
di_missing_table = districts_info.isna().sum().reset_index()
plt.figure(figsize=(12,7))
plt.title("Missing Districts Info Data", {'fontsize': 20})
sns.barplot(data=di_missing_table, y='index', x=0)

In [None]:
# For every column of districts_info, report how many values are missing and
# what share of all rows that represents.
di_missing = districts_info.isna().sum()
di_total_rows = len(districts_info)
for col in di_missing.index:
    n_missing = di_missing[col]
    pct_missing = round(n_missing*100/di_total_rows,2)
    print(f"The number of missing values in the {col} column is {n_missing} which is {pct_missing}%")

#### Missing Products Info Data

In [None]:
# Per-column NaN counts as a two-column frame: 'index' holds the column name,
# 0 holds the count.
pi_missing_table = products_info.isna().sum().reset_index()
plt.figure(figsize=(12,7))
plt.title("Missing Products Info Data", {'fontsize': 20})
sns.barplot(data=pi_missing_table, y='index', x=0)

In [None]:
# For every column of products_info, report how many values are missing and
# what share of all rows that represents.
pi_missing = products_info.isna().sum()
pi_total_rows = len(products_info)
for col in pi_missing.index:
    n_missing = pi_missing[col]
    pct_missing = round(n_missing*100/pi_total_rows,2)
    print(f"The number of missing values in the {col} column is {n_missing} which is {pct_missing}%")

#### Missing Engagement Data

In [None]:
# Accumulate row counts and per-column NaN counts over all district frames,
# then chart the totals.
district_frames = list(data.values())
total_rows = sum(len(frame) for frame in district_frames)
# Seed the running total with the first frame, then add the remaining ones.
missing = district_frames[0].isna().sum()
for frame in district_frames[1:]:
    missing += frame.isna().sum()
plt.figure(figsize=(12,7))
plt.title("Missing Enggagement Data", {'fontsize': 20})
sns.barplot(data=missing.reset_index(), y='index', x=0)

In [None]:
print("The total number of rows in the dataset is", total_rows)

In [None]:
# Report the missing-value count and share for each engagement column.
for col in missing.index:
    n_missing = missing[col]
    pct_missing = round(n_missing*100/total_rows,2)
    print(f"The number of missing values in the {col} column is {n_missing} which is {pct_missing}%")

## Univariate analysis

In [None]:
# Use the first district in the dict as a single-district sample.
district = list(data)[0]
sample_df = data[district]

In [None]:
sample_df[['pct_access','engagement_index']].describe()

## Visualization

In [None]:
def plot_hist(df:pd.DataFrame, column:str, color:str)->None:
    """Draw a 100-bin histogram with a KDE overlay for one column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the data to plot.
    column : str
        Name of the column placed on the x-axis.
    color : str
        Colour passed to seaborn for the bars and KDE line.
    """
    # displot is a figure-level function: it builds its own figure, sized by
    # height/aspect. A preceding plt.figure() would only leave an empty figure
    # behind in the cell output, so none is created here.
    sns.displot(data=df, x=column, color=color, bins = 100, kde=True, height=7, aspect=2)
    plt.title(f'Distribution of {column}', size=20, fontweight='bold')
    plt.show()

def plot_count(df:pd.DataFrame, column:str) -> None:
    """Draw a count plot of the values in ``column`` on a 12x7 figure.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the data to plot.
    column : str
        Name of the column whose values are counted.
    """
    chart_title = f'Distribution of {column}'
    plt.figure(figsize=(12, 7))
    sns.countplot(
        data=df,
        x=column,
        orient='h',
    )
    plt.title(chart_title, size=20, fontweight='bold')
    plt.show()

def plot_correlation(df:pd.DataFrame, title:str) -> None:
    """Show the correlation matrix of the numeric columns of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; non-numeric columns are ignored.
    title : str
        Title drawn above the matrix.
    """
    # Correlate exactly the columns that label the axes. Calling corr() on the
    # full frame fails on non-numeric columns in recent pandas and could
    # otherwise disagree with the tick labels.
    numeric_df = df.select_dtypes(['number'])
    n_cols = numeric_df.shape[1]

    f = plt.figure(figsize=(19, 15))
    plt.matshow(numeric_df.corr(), fignum=f.number)
    plt.xticks(range(n_cols), numeric_df.columns, fontsize=14, rotation=45)
    plt.yticks(range(n_cols), numeric_df.columns, fontsize=14)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=14)
    # Honour the caller-supplied title instead of a hard-coded one.
    plt.title(title, fontsize=16)

In [None]:
# Drop the top 5% of values so the long right tail does not flatten the histogram.
pct_access_cap = sample_df['pct_access'].quantile(0.95)
plot_hist(sample_df[sample_df['pct_access'] <= pct_access_cap], 'pct_access', 'blue')

In [None]:
# Drop the top 5% of values so the long right tail does not flatten the histogram.
engagement_index_cap = sample_df['engagement_index'].quantile(0.95)
plot_hist(sample_df[sample_df['engagement_index'] <= engagement_index_cap], 'engagement_index', 'blue')

## Merged DataFrame

In [None]:
# Stack all district frames into one long frame (each frame keeps its own row
# index) and show the total row count.
super_df = pd.concat(list(data.values()))
len(super_df)

In [None]:
def month(x):
    """Return the three-letter month abbreviation for a 'YYYY-MM-DD' string."""
    month_number = int(x.split('-')[1])
    abbreviations = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
    return abbreviations[month_number - 1]
def day(x):
    """Return the day of the month, as an int, for a 'YYYY-MM-DD' string."""
    day_field = x.split('-')[2]
    return int(day_field)
# Derive calendar features from the 'time' string of every row.
time_strings = super_df['time']
super_df['month'] = time_strings.map(month)
super_df['day'] = time_strings.map(day)

In [None]:
super_df.head()

In [None]:
super_df[['pct_access', 'engagement_index']].describe().round(2)

In [None]:
# Trim the top 5% of engagement values, then plot a random subset to keep the
# histogram fast to render.
engagement_cap = super_df['engagement_index'].quantile(0.95)
engagement_trimmed = super_df[super_df['engagement_index'] <= engagement_cap]
# Seed the draw so the figure is reproducible, and never request more rows than
# exist (sample() raises in that case).
engagement_sample = engagement_trimmed.sample(n=min(10000, len(engagement_trimmed)), random_state=42)
plot_hist(engagement_sample, 'engagement_index', 'blue')

## Plots

In [None]:
# Number of districts with a non-missing id in each locale category.
school_locale = (
    districts_info
    .groupby('locale')['district_id']
    .count()
    .reset_index()
)
school_locale

In [None]:
# Pie chart of the per-locale district counts computed in school_locale.
locale = school_locale['locale'].tolist()
num_schools = school_locale['district_id'].tolist()

fig = plt.figure(figsize=(12,7))
ax = fig.add_axes([0,0,1,1])
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax.pie(num_schools, labels=locale, autopct='%1.2f%%')
plt.title("Distribution of schools by locale", {'fontsize':20})
plt.show()

In [None]:
# Number of districts with a non-missing id in each state, as a horizontal bar chart.
school_state = (
    districts_info
    .groupby('state')['district_id']
    .count()
    .reset_index()
)
plt.figure(figsize=(12,7))
plt.title("Distribution of schools by state", {'fontsize':20})
sns.barplot(data=school_state, y='state', x='district_id')

In [None]:
# The 20 providers with the most products in the catalogue, in ascending order.
companies_per_product = (
    products_info[['Provider/Company Name','Product Name']]
    .groupby('Provider/Company Name')
    .count()
    .sort_values('Product Name')
    .tail(20)
    .reset_index()
)
plt.figure(figsize=(12,7))
plt.title("Distribution of Products per Company", {'fontsize':20})
sns.barplot(data=companies_per_product, y='Provider/Company Name', x='Product Name')

In [None]:
# s1: product ids seen in the engagement data; s2: product ids listed in the catalogue.
s1 = set(super_df['lp_id'].dropna().apply(int))
s2 = set(products_info['LP ID'].dropna())
unused_products = s2 - s1
print("Products that exist in the products_info but not in the engagement_data are", unused_products)

In [None]:
# Lookup table from product id to product name.
lp, name = list(products_info['LP ID']), list(products_info['Product Name'])
lp_to_name = {int(product_id): product_name for product_id, product_name in zip(lp, name)}

# The 20 products with the most engagement rows, in ascending order.
most_popular_products = (
    super_df[['lp_id','time']]
    .groupby('lp_id')
    .count()
    .sort_values('time')
    .tail(20)
    .reset_index()
)

# Keep only products that can be resolved to a name; unknown ids are dropped.
decoded_ids = []
number_of_rows = []
popular_ids = list(most_popular_products['lp_id'])
popular_counts = list(most_popular_products['time'])
for product_id, row_count in zip(popular_ids, popular_counts):
    product_key = int(product_id)
    if product_key in lp_to_name:
        decoded_ids.append(lp_to_name[product_key])
        number_of_rows.append(row_count)

In [None]:
# Horizontal bars: one per resolved product name, bar length = number of
# engagement rows recorded for that product.
plt.figure(figsize=(12,7))
plt.title("Distribution of Data Points per Product", {'fontsize':20})
sns.barplot(y = decoded_ids, x = number_of_rows)

### What is the picture of digital connectivity and engagement in 2020?

In [None]:
# Ten districts with the highest mean pct_access.
# numeric_only=True is required: super_df also holds string columns ('time',
# 'month'), and pandas >= 2.0 raises TypeError when asked to average them.
percent_page_load = (
    super_df.groupby('district')
    .mean(numeric_only=True)
    .sort_values('pct_access', ascending = False)
    .head(10)
    .reset_index()
)

plt.figure(figsize=(12,7))
plt.title("Districts with largest percentage of page loads", {'fontsize':20})
sns.barplot(x = percent_page_load['district'], y = percent_page_load['pct_access'])

In [None]:
# Ten districts with the highest mean engagement_index.
# numeric_only=True is required: super_df also holds string columns ('time',
# 'month'), and pandas >= 2.0 raises TypeError when asked to average them.
most_engaged = (
    super_df.groupby('district')
    .mean(numeric_only=True)
    .sort_values('engagement_index', ascending = False)
    .head(10)
    .reset_index()
)
plt.figure(figsize=(12,7))
plt.title("Most Engaged districts", {'fontsize':20})
sns.barplot(x = most_engaged['district'], y = most_engaged['engagement_index'])

In [None]:
# Mean pct_access across all districts and products for each calendar day.
# numeric_only=True skips the string columns ('district', 'month') that make
# pandas >= 2.0 raise TypeError in mean().
percpl_day = super_df.groupby('time', sort = False).mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Percentage of page loads per day", {'fontsize':20})
sns.lineplot(data=percpl_day, x="time", y="pct_access")

In [None]:
# Mean engagement_index across all districts and products for each calendar day.
# numeric_only=True skips the string columns ('district', 'month') that make
# pandas >= 2.0 raise TypeError in mean().
eng_day = super_df.groupby('time', sort = False).mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
# The title previously repeated the pct_access caption; this chart shows engagement.
plt.title("Engagement per day", {'fontsize':20})
sns.lineplot(data=eng_day, x="time", y="engagement_index")

In [None]:
# Mean pct_access across all districts and products for each month.
# numeric_only=True skips the string columns ('time', 'district') that make
# pandas >= 2.0 raise TypeError in mean().
percpl_month = super_df.groupby('month', sort = False).mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Percentage of page loads per month", {'fontsize':20})
sns.lineplot(data=percpl_month, x="month", y="pct_access")

In [None]:
# Mean engagement_index across all districts and products for each month.
# numeric_only=True skips the string columns ('time', 'district') that make
# pandas >= 2.0 raise TypeError in mean().
eng_month = super_df.groupby('month', sort = False).mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Engagement per month", {'fontsize':20})
sns.lineplot(data=eng_month, x="month", y="engagement_index")

## Correlation

In [None]:
# Correlation between the two engagement metrics, using only rows where both are present.
metric_corr = super_df[['pct_access','engagement_index']].dropna().corr()
sns.heatmap(metric_corr, annot=True, cmap='coolwarm')

# <span style = ' color:Blue; font-family: arial'>Answers to Challenge Questions </span>

### 1. What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?

In [None]:
# Mean engagement_index across all districts and products for each month.
# numeric_only=True skips the string columns ('time', 'district') that make
# pandas >= 2.0 raise TypeError in mean().
eng_month = super_df.groupby('month', sort = False).mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Engagement per month", {'fontsize':20})
sns.lineplot(data=eng_month, x="month", y="engagement_index")

The engagement index kept increasing after the pandemic started. It started decreasing in June and July, when most schools are out, and then increased again from September.

### 2. How does student engagement with different types of education technology change over the course of the pandemic?

##### <i>Here are the products with the largest average engagement index</i>

In [None]:
# Five products with the highest mean engagement_index.
# numeric_only=True skips the string columns ('time', 'district', 'month') that
# make pandas >= 2.0 raise TypeError in mean().
super_df.groupby('lp_id').mean(numeric_only=True).sort_values('engagement_index', ascending  = False).head(5)

In [None]:
# Ids of the five products with the highest mean engagement_index.
# numeric_only=True skips the string columns that make pandas >= 2.0 raise
# TypeError in mean().
l = (
    super_df.groupby('lp_id')
    .mean(numeric_only=True)
    .sort_values('engagement_index', ascending  = False)
    .head()
    .reset_index()['lp_id']
)

plt.figure(figsize=(12,7))
plt.title("Engagement with Most Popular Products", {'fontsize' : 20} )
# One monthly-mean line per product, all on the same axes.
for product_id in l:
    p1 = super_df[super_df['lp_id'] == product_id].groupby('month', sort = False).mean(numeric_only=True).reset_index()
    # Label each line with the product name (falling back to the raw id) so the
    # five series can be told apart in the legend.
    product_label = lp_to_name.get(int(product_id), str(int(product_id)))
    sns.lineplot(data=p1, x="month", y="engagement_index", label=product_label)

### 3. How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?

In [None]:
# Mean engagement_index per district, with the district key renamed and cast to
# int so it can be matched against districts_info.district_id.
eng = super_df.groupby('district')['engagement_index'].mean().reset_index()
eng = eng.rename(columns={'district': 'district_id'})
eng['district_id'] = eng['district_id'].apply(int)

# Attach the district characteristics to the engagement averages.
districts_by_id = districts_info.set_index('district_id')
engagement_by_id = eng.set_index('district_id')
district_engagement = pd.merge(districts_by_id, engagement_by_id, on = 'district_id')
district_engagement

In [None]:
# Mean of the per-district engagement averages for each state.
# numeric_only=True is required: the remaining district_engagement columns are
# strings, and pandas >= 2.0 raises TypeError when asked to average them.
engagement_by_state = district_engagement.groupby('state').mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Engagement By State", {'fontsize':20})
sns.barplot(data = engagement_by_state, y = 'state', x = 'engagement_index')

In [None]:
# Mean of the per-district engagement averages for each locale.
# numeric_only=True is required: the remaining district_engagement columns are
# strings, and pandas >= 2.0 raises TypeError when asked to average them.
engagement_by_locale = district_engagement.groupby('locale').mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Engagement By Locale", {'fontsize':20})
sns.barplot(data = engagement_by_locale, y = 'locale', x = 'engagement_index')

In [None]:
# Mean of the per-district engagement averages for each pct_black/hispanic bracket.
# numeric_only=True is required: the remaining district_engagement columns are
# strings, and pandas >= 2.0 raises TypeError when asked to average them.
engagement_by_black_hispanic = district_engagement.groupby('pct_black/hispanic').mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Engagement By Demography: Percentage of Black/Hispanic", {'fontsize':20})
sns.barplot(data = engagement_by_black_hispanic, y = 'pct_black/hispanic', x = 'engagement_index')

In [None]:
# Mean of the per-district engagement averages for each pct_free/reduced bracket.
# numeric_only=True is required: the remaining district_engagement columns are
# strings, and pandas >= 2.0 raises TypeError when asked to average them.
engagement_by_free_reduced = district_engagement.groupby('pct_free/reduced').mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Engagement By Socio-Economic Status: Percentage of students with free/reduced price lunch", {'fontsize':20})
sns.barplot(data = engagement_by_free_reduced, y = 'pct_free/reduced', x = 'engagement_index')

In [None]:
# Mean of the per-district engagement averages for each county_connections_ratio bracket.
# numeric_only=True is required: the remaining district_engagement columns are
# strings, and pandas >= 2.0 raises TypeError when asked to average them.
engagement_by_connections = district_engagement.groupby('county_connections_ratio').mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Engagement By Socio-Economic Status: County Connections Ratio", {'fontsize':20})
sns.barplot(data = engagement_by_connections, y = 'county_connections_ratio', x = 'engagement_index')

In [None]:
# Mean of the per-district engagement averages for each pp_total_raw bracket.
# numeric_only=True is required: the remaining district_engagement columns are
# strings, and pandas >= 2.0 raises TypeError when asked to average them.
engagement_by_expenditure = district_engagement.groupby('pp_total_raw').mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Engagement By Socio-Economic Status: Per Pupil Total Expenditure", {'fontsize':20})
sns.barplot(data = engagement_by_expenditure, y = 'pp_total_raw', x = 'engagement_index')

### 4. Do certain state interventions, practices or policies (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?

#### <i>Let's select districts 7975 and 7164 from California</i>

In [None]:
# Monthly mean metrics for the two selected districts.
# numeric_only=True skips the string columns ('time', 'district') that make
# pandas >= 2.0 raise TypeError in mean().
cali1 = super_df[super_df['district'] == '7975'].groupby('month', sort = False).mean(numeric_only=True).reset_index()
cali2 = super_df[super_df['district'] == '7164'].groupby('month', sort = False).mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Monthly Engagement California", {'fontsize':20})
# Label each line with its district id so the legend tells the two series apart.
sns.lineplot(data=cali1, x="month", y="engagement_index", label='7975')
sns.lineplot(data=cali2, x="month", y="engagement_index", label='7164')

In [None]:
# Monthly mean metrics for the two selected districts.
# numeric_only=True skips the string columns ('time', 'district') that make
# pandas >= 2.0 raise TypeError in mean().
cali1 = super_df[super_df['district'] == '7975'].groupby('month', sort = False).mean(numeric_only=True).reset_index()
cali2 = super_df[super_df['district'] == '7164'].groupby('month', sort = False).mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,7))
plt.title("Monthly Access of Educational Tools California", {'fontsize':20})
# Label each line with its district id so the legend tells the two series apart.
sns.lineplot(data=cali1, x="month", y="pct_access", label='7975')
sns.lineplot(data=cali2, x="month", y="pct_access", label='7164')

The State of California announced the shutdown of schools on March 13.