# Exploratory Data Analysis of COVID-19 Impact on Digital Learning

## Importing libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import Markdown, display, Image, display_html
from scipy.stats import skew

## District information data

The district file `districts_info.csv` includes information about the characteristics of school districts, including data from NCES (2018-19), FCC (Dec 2018), and Edunomics Lab. In this data set, identifiable information about the school districts has been removed. An open source tool, ARX (Prasser et al. 2020), has been used to transform several data fields and reduce the risk of re-identification. For data generalization purposes, some data points are released as a range within which the actual value falls. Additionally, many values are missing and marked as 'NaN', indicating that the data was suppressed to maximize anonymization of the dataset.

| Name                   | Description                                                                                                                                                                                                                                                                              |
|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| district_id            | The unique identifier of the school district                                                                                                                                                                                                                                             |
| state                  | The state where the district resides in                                                                                                                                                                                                                                                  |
| locale                 | NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See Locale Boundaries User's Manual for more information.                                                                                                          |
| pct_black/hispanic     | Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data                                                                                                                                                                                       |
| pct_free/reduced       | Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data                                                                                                                                                                              |
| county_connections_ratio | ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version). See FCC data for more information.                                                                         |
| pp_total_raw           | Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district. |

In [None]:
# Load the district characteristics file and preview its first rows.
districts_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
districts_df.head()

### General statistics

In [None]:
# rows and columns in the df
districts_df.shape

In [None]:
districts_df.info()

### Duplicates

In [None]:
def drop_duplicates(df):
    """Remove duplicate rows from ``df`` in place and report the outcome.

    The frame is modified directly and nothing is returned; a one-line
    summary of how many rows were dropped is printed instead.
    """
    rows_before = len(df)
    df.drop_duplicates(inplace=True)
    removed = rows_before - len(df)

    if removed:
        print(f"{removed} duplicate rows were found and removed.")
        return

    print("No duplicate rows were found.")

In [None]:
drop_duplicates(districts_df)

### Missing values

In [None]:
# UTILITY FUNCTIONS

def percent_missing_values(df):
    """Print the share of missing cells in ``df`` as a percentage.

    Args:
        df: DataFrame to inspect.

    The percentage is rounded to two decimals and written to standard
    output; nothing is returned. A frame with no cells is reported as
    0.0 % instead of dividing by zero.
    """
    # ``DataFrame.size`` is rows * columns. It replaces ``np.product``,
    # which was deprecated in NumPy 1.25 and removed in NumPy 2.0.
    total_cells = df.size

    # Sum the per-column missing counts into a single total.
    total_missing = int(df.isnull().sum().sum())

    # Guard the division for a frame that has no cells at all.
    percent = (total_missing / total_cells) * 100 if total_cells else 0.0

    print("The dataset contains", round(percent, 2), "%", "missing values.")
    
def count_missing_rows(df):
    """Print how many rows of ``df`` contain at least one missing value.

    Args:
        df: DataFrame to inspect.

    The count and its percentage of all rows (rounded to two decimals) are
    written to standard output; nothing is returned. A frame with no rows
    is reported as 0 rows (0.0%) instead of raising ZeroDivisionError.
    """
    total_rows = df.shape[0]

    # Row-wise "any value missing" check done in one vectorised pass
    # rather than a Python-level loop over ``iterrows``.
    missing_rows = int(df.isna().any(axis=1).sum())

    # Guard the division for a frame that has no rows.
    percent = round((missing_rows / total_rows) * 100, 2) if total_rows else 0.0

    print(f"{missing_rows} rows({percent}%) contain atleast one missing value.")

# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    A short report (total column count and number of columns with missing
    values) is printed. The returned frame has one row per affected column,
    ordered by descending percentage of missing values, with the columns
    'Columns', 'Missing Values', '% of Missing Values', 'Unique Values'
    and 'Dtype'. ``None`` is returned when no column has missing values.
    """
    null_counts = df.isnull().sum()

    # One row per column of ``df``: missing count, missing share,
    # number of distinct values and dtype.
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        '% of Missing Values': 100 * null_counts / len(df),
        'Unique Values': df.nunique(),
        'Dtype': df.dtypes,
    })

    # Keep only columns that actually have gaps, worst first, and expose
    # the column names as a regular column.
    has_missing = summary['Missing Values'] != 0
    summary = (
        summary.loc[has_missing]
        .sort_values('% of Missing Values', ascending=False)
        .round(2)
        .reset_index()
        .rename(columns={'index': 'Columns'})
    )

    affected = summary.shape[0]
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(affected) + " columns that have missing values.")

    # Nothing to report when every column is complete.
    return summary if affected else None

In [None]:
percent_missing_values(districts_df)
count_missing_rows(districts_df)

In [None]:
missing_values_table(districts_df)

From the missing-values table we can observe a pattern: the last 3 columns have the same number of missing values. This suggests that we should check whether all the missing values of these 3 columns occur together in the same rows.

In [None]:
# number of rows with missing values for the whole dataset
count_missing_rows(districts_df)

In [None]:
# number of rows with missing values for the group('state', 'locale', 'pct_black/hispanic')
count_missing_rows(districts_df[['state', 'locale', 'pct_black/hispanic']])

As predicted, these 57 rows contain all the missing values from the columns 'state', 'locale', and 'pct_black/hispanic'. Since so many values are missing in the same rows, we will remove them.

In [None]:
# Keep only the districts whose state is known, then renumber the rows from zero.
districts_df = districts_df.dropna(subset=['state']).reset_index(drop=True)

Let's check the updated missing values table.

In [None]:
missing_values_table(districts_df)

Let's see how often each unique value occurs in each of these columns.

In [None]:
districts_df.pp_total_raw.value_counts()

In [None]:
districts_df['pct_free/reduced'].value_counts()

In [None]:
districts_df.county_connections_ratio.value_counts()

I will impute missing values in the columns `pp_total_raw` and `pct_free/reduced` based on the mode value of the same `state` and/or `locale`. This is because I assume that there is some connection between the values of `pp_total_raw` and `pct_free/reduced` that are in the same `state` and/or `locale`. For the column `county_connections_ratio`, we will impute its missing values by its mode, as every value except one is equal to the mode.

In [None]:
# UTILITY FUNCTIONS

def fix_missing_mode(df, cols):
    """Fill missing values in each listed column with that column's mode.

    Args:
        df: DataFrame to modify; the listed columns are reassigned on it.
        cols: Iterable of column names to impute.

    For every column a one-line summary is printed. String modes are shown
    in single quotes. A column without any non-missing value has no mode;
    it is left unchanged and reported instead of raising IndexError.
    Nothing is returned.
    """
    for col in cols:
        modes = df[col].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to impute with.
            print(f"The column {col} has no non-missing values; nothing was imputed.")
            continue

        mode = modes[0]
        count = df[col].isna().sum()
        df[col] = df[col].fillna(mode)

        # isinstance is required here: comparing ``type(mode)`` with the
        # string 'str' is always False, which made the quoted branch dead.
        if isinstance(mode, str):
            print(f"{count} missing values in the column {col} have been replaced by its mode value '{mode}'.")
        else:
            print(f"{count} missing values in the column {col} have been replaced by its mode value {mode}.")

def fix_missing_value(df, col, value):
    """Fill missing values in one column of ``df`` with a fixed value.

    Args:
        df: DataFrame to modify; ``col`` is reassigned on it.
        col: Name of the column to impute.
        value: Replacement used for every missing entry.

    A one-line summary is printed, with string replacements shown in
    single quotes. Nothing is returned.
    """
    count = df[col].isna().sum()
    df[col] = df[col].fillna(value)

    # isinstance is required here: comparing ``type(value)`` with the
    # string 'str' is always False, which made the quoted branch dead.
    if isinstance(value, str):
        print(f"{count} missing values in the column {col} have been replaced by '{value}'.")
    else:
        print(f"{count} missing values in the column {col} have been replaced by {value}.")

In [None]:
fix_missing_mode(districts_df, ['county_connections_ratio'])

In [None]:
missing_values_table(districts_df)

In [None]:
# Most frequent non-missing `pp_total_raw` value for each locale; NaN when a
# locale has no recorded value at all (avoids an IndexError on an empty
# `value_counts()`).
locale_df = districts_df.groupby('locale')['pp_total_raw'].agg(
    lambda x: x.value_counts().index[0] if x.notna().any() else np.nan
)

# Fill every missing entry with its locale's most frequent value in one
# vectorised step, instead of writing to the frame while iterating over it
# with `iterrows`.
districts_df['pp_total_raw'] = districts_df['pp_total_raw'].fillna(
    districts_df['locale'].map(locale_df)
)

In [None]:
# Most frequent non-missing `pct_free/reduced` value for each locale; NaN
# when a locale has no recorded value at all (avoids an IndexError on an
# empty `value_counts()`).
locale_df = districts_df.groupby('locale')['pct_free/reduced'].agg(
    lambda x: x.value_counts().index[0] if x.notna().any() else np.nan
)

# Fill every missing entry with its locale's most frequent value in one
# vectorised step, instead of writing to the frame while iterating over it
# with `iterrows`.
districts_df['pct_free/reduced'] = districts_df['pct_free/reduced'].fillna(
    districts_df['locale'].map(locale_df)
)

## Product information data

The product file `products_info.csv` includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by our team. Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

| Name                       | Description                                                                                                                                                                                                                                                                                                                    |
|----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LP ID                      | The unique identifier of the product                                                                                                                                                                                                                                                                                           |
| URL                        | Web Link to the specific product                                                                                                                                                                                                                                                                                               |
| Product Name               | Name of the specific product                                                                                                                                                                                                                                                                                                   |
| Provider/Company Name      | Name of the product provider                                                                                                                                                                                                                                                                                                   |
| Sector(s)                  | Sector of education where the product is used                                                                                                                                                                                                                                                                                  |
| Primary Essential Function | The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories have multiple sub-categories with which the products were labeled |

In [None]:
# Load the product characteristics file and preview its first rows.
products_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")
products_df.head()

### General statistics

In [None]:
# rows and columns in the df
products_df.shape

In [None]:
products_df.info()

### Duplicates

In [None]:
drop_duplicates(products_df)

### Missing values

In [None]:
percent_missing_values(products_df)
count_missing_rows(products_df)

In [None]:
missing_values_table(products_df)

## Engagement data

The engagement data are aggregated at school district level, and each file in the folder `engagement_data` represents data from one school district. The 4-digit file name represents `district_id`, which can be used to link to district information in `districts_info.csv`. The `lp_id` can be used to link to product information in `products_info.csv`.

| Name             | Description                                                                                                    |
|------------------|----------------------------------------------------------------------------------------------------------------|
| time             | date in "YYYY-MM-DD"                                                                                           |
| lp_id            | The unique identifier of the product                                                                           |
| pct_access       | Percentage of students in the district have at least one page-load event of a given product and on a given day |
| engagement_index | Total page-load events per one thousand students of a given product and on a given day                         |

For better analysis we will concatenate all the dataframes read from the folder `engagement_data` vertically into a single dataframe.

In [None]:
# Folder holding the per-district engagement CSV files.
PATH = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'

temp = []

for district in districts_df.district_id.unique():
    # Read the district's file and tag every record with its district id.
    df = pd.read_csv(f'{PATH}/{district}.csv', index_col=None, header=0).assign(district_id=district)
    temp.append(df)

# Stack the per-district frames into one frame with a fresh 0..n-1 index.
engagement_df = pd.concat(temp, ignore_index=True)
engagement_df.sample(5)

In [None]:
engagement_df