### Learnplatform Covid19 Impact on Digital Learning

##### This notebook is divided into two main parts

1. Data Pre-processing
2. Exploratory Data Analysis (EDA)

## Data Pre-processing

#### Data Reading and Pre-processing

Importing Required packages and Libraries

In [None]:
# Importing necessary packages

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re

import warnings
warnings.filterwarnings("ignore")


### Paths to the dataset files and folders

In [None]:
# Relative locations of the two metadata CSV files.
districts_data_path = "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
products_data_path = "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
# This one is a directory, not a file: the loading loop reads "<district_id>.csv" from inside it.
engagements_data_path = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data"

### Loading datasets

In [None]:
# Read the two metadata tables, then print each frame's schema and non-null counts
# (products first, districts second).
products_data = pd.read_csv(products_data_path)
districts_data = pd.read_csv(districts_data_path)

for frame in (products_data, districts_data):
    frame.info()

#### Showing sample data

In [None]:
# Show 4 randomly drawn product rows; no random_state is passed, so the rows differ between runs.
products_data.sample(4)

In [None]:
# Show 4 randomly drawn district rows; no random_state is passed, so the rows differ between runs.
districts_data.sample(4)

#### Calculating the Missing Values, NA

In [None]:
# Functions to calculate missing values and to drop columns

def drop_columns(self, df, column_list):
    """Return a new DataFrame equal to ``df`` without the given columns.

    Parameters
    ----------
    self : any
        Accepted for signature compatibility; never read by the function.
    df : pandas.DataFrame
        Frame to drop columns from. It is not modified.
    column_list : label or list of labels
        Column name(s) to remove.

    Returns
    -------
    pandas.DataFrame
        ``df`` minus the requested columns.
    """
    return df.drop(columns=column_list)

def total_percent_missing_data(df):
    """Return the percentage of missing cells in ``df``, rounded to 2 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    float
        ``100 * missing_cells / total_cells``; ``0.0`` when the frame has no
        cells at all (instead of NaN from a 0/0 division).
    """
    # DataFrame.size is rows * columns. It replaces np.product, an alias that
    # was removed in NumPy 2.0.
    total_cells = df.size
    if total_cells == 0:
        return 0.0

    # First sum gives missing values per column, second sum the grand total.
    total_missing = df.isnull().sum().sum()

    return round((total_missing / total_cells) * 100, 2)


def missing_data_per_column(df):
    """Summarise the missing values of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``'Column'``,
        ``'No. of Missing Values'`` and ``'% Missing Values per column'``
        (a string such as ``"12.5 %"``).
    """
    new_columns = ['Column', 'No. of Missing Values', '% Missing Values per column']
    # The percentage is taken over the full row count. The previous
    # `df.shape[0] - 1` denominator inflated every percentage.
    n_rows = df.shape[0]

    item_list = []
    for item in df.columns:
        no_missing_values = df[item].isna().sum()
        # A frame without rows has nothing missing; this also avoids dividing by zero.
        share = (no_missing_values / n_rows) * 100 if n_rows else 0.0
        percentage = str(round(share, 2)) + " %"
        item_list.append([item, no_missing_values, percentage])

    return pd.DataFrame(item_list, columns=new_columns)


# Overall percentage of missing cells, and the per-column breakdown, for the districts table.
missing_values = total_percent_missing_data(districts_data)
missing_df = missing_data_per_column(districts_data)

print(f" Summary of Missing Values in districts_data : {missing_values} %")
print("Missing values per column in districts_data")
# Last expression of the cell: rendered as the cell's table output.
missing_df

In [None]:
# Checking for a pattern in the missing values: do the rows that lack a
# 'state' also lack every other attribute (everything except 'district_id')?
state_missing = districts_data['state'].isna()
other_columns = ['locale', 'pct_black/hispanic', 'pct_free/reduced',
                 'county_connections_ratio', 'pp_total_raw']
all_missing = state_missing & districts_data[other_columns].isna().all(axis=1)

nan_rows = districts_data[state_missing & districts_data['locale'].isna()]
# Count ROWS via the boolean masks. The earlier `.count().sum()` added up
# non-null cells, which only equals the row count when every other column is
# already empty, i.e. it assumed the very thing being checked.
nan_rows1 = int(state_missing.sum())
nan_rows2 = int(all_missing.sum())
print("The number of rows with Missing 'state' values and Missing all columns except 'district_id' are: ", nan_rows1, nan_rows2)
# sample() raises ValueError when asked for more rows than exist, so cap the request.
nan_rows.sample(min(10, len(nan_rows)))

It can be concluded that every row with a missing state is also missing locale, pct_black/hispanic, pct_free/reduced, county_connections_ratio and pp_total_raw.

#### Dropping rows having null values

In [None]:
# Keep only the rows whose 'state' is present, renumber the index from 0,
# then print the resulting schema / non-null counts.
districts_data = districts_data.dropna(subset=['state']).reset_index(drop=True)
districts_data.info()

In [None]:
# Overall percentage of missing cells, and the per-column breakdown, for the products table.
missing_values = total_percent_missing_data(products_data)
missing_df = missing_data_per_column(products_data)

print(f" Summary of Missing Values in products_data : {missing_values} %")
print("Missing values per column in products_data")
# Last expression of the cell: rendered as the cell's table output.
missing_df

#### Filling Missing Values in Products_data using Median or Mode 

In [None]:
# Side-by-side bar charts of how often each category occurs in the two
# product columns; each panel is titled with its column name.
fig, axes = plt.subplots(figsize=(16,4), nrows=1, ncols=2)
for axis, column in zip(axes, ["Sector(s)", "Primary Essential Function"]):
    products_data[column].value_counts().plot.bar(ax=axis, title=column)

In [None]:
# Fill missing values in the categorical product columns with each column's
# mode (its most frequent value).
try:
    for column in ["Primary Essential Function", "Sector(s)", "Provider/Company Name"]:
        products_data[column] = products_data[column].fillna(products_data[column].mode()[0])

except Exception as e:
    # The exception was previously bound to `eeeee` while `e` was printed, so
    # any failure here raised a NameError instead of reporting the real error.
    print(e)

In [None]:
# Recompute the per-column missing-value table for products_data and display it.
missing_df = missing_data_per_column(products_data)
missing_df

#### Dealing with Missing Values in Districts_data
Dealing with;

1. pct_free/reduced : percentage of students eligible for free or reduced-price lunch
2. county_connections_ratio : ratio of high internet speeds
3. pp_total_raw : sum of local and federal expenditure per pupil

In [None]:
# Three bar charts, one per district column, showing how often each distinct
# value occurs.
fig, axes = plt.subplots(figsize=(15,4), nrows=1, ncols=3)
for axis, column in zip(axes, ['pct_free/reduced', 'county_connections_ratio', 'pp_total_raw']):
    districts_data[column].value_counts().plot.bar(ax=axis, title=f"districts {column}")

#### Filling the Missing data/Values with Mode

In [None]:
# Fill missing values in the three district columns with each column's mode
# (its most frequent value).
try:
    for column in ['pct_free/reduced', 'county_connections_ratio', 'pp_total_raw']:
        districts_data[column] = districts_data[column].fillna(districts_data[column].mode()[0])

except Exception as e:
    # The exception was previously bound to `eeeee` while `e` was printed, so
    # any failure here raised a NameError instead of reporting the real error.
    print(e)

districts_data.info()

### Pre-Processing Engagement data, joining with other datasets
- Joining the per-district engagement files into one dataset, adding a district_id column to identify each district

Dataset Descriptions;
1. time: date in "YYYY-MM-DD" format
2. lp_id: the unique identifier of the product
3. pct_access: percentage of students in the district who have at least one page-load event of a given product on a given day
4. engagement_index: total page-load events per one thousand students of a given product on a given day

In [None]:
# Build a single engagement table: read "<district_id>.csv" for every district
# id present in districts_data, tag the rows with that id, then stack all the
# frames under a fresh 0..n-1 index.
data_set = [
    pd.read_csv(f'{engagements_data_path}/{district}.csv', index_col=None, header=0)
      .assign(district_id=district)
    for district in districts_data.district_id.unique()
]

engagements_data = pd.concat(data_set, ignore_index=True)

In [None]:
# engagements_data.info()
# Show 10 randomly drawn engagement rows; no random_state is passed, so the rows differ between runs.
engagements_data.sample(10)

#### Checking for Missing Values

In [None]:
# Per-column missing-value table for the combined engagement data.
missing_df = missing_data_per_column(engagements_data)
missing_df

In [None]:
# Summary statistics of the engagement columns, transposed so each column is shown as a row.
engagements_data.describe().T

#### Dealing with "pct_access", and "engagement_index" columns
- Filling the missing pct_access and engagement_index values with the column mean, because both columns hold float values.

In [None]:
# Replace missing values in the two float columns with that column's mean.
# Any error is printed rather than raised, so the cell always reaches info().
try:
    for column in ('pct_access', 'engagement_index'):
        column_mean = engagements_data[column].mean()
        engagements_data[column] = engagements_data[column].fillna(column_mean)

except Exception as e:
    print(e)

engagements_data.info()

In [None]:
# Recompute the per-column missing-value table for engagements_data and display it.
missing_df = missing_data_per_column(engagements_data)
missing_df

#### Checking for "time" column of engagements_dataset

In [None]:
# Map each district_id to the number of distinct 'time' values it has records for.
# One grouped pass replaces re-filtering the whole table once per district,
# and the unused scratch DataFrame is gone.
# sort=False keeps the districts in order of first appearance; dropna=False
# counts a missing date as a value, matching len(Series.unique()).
dict_data = (
    engagements_data
    .groupby('district_id', sort=False)['time']
    .nunique(dropna=False)
    .to_dict()
)


In [None]:
# dict_data.items() yields (district_id, distinct-date count) pairs, so column 0
# holds the ids and column 1 the counts.
# NOTE(review): describe() also summarises column 0; statistics over the ids are
# not meaningful, so read column 1 only.
pd.DataFrame(dict_data.items()).describe()

#### Finding
It can be seen that more than 75% of the districts have engagement records for all 366 days; therefore, for a better analysis, I'll drop the districts that have fewer than 366 days.

In [None]:
# Collect the districts whose records do not span exactly the expected number
# of distinct dates.
EXPECTED_DAYS = 366  # a full leap year of daily records

# One grouped pass instead of re-filtering the whole table once per district.
# sort=False keeps first-appearance order; dropna=False counts a missing date
# as a value, matching len(Series.unique()).
days_per_district = (
    engagements_data
    .groupby('district_id', sort=False)['time']
    .nunique(dropna=False)
)
list_district = days_per_district[days_per_district != EXPECTED_DAYS].index.tolist()
        

#### Dropping these districts and all their records

In [None]:
# Drop every row that belongs to a district listed in list_district.
# The earlier loop filtered with `==`, which KEPT only the listed district on
# each pass, leaving either one incomplete district or an empty table.
# A single negated isin() mask removes all listed districts at once.
engagements_data = engagements_data[
    ~engagements_data.district_id.isin(list_district)
].reset_index(drop=True)

In [None]:
# Confirm, one id at a time, that no row of a dropped district is still present.
for district_id in list_district:
    still_present = engagements_data['district_id'].eq(district_id).any()
    if not still_present:
        print(f"Dropping all rows having district_id: {district_id} successfully")
    else:
        print("Something went wrong in dropping the rows")
    

## Exploratory Data Analysis (EDA)

### Analysis Focus,
1. The state of digital learning in 2020,
2. How the engagement of digital learning relates to factors such as district demographics, broadband access, and state/national level policies and events

#### Districts_data EDA

In [None]:
# Show 3 randomly drawn district rows; no random_state is passed, so the rows differ between runs.
districts_data.sample(3)

#### List of US States and their Abbreviations
us_state_abbrev = {
    'Alabama': 'AL','Alaska': 'AK','American Samoa': 'AS','Arizona': 'AZ','Arkansas': 'AR','California': 'CA',
    'Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','District Of Columbia': 'DC',    'Florida': 'FL',
    'Georgia': 'GA',    'Guam': 'GU',    'Hawaii': 'HI',    'Idaho': 'ID',    'Illinois': 'IL',    'Indiana': 'IN',
    'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD','Massachusetts': 'MA',
    'Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT','Nebraska': 'NE',
    'Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC',
    'North Dakota': 'ND','Northern Mariana Islands':'MP','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA',
    'Puerto Rico': 'PR','Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX',
    'Utah': 'UT','Vermont': 'VT','Virgin Islands': 'VI','Virginia': 'VA','Washington': 'WA','West Virginia': 'WV',
    'Wisconsin': 'WI','Wyoming': 'WY'
}

In [None]:
# Bar charts of the number of districts per state (left) and per locale (right).
fig, axes = plt.subplots(figsize=(15,4), nrows=1, ncols=2)
panels = [('state', "state"), ('locale', "districts locale")]
for axis, (column, chart_title) in zip(axes, panels):
    districts_data[column].value_counts().plot.bar(ax=axis, title=chart_title)

##### Most of the schools are in Connecticut, Utah, Massachusetts, Illinois, California, Ohio, etc., and
##### most of the schools are in Suburbs, followed by Rural, City and lastly Town locales

#### Relationship between 'state' and 'locale'

In [None]:
# 2x2 grid of state-vs-locale scatter plots; each panel colours the points by
# one district attribute.
fig, ax = plt.subplots(2, 2, figsize=(15, 15))
hue_columns = ["pct_black/hispanic", "pct_free/reduced",
               "county_connections_ratio", 'pp_total_raw']
# ax.ravel() walks the grid row by row, so the panels are filled in the order
# (1,1), (1,2), (2,1), (2,2), matching the key printed below.
for panel, hue_column in zip(ax.ravel(), hue_columns):
    sns.scatterplot(y='state', x='locale', hue=hue_column,
                    data=districts_data, ax=panel, palette='magma')

print(" KEY:")
print(" (1,1)--> Distribution of pct_black/hispanic, (1,2)--> Distribution of pct_free/reduced")
print(" (2,1)--> Distribution of county_connections_ratio, (2,2)--> Distribution of pp_total_raw")

#### Summary:

In [None]:
# Export the cleaned districts table. The earlier code created an empty
# DataFrame here, so the CSV it wrote contained no data.
data_to_submit = districts_data.copy()
data_to_submit.to_csv('districts_data.csv', index = False)