In [None]:
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

### First we import all the relevant libraries. This is my first analytics challenge on Kaggle, so this notebook will keep changing as I add new things.

#### I first need to get a feel for the various datasets and the meaning of their columns

In [None]:
# Load the product metadata table shipped with the competition data
products = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv')
# Fixed seed so the preview shows the same 10 rows on every Restart & Run All
products.sample(10, random_state=42)

In [None]:
# (number of rows, number of columns) of the products table
products.shape

In [None]:
# Column names, non-null counts and dtypes of the products table
products.info()

#### Products seems to contain information about the tools used for digital learning. We'll have to preprocess the `Sector(s)` and `Primary Essential Function` columns.

In [None]:
# Load the school district metadata table shipped with the competition data
districts = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')
# Fixed seed so the preview shows the same 10 rows on every Restart & Run All
districts.sample(10, random_state=42)

In [None]:
# (number of rows, number of columns) of the districts table
districts.shape

In [None]:
# Column names, non-null counts and dtypes of the districts table
districts.info()

### Districts contains information about the various districts

In [None]:
# Directory with the engagement CSV files; each file name (without its
# extension) is used below as the district_id of the rows it contains.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# glob makes no ordering guarantee, so sort to get the same row order on every run
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No engagement CSV files found in {path!r}")

li = []

for filename in all_files:
    district_engagement = pd.read_csv(filename, index_col=None, header=0)
    # Derive the id from the file's base name rather than from a fixed position
    # in the path, so it keeps working if the directory depth or separator changes.
    district_id = os.path.basename(filename).split(".")[0]
    district_engagement["district_id"] = district_id
    li.append(district_engagement)

# Stack all per-district frames and renumber the rows 0..n-1
engagement_df = pd.concat(li, ignore_index=True)
engagement_df.head()

In [None]:
# (number of rows, number of columns) of the combined engagement table
engagement_df.shape

In [None]:
# Column names, non-null counts and dtypes of the combined engagement table
engagement_df.info()

### The engagement file contains information about engagement of each tool for each day per school district

### The three dataframes can be joined using the `district_id` and `lp_id` columns (`lp_id` is called `LP ID` in the products table)

### before preprocessing we first check for missing values

In [None]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a two-line summary (how many columns ``df`` has and how many of
    them contain missing values) and returns a DataFrame indexed by column
    name with three columns:

    * ``'Missing Values'``    - number of null entries in the column
    * ``'% of Total Values'`` - null entries as a percentage of the row count
    * ``'Dtype'``             - dtype of the column

    Only columns whose missing percentage is non-zero are kept. Rows are
    ordered by that percentage, largest first, and numeric values are rounded
    to one decimal place.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # One row per column of df, with the three statistics side by side
    summary = pd.concat(
        [null_counts, null_share, df.dtypes],
        axis=1,
        keys=['Missing Values', '% of Total Values', 'Dtype'],
    )

    # Keep the incomplete columns only, worst first
    has_missing = summary['% of Total Values'] != 0
    summary = (
        summary[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    n_columns = df.shape[1]
    n_incomplete = summary.shape[0]
    print(f"Your selected dataframe has {n_columns} columns.\n"
          f"There are {n_incomplete} columns that have missing values.")

    return summary

In [None]:
# Count and percentage of missing values per column of `products`,
# used to decide whether to impute or drop

percentage_missing_products = missing_values_table(products)
percentage_missing_products

#### we drop the rows with null values since the percentage is so small

In [None]:
# Drop, in place, every product row that has a null in any of the three listed columns.
# NOTE: `products` is mutated here, so previews shown by earlier cells no longer match it.
products.dropna(subset=['Sector(s)', 'Primary Essential Function','Provider/Company Name'],inplace=True)

In [None]:
# Count and percentage of missing values per column of `districts`,
# used to decide whether to impute or drop

percentage_missing_districts = missing_values_table(districts)
percentage_missing_districts

### We will drop the districts with a missing `state`, `pct_free/reduced` or `county_connections_ratio`, and remove the column `pp_total_raw`

In [None]:
# Rows: keep only districts that have a state, a pct_free/reduced value and a
# county_connections_ratio value. Column: remove pp_total_raw altogether.
# Both operations mutate `districts` in place.
required_columns = ['state', 'pct_free/reduced', 'county_connections_ratio']
districts.dropna(subset=required_columns, inplace=True)
districts.drop(columns='pp_total_raw', inplace=True)

In [None]:
# Count and percentage of missing values per column of `engagement_df`,
# used to decide whether to impute or drop

percentage_missing_engagement_df = missing_values_table(engagement_df)
percentage_missing_engagement_df

### We will drop the engagement rows that have a missing `engagement_index`, `pct_access` or `lp_id`

In [None]:
# Drop, in place, every engagement row that has a null in any of the three listed columns.
# NOTE: `engagement_df` is mutated here, so previews shown by earlier cells no longer match it.
engagement_df.dropna(subset=['engagement_index','pct_access','lp_id'],inplace=True)

## Preprocessing 

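`district_id` in the engagement data was read from the file names, so it needs to be converted to an integer.
We'll also one-hot encode the `Sector(s)` column and split `Primary Essential Function` into a main and a sub category.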

In [None]:
# One 0/1 indicator column per sector; a product can list several sectors separated by "; ".
temp_sectors = products['Sector(s)'].str.get_dummies(sep="; ")
# Prefix each indicator with "sector_" and strip spaces from the sector name
# (a plain string replacement; no regular expression is needed for a literal space).
temp_sectors.columns = [f"sector_{c.replace(' ', '')}" for c in temp_sectors.columns]
# join aligns on the row index, so each product gets its own indicators
products = products.join(temp_sectors)
products.drop("Sector(s)", axis=1, inplace=True)

del temp_sectors



In [None]:
# Split 'Primary Essential Function' on " - " and keep the first two parts as
# main and sub category. The vectorised .str accessor passes NaN through and
# yields NaN (instead of raising IndexError) for a value that has no second part.
function_parts = products['Primary Essential Function'].str.split(' - ')
products['primary_function_main'] = function_parts.str[0]
products['primary_function_sub'] = function_parts.str[1]
del function_parts

# Synchronize similar values
products['primary_function_sub'] = products['primary_function_sub'].replace({'Sites, Resources & References' : 'Sites, Resources & Reference'})
products.drop("Primary Essential Function", axis=1, inplace=True)

In [None]:
# district_id was derived from the engagement file names, so it is a string here; cast it to int.
# NOTE(review): presumably this matches the dtype of districts['district_id'] for the later merge — confirm.
engagement_df['district_id'] = engagement_df['district_id'].astype(int)

Let's also rename the `LP ID` column of products to `lp_id`, the name used in the engagement data, so we can merge on it later

In [None]:
# Give the products key column the same name as engagement_df's `lp_id` column (renamed in place)
products.rename(columns={'LP ID': 'lp_id'}, inplace=True)

## We'll now combine the three dataframes so we can explore them

In [None]:
# Attach the district attributes to every engagement record. The inner join
# keeps only records whose district_id appears in both tables.
df = engagement_df.merge(districts, on='district_id', how='inner')
df.head()

In [None]:
# Attach the product attributes as well. The inner join keeps only records
# whose lp_id appears in both tables.
df = df.merge(products, on='lp_id', how='inner')

## We now have a new dataframe

In [None]:
# Preview the first rows of the merged frame
df.head()

### We dropped quite a lot of rows because of NaN values, but we'll continue regardless; we can come back later and impute values after the initial analysis

In [None]:
# (number of rows, number of columns) of the merged frame
df.shape

## We'll begin exploration now that our data is in a good format

### let's look at the distribution of districts first

In [None]:
# Number of districts per state.
# df holds one row per engagement record, so counting its rows directly would
# measure engagement volume; reduce to one row per district_id first.
districts_in_df = df.drop_duplicates(subset='district_id')
plt.figure(figsize=(10,10))
_ = sns.countplot(y='state', data=districts_in_df, order=districts_in_df['state'].value_counts().index)
plt.xlabel('count of number')
plt.title('The number of Districts group by state',fontsize=20)

In [None]:
# Number of districts per locale.
# df holds one row per engagement record, so count distinct district_id values
# per group instead of rows.
group = df.groupby('locale')['district_id'].nunique()
_ = sns.barplot(x=group.index, y=group.values)
plt.ylabel('count of number')
plt.title('The number of Districts per locale',fontsize=12)

In [None]:
# Number of districts per pct_black/hispanic value.
# df holds one row per engagement record, so count distinct district_id values
# per group instead of rows.
group = df.groupby('pct_black/hispanic')['district_id'].nunique()
_ = sns.barplot(x=group.index, y=group.values)
plt.ylabel('count of number')
plt.title('The number of Districts per pct_black/hispanic',fontsize=12)

In [None]:
# Number of districts per pct_free/reduced value.
# df holds one row per engagement record, so count distinct district_id values
# per group instead of rows.
group = df.groupby('pct_free/reduced')['district_id'].nunique()
_ = sns.barplot(x=group.index, y=group.values)
plt.ylabel('count of number')
plt.title('The number of Districts per pct_free/reduced',fontsize=12)

In [None]:
# Engagement records per provider, most frequent provider first.
# (df has one row per engagement record, so this countplot counts records.)
plt.figure(figsize=(10,10))
_ = sns.countplot(y='Provider/Company Name', data=df, order=df['Provider/Company Name'].value_counts().index)
plt.xlabel('count of number')
# The title previously repeated the state plot's title; describe this chart instead
plt.title('The number of engagement records per Provider/Company Name',fontsize=20)

In [None]:
# Rows of the districts table per county_connections_ratio value
group = districts.groupby('county_connections_ratio').count()
ax = sns.barplot(x=group.index, y=group['district_id'])
ax.set_ylabel('count of number')
ax.set_title('The number of Districts group by county_connections_ratio', fontsize=12)

### what is the state of digital learning in 2020?