# Import libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load data

In [None]:
# function to load data
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame.

    Args:
        file_path: Anything ``pd.read_csv`` accepts as its first argument.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(file_path)
    return frame

### Districts dataset

In [None]:
# Read the districts_info CSV from the Kaggle input directory into a DataFrame.
districts = load_data('/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv') # districts data
districts.head() # preview the first rows of the data frame (cell output)

In [None]:
# Report the dimensions of the districts_info dataset:
# shape[0] is the number of rows, shape[1] the number of columns.
print(f" There are {districts.shape[0]} rows and {districts.shape[1]} columns of districts_info dataset.")

### Products dataset

In [None]:
# Read the products_info CSV from the Kaggle input directory into a DataFrame.
products = load_data('/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv') # products data
products.head()  # preview the first rows of the data frame (cell output)

In [None]:
# Report the dimensions of the products_info dataset:
# shape[0] is the number of rows, shape[1] the number of columns.
print(f" There are {products.shape[0]} rows and {products.shape[1]} columns of products_info dataset.")

### Engagement dataset

In [None]:
#load engagement data
def load_engagement(path):
    """Load every CSV file found directly inside ``path``.

    Each file is read into its own DataFrame and tagged with a ``district``
    column holding the file name up to its first ``.`` (for example
    ``"1000.csv"`` -> ``"1000"``).

    Args:
        path: Directory that contains the engagement CSV files. A trailing
            path separator is optional.

    Returns:
        dict mapping each file-name prefix to the DataFrame read from that
        file, in sorted file-name order.
    """
    dataframes = {}
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(path)):
        # os.path.join works whether or not ``path`` ends with a separator.
        file_path = os.path.join(path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be parsed as CSV; skip them.
            continue
        district = file_name.split('.')[0]
        engmnt = pd.read_csv(file_path)
        # Scalar assignment broadcasts to every row without building a list.
        engmnt['district'] = district
        dataframes[district] = engmnt
    return dataframes

In [None]:
# missing values and data information for engagement dataset
def missing_eng(data):
    """Total per-column missing-value counts and row counts across frames.

    Args:
        data: dict whose values are pandas DataFrames.

    Returns:
        Tuple ``(missing_data, rows)`` where ``missing_data`` is an int64
        Series of NaN counts per column summed over every frame, and ``rows``
        is the combined number of rows. An empty dict yields an empty Series
        and ``0``.
    """
    missing_data = pd.Series(dtype="int64")
    rows = 0
    for position, frame in enumerate(data.values()):
        rows += len(frame)
        counts = frame.isna().sum()
        if position == 0:
            missing_data = counts
        else:
            # fill_value=0 keeps a column that is absent from some frames
            # from turning the running total into NaN.
            missing_data = missing_data.add(counts, fill_value=0)
    # Index alignment may have promoted the totals to float; counts are ints.
    return missing_data.astype("int64"), rows
# Load the engagement CSV files into a dict of DataFrames (one entry per file).
engagement_data = load_engagement('/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data/')

# Handling Missing Values

In [None]:
# how many missing values exist or better still what is the % of missing values in the dataset?
def percent_missing(df, d_type):
    """Print the percentage of cells in ``df`` that are missing.

    Args:
        df: pandas DataFrame to inspect.
        d_type: Dataset label interpolated into the printed message.

    Returns:
        None; the percentage (rounded to 2 decimals) is only printed.
    """
    # DataFrame.size is rows * columns. It replaces np.product, which was
    # removed in NumPy 2.0.
    total_cells = df.size

    # Sum the per-column missing counts into one total.
    total_missing = int(df.isnull().sum().sum())

    # An empty frame has no cells; report 0 % instead of dividing by zero.
    percent = (total_missing / total_cells) * 100 if total_cells else 0.0

    print(f"The {d_type} dataset contains", round(percent, 2), "%", "of missing values.")
    
# Print the overall missing-value percentage for each of the two info tables.
percent_missing(products, 'products_info')
percent_missing(districts, 'districts_info')


In [None]:
# Which columns of the products_info dataset have missing values?
# pro_msg_info holds the missing-value count per column; pro_col lists the column names.
pro_msg_info = products.isna().sum()
pro_col = list(pro_msg_info.keys())
pro_msg_info

In [None]:
# Print the missing-value count for each column of the products_info dataset
# (uses pro_col and pro_msg_info computed in the previous cell).
for pdct in pro_col:
    print(f"Total missing values for {pdct} column is {pro_msg_info[pdct]}")

In [None]:
# Which columns of the districts_info dataset have missing values?
# dist_msg_info holds the missing-value count per column; dist_col lists the column names.
dist_msg_info = districts.isna().sum()
dist_col = list(dist_msg_info.keys())
dist_msg_info

In [None]:
# Print the missing-value count for each column of the districts_info dataset
# (uses dist_col and dist_msg_info computed in the previous cell).
for dst in dist_col:
    print(f"Total missing value for {dst} column is {dist_msg_info[dst]}")

In [None]:
# Summarise missing values across every frame of the engagement dataset:
# missing_eng returns the per-column missing counts and the total row count.
enge_mssing_data, enge_rows = missing_eng(engagement_data)
print(f"Total number of rows in the engagement dataset are: {enge_rows}.")
print(f'\nMissing value information \n {enge_mssing_data}\n')

# Print the same counts one column per line.
enge_mssing_d = list(enge_mssing_data.keys())
for eg in enge_mssing_d:
    print(f"Total missing value for {eg} column is {enge_mssing_data[eg]}")

# Visualization

In [None]:
# plot state distribution for districts
def district_count_plot(df, col, title, hue=None):
    """Draw a horizontal count plot of ``df[col]``, most frequent value first.

    Args:
        df: pandas DataFrame holding the data.
        col: Name of the categorical column to count; plotted on the y axis.
        title: Figure title.
        hue: Optional column name forwarded to ``sns.countplot`` as ``hue``.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(20, 7))
    sns.countplot(data=df, y=col, hue=hue, order=df[col].value_counts().index)
    plt.title(title, size=20)
    # The categories are on the y axis (y=col), so the counts run along x.
    plt.xlabel("Count", fontsize=16)
    plt.ylabel(col, fontsize=16)
    plt.show()


district_count_plot(districts, 'state', 'State Distribution for district')

In [None]:
# plot distribution of schools by locale
def pie_plot(df):
    """Show a pie chart sized by ``df['district_id']`` and labelled by ``df['locale']``.

    Args:
        df: DataFrame with a ``locale`` column (wedge labels) and a
            ``district_id`` column (wedge sizes).

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    wedge_labels = df['locale'].tolist()
    wedge_sizes = df['district_id'].tolist()

    figure = plt.figure()
    axes = figure.add_axes([0, 0, 1, 1])
    axes.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    axes.pie(wedge_sizes, labels=wedge_labels, autopct='%1.1f%%')
    plt.title("Distribution of schools by locale")
    plt.show()


# Count the district_id entries per locale, then plot the counts.
localschool = (
    districts[['district_id', 'locale']]
    .groupby('locale')
    .count()
    .reset_index()
)
pie_plot(localschool)

In [None]:
# Plot the distribution of schools by state.
# school_state holds, per state, the count of district_id entries.
school_state = districts[['district_id', 'state']].groupby('state').count().reset_index()
plt.figure(figsize=(12,7))
plt.title("Distribution of schools by state")
# Horizontal bars: states on the y axis, counts on the x axis.
sns.barplot(y = school_state['state'], x = school_state['district_id'])
plt.show()

In [None]:
def school_district(df, col, title, hue=None):
    """Draw a horizontal count plot of ``df[col]``, most frequent value first.

    Args:
        df: pandas DataFrame holding the data.
        col: Name of the categorical column to count; plotted on the y axis.
        title: Figure title.
        hue: Optional column name forwarded to ``sns.countplot`` as ``hue``.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(20, 7))
    sns.countplot(data=df, y=col, hue=hue, order=df[col].value_counts().index)
    plt.title(title, size=20)
    # The categories are on the y axis (y=col), so the counts run along x.
    plt.xlabel("Count", fontsize=16)
    plt.ylabel(col, fontsize=16)
    plt.show()


school_district(districts, 'locale', 'Locale Distribution')

In [None]:
# bar plot for top 10 providers
def top10_bar_plot(df, x_col, y_col, title=''):
    """Show a bar plot of ``df[y_col]`` against ``df[x_col]``.

    Args:
        df: pandas DataFrame holding the data.
        x_col: Column name for the x axis; also used as the x-axis label.
        y_col: Column name for the y axis; also used as the y-axis label.
        title: Optional figure title.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    label_size = 16
    plt.figure(figsize=(20, 7))
    sns.barplot(x=x_col, y=y_col, data=df)
    plt.xlabel(x_col, fontsize=label_size)
    plt.ylabel(y_col, fontsize=label_size)
    plt.title(title, size=20)
    plt.show()


# Take the ten most frequent provider names and plot them.
result = products['Provider/Company Name'].value_counts().head(10)
top_providers = pd.DataFrame({'Company': result.index, 'Count': result})
top10_bar_plot(top_providers, "Count", "Company", title='Top 10 providers')