In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from statistics import mean
import matplotlib.pyplot as plt

### Build class for data preprocessing

In [None]:
class GetDfForPreprocessing:
    """
    Inspect and clean the missing values of a dataframe.

    The instance works on its own copy of the data (``self.df``); the
    dataframe passed in is never modified.

    Attributes set by ``print_df_info`` and ``drop_cols_abv_na_trshld``
    ----------------------------------------------------------------
    na_cols : list
        names of the columns that contain at least one missing value
    na_col_val_dict : dict
        maps each of those columns to its number of missing rows
    """
    def __init__(self, df:pd.DataFrame):

        self.df = df.copy()

    def _missing_summary(self):
        """
        compute the missing-value summary from the current ``self.df``

        Return
        ------
        tuple (na_cols, na_col_val_dict), columns in dataframe order
        """
        na_counts = self.df.isnull().sum()
        na_counts = na_counts[na_counts > 0]
        na_cols = list(na_counts.index)
        na_col_val_dict = {col: int(count) for col, count in na_counts.items()}
        return na_cols, na_col_val_dict

    def print_df_info(self) -> None:
        """
        this function will print the info of the dataframe: column names,
        number of rows and the missing values per column and in total

        It also stores ``self.na_cols`` and ``self.na_col_val_dict``.

        Return
        ------
        None
        """
        print('Retrieving info from data...')
        #save the number of columns and names
        columns = [str(col) for col in self.df.columns]
        if len(columns) > 1:
            # every column but the last is comma separated, the last follows "and"
            col_names = '{} and {}'.format(', '.join(columns[:-1]), columns[-1])
        else:
            col_names = ''.join(columns)
        col_info = 'The number of colum(s): {}.\nThe column(s) is/are : {}\n'.format(len(columns), col_names)

        #save the number of rows
        num_rows = "\nThe total number of rows: {}".format(len(self.df))

        na_cols, na_col_val_dict = self._missing_summary()

        #save the number of missing values
        num_na_cols = "\nThe number of columns having missing value(s): {}".format(len(na_cols))

        #save the columns with missing value and the num of values missing
        na_cols_num_na = ''
        for col in na_cols:
            na_cols_num_na += "\nThe number of rows with missing value(s) in [{}]: {}".format(col, na_col_val_dict[col])

        # save the total number of missing values
        tot_na = "\nThe total number of missing value(s): {}".format(sum(na_col_val_dict.values()))

        self.na_cols = na_cols
        self.na_col_val_dict = na_col_val_dict

        print(col_info, num_rows, num_na_cols, na_cols_num_na, tot_na)


    def drop_cols_abv_na_trshld(self, threshold:float, exclude=None, output=False) -> "pd.DataFrame | None":
        """
        this function will drop columns with missing values above a specified threshold

        Parameters
        ----------
        threshold : float
            fraction of the rows; a column is dropped when its number of
            missing rows is strictly greater than ``threshold * len(df)``
        exclude : iterable of column names, optional
            columns that must be kept even if they exceed the threshold;
            names that are not candidates for dropping are ignored
        output : bool
            when True the resulting dataframe is returned

        Return
        ------
        dataframe if ``output`` is True, otherwise None
        """
        print('\nComparing threshold with fraction of missing values ...')
        if exclude is None:
            exclude = []
        elif isinstance(exclude, str):
            # a bare string would otherwise be iterated character by character
            exclude = [exclude]
        else:
            exclude = list(exclude)

        # always recompute from self.df so the method works whether or not
        # print_df_info ran first, and also when nothing is missing
        na_cols, na_col_val_dict = self._missing_summary()
        tot_entries = len(self.df)

        print('\nRetrieving columns to be dropped ...')
        above_treshold = [col for col in na_cols if na_col_val_dict[col] > threshold * tot_entries]

        print('\nColumns to be dropped :', above_treshold)
        print('\nThe column(s) to be excluded is/are {}'.format(exclude))
        to_drop = [col for col in above_treshold if col not in exclude]

        print('\nDropping columns with missing values above the threshold ...')
        df = self.df.drop(columns=to_drop)
        print('\nDropping columns completed')

        print('\nRemoving dropped columns from memory...')
        self.na_cols = [col for col in na_cols if col not in to_drop]
        self.na_col_val_dict = {col: na_col_val_dict[col] for col in self.na_cols}
        print('\nRemoval of dropped columns from memory completed')

        self.df = df
        if output:
            # hand back a copy so the caller cannot alter the internal state
            return df.copy()
        return None

    def fill_missing(self, exclude=('CompetitionOpenSinceMonth',), method=None):
        """
        this function will fill the missing values of every column that has any

        Numeric columns are filled with their mean (or median), all other
        columns with their most frequent value. ``self.df`` is left
        untouched; the filled data is returned as a new dataframe.

        Parameters
        ----------
        exclude : iterable of column names
            columns to leave unfilled; names without missing values or not
            present in the dataframe are ignored
        method : dict, optional
            ``{'num': 'mean'}`` (default) fills numeric columns with the
            mean, any other value for ``'num'`` uses the median

        Return
        ------
        dataframe
        """
        if method is None:
            method = {'num': 'mean'}
        if exclude is None:
            exclude = []
        elif isinstance(exclude, str):
            exclude = [exclude]
        else:
            exclude = list(exclude)

        df = self.df.copy()
        na_cols, _ = self._missing_summary()
        print('\nThe colums with missing values to be filled are {}'.format(na_cols))
        print('\nThe column(s) to be excluded is/are {}'.format(exclude))

        use_mean = method.get('num', 'mean') == 'mean'
        for col in na_cols:
            if col in exclude:
                continue
            print('\nFilling missing values in {}'.format(col))
            is_number = (pd.api.types.is_numeric_dtype(df[col])
                         and not pd.api.types.is_bool_dtype(df[col]))
            if is_number:
                fill_value = df[col].mean() if use_mean else df[col].median()
            else:
                modes = df[col].mode()
                if modes.empty:
                    # column holds no value at all, there is nothing to fill with
                    continue
                fill_value = modes.iloc[0]
            # assign the result back instead of an in-place fill on a column
            # selection, which is not guaranteed to reach the dataframe
            df[col] = df[col].fillna(fill_value)

        print('\nFilling missing values comppleted')
        return df


### Read Data

In [None]:
# Load the two metadata tables from the competition input directory:
# districts_info.csv into dist_df and products_info.csv into prod_df.
dist_df = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')
prod_df = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv')

In [None]:
dist_df.head()

In [None]:
prod_df.head()

### Process District data

In [None]:
prep_dist = GetDfForPreprocessing(dist_df)
prep_dist.print_df_info()

### Get insight about states

In [None]:
# unique() reports a missing value (NaN) as one more entry, so missing states
# are dropped first; the length is then the number of actual states.
a = list(dist_df['state'].dropna().unique())
len(a)

In [None]:
dist_df['state'].mode()

There are 23 states in the district data frame. The most frequently occurring state is Connecticut.

### Drop null values in ['state'] column

In [None]:
dist_st_notna = dist_df[dist_df['state'].notna()].copy()

In [None]:
prep_ = GetDfForPreprocessing(dist_st_notna)
prep_.print_df_info()

### Drop columns whose missing values exceed 30% of the total rows

In [None]:
prep_.drop_cols_abv_na_trshld(threshold=0.3)

### Fill missing values with the mode of each column

In [None]:
dist_clean_df = prep_.fill_missing(exclude=[])

In [None]:
dist_clean_df.head()

In [None]:
dist_clean_df['locale'].unique()

### Clean District data

In [None]:
# define function to get average of range inputs
def avg(entry):
    """
    Return the arithmetic mean of the comma-separated numbers in ``entry``.

    ``entry`` is a string such as ``'[1, 2['``; any ``[`` characters at
    either end are removed before the numbers are parsed as floats.
    """
    pieces = entry.strip('[').split(',')
    numbers = list(map(float, pieces))
    return mean(numbers)

In [None]:
# Every column from the fourth one onward is treated as a range column.
# NOTE(review): this relies on the column order of dist_clean_df — confirm
# that the first three columns are the non-range ones.
rnge_cols = dist_clean_df.columns[3:]  # retrieve columns with range values

# Replace each range string by the value avg() computes for it.
# This cell is not idempotent: after the first run the columns hold floats,
# and avg() calls .strip() on its argument, so a second run fails.
for col in rnge_cols:
    dist_clean_df[col] = dist_clean_df[col].apply(avg) # find the average of the ranges
dist_clean_df.reset_index(drop=True, inplace=True) # reset the index

In [None]:
dist_clean_df.head()

### Extract info from ['Primary Essential Function'] column

In [None]:
from collections import defaultdict

# Map every product category (the text before the first ' - ' in
# 'Primary Essential Function') to the set of sub-functions seen for it.
p_esn_funct = defaultdict(set)
for e in prod_df['Primary Essential Function'].unique():
    if not isinstance(e, str):
        continue  # missing value
    temp = e.split(' - ')
    sub_functions = p_esn_funct[temp[0]]  # registers the category even without a sub-function
    if len(temp) > 1:
        sub_functions.add(temp[1])

# One 0/1 column per category. Matching is by substring, so a category such
# as 'LC' is also flagged on rows whose value starts with 'LC/CM/SDO'.
for key in list(p_esn_funct.keys()):
    prod_df[key] = prod_df['Primary Essential Function'].map(
        lambda value, key=key: int(isinstance(value, str) and key in value)
    )

# Collect the individual sectors; a cell holds either a single sector or
# several separated by ';'.
sectors = set()
for e in prod_df['Sector(s)'].unique():
    if not isinstance(e, str):
        continue  # missing value
    for part in e.split(';'):
        part = part.strip()
        if part:
            sectors.add(part)

# One 0/1 column per sector, created in sorted order so that the column
# layout is the same on every run.
for key in sorted(sectors):
    prod_df[key] = prod_df['Sector(s)'].map(
        lambda value, key=key: int(isinstance(value, str) and key in value)
    )


### Drop null values in product data

In [None]:
prod_df.dropna(inplace=True)

### Create A Dictionary with states as keys and DataFrames as values

In [None]:
# get the filter to be used to extract data based on states
# One boolean row mask per state. Iterating over the unique states builds each
# mask once instead of once per district row; key order (first appearance in
# the column) is unchanged.
filters_dict = {
    st: dist_clean_df['state'] == st
    for st in dist_clean_df['state'].unique()
}

In [None]:
# get the dictionary with data based on states
State_df = {k:dist_clean_df[filters_dict[k]].copy() for k in filters_dict.keys()}

In [None]:
# Integrate data from engagement with district data
engagement_dir = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'

# state name -> engagement rows of all its districts, each row carrying the
# district attributes (locale, demographics, connection ratio)
State_df_full = {}
for key, df in State_df.items():
    frames = []
    for idx, d_id in df['district_id'].items():
        temp_df = pd.read_csv(engagement_dir + str(d_id) + '.csv', parse_dates=['time'])
        temp_df['district_id'] = d_id
        temp_df['month'] = temp_df['time'].dt.month
        temp_df['weekday'] = temp_df['time'].dt.weekday + 1  # Monday=1 ... Sunday=7
        temp_df['locale'] = df.at[idx, 'locale']
        temp_df['state'] = key
        temp_df['pct_black/hispanic'] = df.at[idx, 'pct_black/hispanic']
        temp_df['pct_free/reduced'] = df.at[idx, 'pct_free/reduced']
        temp_df['county_connections_ratio'] = df.at[idx, 'county_connections_ratio']
        frames.append(temp_df.dropna())
    if frames:
        # Concatenate once per state instead of once per district. The list is
        # reversed because the original loop put each new district in front of
        # the rows gathered so far.
        State_df_full[key] = pd.concat(frames[::-1], ignore_index=True)

In [None]:
# Integrate data from products with district and engagement data
# Drop the per-state district frames. Note: the engagement cell above reads
# State_df, so it cannot be re-run after this line without rebuilding it.
State_df = {}
# state name -> engagement rows joined with the product info on the product id
State_df_full_prod = {}
for key, engagement in State_df_full.items():
    merged = engagement.merge(prod_df, left_on='lp_id', right_on='LP ID')
    State_df_full_prod[key] = merged

In [None]:
# get the state and the locales('suburb','ciites',...) in each state {state:[locales]}
state_locale = {key:State_df_full_prod[key]['locale'].unique() for key in State_df_full_prod}

### Data has been processed and merged in a way that enables analysis on a monthly, weekly and daily basis. It is also in a format where analysis can be made state by state, with a breakdown by locale within each state. This will enable analysts to link the performance of states on chosen parameters and to recommend the approaches that worked and those that need small adjustments. The data is saved in dictionaries whose keys are the names of the states and whose values are DataFrames holding the data specific to that state.

### Visualize trends in product type usage by state

In [None]:
slice1=['LC','CM','SDO','LC/CM/SDO']

In [None]:
def plot_viz(state, slice_col, about=''):
    """
    Plot daily totals of the given columns for a state and for each of its locales.

    One figure is drawn for the whole state, followed by one figure per
    locale listed in ``state_locale[state]``. Data comes from
    ``State_df_full_prod[state]``.

    Parameters
    ----------
    state : str
        key into ``State_df_full_prod`` and ``state_locale``
    slice_col : str or list of str
        column(s) to sum per value of the 'time' column and plot
    about : str
        subject of the plots, inserted into the figure titles

    Return
    ------
    None
    """
    def _trend_axes(title):
        # one large, consistently styled axes per figure
        fig, axes = plt.subplots(figsize=(15, 10))
        plt.rcParams.update({'axes.titlesize': 'Large'})
        axes.tick_params(axis='x', labelsize=16)
        axes.tick_params(axis='y', labelsize=16)
        axes.set_title(title, fontsize=22)
        axes.grid(color='black', linestyle='--', linewidth=0.1)
        return axes

    dfs = State_df_full_prod[state]

    # Select the columns before aggregating so that only they are summed;
    # summing the whole frame would also try to add up the text columns.
    ax = _trend_axes('Trends In ' + about + ' In ' + state)
    dfs.groupby('time')[slice_col].sum().plot(ax=ax, grid=True)

    for locale in state_locale[state]:
        place = 'Cities' if locale == 'City' else locale + 's'
        ax1 = _trend_axes('Trends In ' + about + ' In ' + place + ' In ' + state)

        mask = dfs['locale'] == locale
        dfs[mask].groupby('time')[slice_col].sum().plot(ax=ax1, grid=True)

In [None]:
def plot_top():
    # Placeholder only: the next cell defines plot_top again with the real
    # implementation, which replaces this stub as soon as that cell runs.
    pass

In [None]:
def plot_top(state, slice_col,top=10, about=''):
    """
    Bar-plot the most frequent values of a column for a state and each of its locales.

    One figure is drawn for the whole state, followed by one figure per
    locale listed in ``state_locale[state]``. Data comes from
    ``State_df_full_prod[state]``.

    Parameters
    ----------
    state : str
        key into ``State_df_full_prod`` and ``state_locale``
    slice_col : str or list of str
        column(s) whose values are counted; a one-element list is treated as
        that single column so the bars are labelled with the plain values
    top : int
        number of most frequent values to show
    about : str
        subject of the plots, inserted into the figure titles

    Return
    ------
    None
    """
    def _bar_axes(title):
        # one large, consistently styled axes per figure
        fig, axes = plt.subplots(figsize=(15, 10))
        plt.rcParams.update({'axes.titlesize': 'Large'})
        axes.tick_params(axis='x', labelsize=16)
        axes.tick_params(axis='y', labelsize=16)
        axes.set_title(title, fontsize=22)
        axes.grid(color='black', linestyle='--', linewidth=0.1)
        return axes

    columns = slice_col
    if isinstance(columns, (list, tuple)) and len(columns) == 1:
        columns = columns[0]

    dfs = State_df_full_prod[state]

    # same title wording as the per-locale figures below
    ax = _bar_axes('Trends In ' + about + ' In ' + state)
    dfs[columns].value_counts().head(top).plot(grid=True, kind='bar', ax=ax)

    for locale in state_locale[state]:
        place = 'Cities' if locale == 'City' else locale + 's'
        ax1 = _bar_axes('Trends In ' + about + ' In ' + place + ' In ' + state)

        mask = dfs['locale'] == locale
        dfs[mask][columns].value_counts().head(top).plot(ax=ax1, grid=True, kind='bar')

In [None]:
states = list(state_locale.keys())

In [None]:
len(states)

### Initial questions :
**1. What are the trends in the type of product used, engagements, top products, top companies?**

**2. What are the trends in the various locales of each state?**

**3. What are the trends in the states taking all locales?**

### Products are labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations

In [None]:
plot_viz(state=states[0], slice_col=slice1, about='Product Type Usage')

In [None]:
plot_viz(state=states[0], slice_col=['engagement_index'], about='Engagement')

In [None]:
plot_top(state=states[0], slice_col=['Product Name'],top=10, about='Specific Product Usage')

In [None]:
plot_top(state=states[0], slice_col=['Provider/Company Name'],top=10, about='Specific Product Providers')

### engagement_index : sum of Total page-load events per one thousand students of a given product and on a given day
### Products are labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations

## State of Illinois
The LC product type was the most used in the state. Between the periods of January and the end of February the usage for LC products alternated between 1500 and 2100. This period precedes the closure of schools during the spring (March, April, May).
During the Spring Period there was an increment in usage relative to the period from January to the end of February. The usage ranged between 1500 and 3000 with April recording usage between 1800 and 2200.
There was a decline in usage from June to August. This period recorded the lowest usage all year, with values ranging between 900 and 1200. August to December saw a rise in usage. There was a steep rise from August to September. The highest usage values were recorded between August and December.
The usage of CM and SDO product types were generally at par. The trends mirrored that of the LC types but with a lower usage. The usage for the two did not exceed 500.
The SDO type had slightly more usage than CM from January to March. CM usage caught up with SDO usage from April to September, after which it gained slightly more usage than SDO.
### Locales
The plot of the **Suburbs** mirrors the plot for the state. This indicates that the suburb districts are the major driving force of the usage in the state. The usage of LC peaks at 2700. That of CM and SDO both peaks at an approximate value of 400
The **rural** districts also mirror the trends of the state in general but have LC usage peaking at 600 and both CM and SDO at approximately 100
The trends in the **town** districts are a little bit different. Data for the usage starts from March 2020 to January 2021. The usage difference between LC and the other types is much closer than what is observed in the other locales. The profile (horizontal trends) follows that of the state. The usage of LC peaks at approximately 75. That of CM and SDO both peaks at approximately 10.

## The covid effect
The trends before and during the closure of schools indicate an increase in the use of products during the closure period. As expected, Covid increased the usage of products in the state of Illinois, which further implies greater reach through products.

In [None]:
plot_viz(state=states[1], slice_col=slice1, about='Product Type Usage') 
plot_viz(state=states[1], slice_col=['engagement_index'], about='Engagement')
plot_top(state=states[1], slice_col=['Product Name'],top=10, about='Specific Product Usage')
plot_top(state=states[1], slice_col=['Provider/Company Name'],top=10, about='Specific Product Providers')

## State of Utah
LC products were the most used in the state. There was a steady rise in usage from January to April. Usage ranged between 800 and 3600 from January to April. March, April and May also recorded a steady rise, with a localised decline in April. The usage in this period ranged between 1800 and 3900.
There was a steady decline from May to June and a very steep decline from May to June. The lowest usage (700)   was recorded from July to August. The usage remained stable at low values from July to August with values ranging between 700 and 1100. 
A steep rise was recorded from August to September and a gentle rise from September to October. There was a drop in usage in the middle of September followed by a rise which led to the highest usage all year in October. In general, the usage from September to December was the highest. 
SDO was the second most used product type from January to April. April to May saw CM catching up with SDO usage and doing slightly better from May to June. Between June and August, the usage of both products declined and remained generally the same. From August to September, there was a rise in usage of SDO and CM. In the remaining months of the year, CM performed better than SDO.
### Locales
There was data for districts in Towns, Cities, Rurals and Suburbs.
The product usage in **cities** followed the trends seen in the state. LC peaked at 500, CM and SDO both peaked at 80. The lowest usage of LC was about 20 which is less than the highest usage of CM and SDO. This trend was not seen in the state.
The product usage in **towns** followed the trends seen in the state. LC peaked at 1100, CM and SDO both peaked at 160. The lowest usage of LC was about 80 which is less than the highest usage of CM and SDO. This trend was not seen in the state.
The product usage in **rurals** followed the trends seen in the state. LC peaked at 275, CM and SDO both peaked at 45. The lowest usage of LC was about 0 which coincided with the lowest usage of CM and SDO. This trend was not seen in the state.
The product usage in **suburbs** followed the trends seen in the state. LC peaked at 2400, CM and SDO both peaked at 350.
## The covid effect
The trends before and during the closure of schools indicate an increase in the use of products during the closure period. As expected, Covid increased the usage of products in the state of Utah, which further implies greater reach through products.



In [None]:
plot_viz(state=states[2], slice_col=slice1, about='Product Type Usage') 
plot_viz(state=states[2], slice_col=['engagement_index'], about='Engagement')
plot_top(state=states[2], slice_col=['Product Name'],top=10, about='Specific Product Usage')
plot_top(state=states[2], slice_col=['Provider/Company Name'],top=10, about='Specific Product Providers')

In [None]:
plot_viz(state=states[3], slice_col=slice1, about='Product Type Usage') 
plot_viz(state=states[3], slice_col=['engagement_index'], about='Engagement')
plot_top(state=states[3], slice_col=['Product Name'],top=10, about='Specific Product Usage')
plot_top(state=states[3], slice_col=['Provider/Company Name'],top=10, about='Specific Product Providers')

In [None]:
plot_viz(state=states[4], slice_col=slice1, about='Product Type Usage') 
plot_viz(state=states[4], slice_col=['engagement_index'], about='Engagement')
plot_top(state=states[4], slice_col=['Product Name'],top=10, about='Specific Product Usage')
plot_top(state=states[4], slice_col=['Provider/Company Name'],top=10, about='Specific Product Providers')

In [None]:
plot_viz(state=states[5], slice_col=slice1, about='Product Type Usage') 
plot_viz(state=states[5], slice_col=['engagement_index'], about='Engagement')
plot_top(state=states[5], slice_col=['Product Name'],top=10, about='Specific Product Usage')
plot_top(state=states[5], slice_col=['Provider/Company Name'],top=10, about='Specific Product Providers')

In [None]:
# The previous cell already plots states[5]; this cell continues with the
# next state instead of repeating the same four figures.
plot_viz(state=states[6], slice_col=slice1, about='Product Type Usage')
plot_viz(state=states[6], slice_col=['engagement_index'], about='Engagement')
plot_top(state=states[6], slice_col=['Product Name'],top=10, about='Specific Product Usage')
plot_top(state=states[6], slice_col=['Provider/Company Name'],top=10, about='Specific Product Providers')