In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import rcParams
import plotly.express as px
import plotly.io as pio
from ipywidgets import Dropdown, Button, VBox, HBox, Output
from IPython.display import clear_output, display 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import cufflinks as cf
import plotly.offline

In [None]:
# Matplotlib font sizes applied to every static figure.
params = {
    'legend.fontsize': 10,
    'axes.labelsize': 16,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
}
rcParams.update(params)

# Let pandas show up to 500 rows and 500 columns before truncating.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Plotly defaults: white template and the Blackbody continuous colour scale.
pio.templates.default = "plotly_white"
px.defaults.color_continuous_scale = px.colors.sequential.Blackbody

# Cufflinks configuration used by the `.iplot(...)` calls in this notebook.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [None]:
# Dataset folder under the read-only ../input directory; every CSV read in this
# notebook is resolved relative to it with os.path.join.
base_address = '../input/learnplatform-covid19-impact-on-digital-learning'

# District Info


- `district_id`: unique identifier for a district in a US state
- `state`: state to which a district belongs
- `locale`: kind of location to which the district belongs
    - suburb, town, rural, city
- `pct_black/hispanic`: percentage of students in a district who are Black or Hispanic
- `pct_free/reduced`: percentage of students eligible for free or reduced-price lunch
- `county_connections_ratio`: ratio of the county residents with a high-speed internet connection (>=200kbps)
- `pp_total_raw`: per-pupil total expenditure (local + federal)
    

In [None]:
def parse_range_feature(x):
    """Return the midpoint of a bracketed range string, rounded to one decimal.

    The input is expected to be two comma-separated numbers wrapped in one
    bracket character on each side (for example ``'[0.2, 0.4['``). The first
    and last characters are dropped, both halves are parsed as floats, and
    their mean is returned.

    Null values (anything ``pd.isnull`` reports as missing) are returned
    unchanged so the function can be used with ``Series.apply`` on columns
    that contain gaps.
    """
    if pd.isnull(x):
        return x
    # Split once instead of once per bound.
    parts = x.split(',')
    lower = float(parts[0][1:])    # drop the opening bracket
    upper = float(parts[1][:-1])   # drop the closing bracket
    return round(0.5 * (lower + upper), 1)

# Load the district metadata table.
df_district = pd.read_csv(os.path.join(base_address, 'districts_info.csv'))
print('Total number of rows', df_district.shape[0])

df_district['state'] = df_district['state'].astype('category')
df_district['locale'] = df_district['locale'].astype('category')

# parse range features to get mean stat
for col in ['pct_black/hispanic', 'pct_free/reduced', 'county_connections_ratio', 'pp_total_raw']:
    df_district[col] = df_district[col].apply(parse_range_feature)


for col in ['district_id', 'state', 'locale']:
    print(f'Number of unique {col}', df_district[col].nunique())

print()
print('Null values column wise distribution')
print(pd.isnull(df_district).sum().to_dict())

# Rows with no missing value in any column; computed once and reused for the
# three summaries below instead of calling dropna() for each of them.
df_district_complete = df_district.dropna(how='any', axis=0)
print('Number of non nan, rows', df_district_complete.shape[0])

print()
print('State wise non nan rows', df_district_complete.groupby('state')['district_id'].count().to_dict())

print()
print('Locale wise non nan rows', df_district_complete.groupby('locale')['district_id'].count().to_dict())


In [None]:
# distribution of each of the range features, state-wise and locale-wise, as box plots or histograms with iplot
# scatter plot for each combination of range features
# heat map with all range features
# county_connections_ratio is a useless field: it provides no information

In [None]:
# One scatter panel per pair of district features: (x column, y column, legend label).
scatter_pairs = [
    ('pct_black/hispanic', 'pct_free/reduced', 'minority vs free'),
    ('pct_black/hispanic', 'pp_total_raw', 'minority vs investment'),
    # Label follows the same "x vs y" order as the other two panels.
    ('pct_free/reduced', 'pp_total_raw', 'free vs investment'),
]

figure = make_subplots(rows=1, cols=len(scatter_pairs))
for panel, (x_col, y_col, label) in enumerate(scatter_pairs, start=1):
    figure.add_trace(
        go.Scatter(x=df_district[x_col], y=df_district[y_col], mode='markers', name=label),
        row=1, col=panel)
    # Axis titles so each panel can be read on its own.
    figure.update_xaxes(title_text=x_col, row=1, col=panel)
    figure.update_yaxes(title_text=y_col, row=1, col=panel)

# Last expression of the cell: without it the figure is built but never rendered.
figure




In [None]:
# Parallel-coordinates view of the three numeric district features.
px.parallel_coordinates(
    df_district,
    dimensions=['pct_black/hispanic', 'pct_free/reduced', 'pp_total_raw'],
)

- There is some correlation between minority population and free lunches
- regions with free lunches seem to have the lowest investment
- regions with a higher minority population have lower investment

### Locale

In [None]:
import itertools
# Numeric district features reused by the correlation, scatter and aggregation cells.
district_columns = ['pct_black/hispanic', 'pct_free/reduced', 'pp_total_raw']

In [None]:
# Pairwise correlation between the numeric district features.
district_numeric = df_district[district_columns]
district_numeric.corr()

In [None]:
# One panel per pair of district features, points coloured by locale.
fig = make_subplots(rows=1, cols=3)

for i, (col1, col2) in enumerate(itertools.combinations(district_columns, 2)):
    panel = i + 1
    # Keep only the districts where both plotted features are present.
    complete_rows = df_district.dropna(subset=[col1, col2])
    for data in px.scatter(complete_rows, x=col1, y=col2, color='locale')['data']:
        fig.add_trace(data, row=1, col=panel)
    # Axis titles are set once per panel (not once per trace), and are still
    # set when a panel ends up without any trace.
    fig.update_xaxes(title_text=col1, row=1, col=panel)
    fig.update_yaxes(title_text=col2, row=1, col=panel)
fig

- Rural and suburb districts show a clear positive correlation between free lunches and minority population, and also a negative relationship between minority population and investment
- too little data to draw any conclusion for rural and town areas

In [None]:
# Print the code -> locale mapping so the numeric colour scale can be decoded.
locale_column = df_district['locale']
print(dict(enumerate(locale_column.cat.categories)))
px.parallel_coordinates(
    df_district,
    dimensions=['pct_black/hispanic', 'pct_free/reduced', 'pp_total_raw'],
    color=locale_column.cat.codes,
)

## State

In [None]:
# Districts per state, largest first: all rows, then only rows without any missing value.
districts_per_state = df_district.groupby('state')['district_id'].count()
districts_per_state.sort_values()[::-1].iplot(kind='bar', orientation='v', yTitle='District Count')

complete_per_state = df_district.dropna(how='any', axis=0).groupby('state')['district_id'].count()
complete_per_state.sort_values()[::-1].iplot(kind='bar', orientation='v', yTitle='District Count with all data')

In [None]:
# State-level mean of each district feature; pp_total_raw goes on the secondary axis.
state_feature_means = df_district.groupby('state')[district_columns].mean()
state_feature_means.iplot(kind='bar', secondary_y='pp_total_raw')

# Product

In [None]:
# Load the product catalogue and normalise the product id column name.
df_product = pd.read_csv(os.path.join(base_address, 'products_info.csv'))
df_product = df_product.rename(columns={'LP ID': 'lp_id'})

# Keep only the text before the first '-' of "Primary Essential Function" and
# strip surrounding blanks so the group labels carry no trailing space.
# Missing values stay missing.
df_product['function'] = (
    df_product['Primary Essential Function'].str.split('-').str[0].str.strip())
print('shape', df_product.shape)

for col in df_product.columns:
    print(f'Number of unique {col}', df_product[col].nunique(), '  ;',
          'Number of null values', pd.isnull(df_product[col]).sum())

In [None]:
# Number of products in each top-level function group.
products_per_function = df_product.groupby('function')['lp_id'].count()
products_per_function.iplot(kind='bar', theme='white')

In [None]:
# Providers ordered by how many catalogue products they own.
print('Top 10 service providers')
products_per_provider = df_product.groupby('Provider/Company Name')['lp_id'].count()
products_per_provider.sort_values()[::-1].head(10)

- The majority of the products are LC products

# Engagement

## Sample Location

In [None]:
from glob import glob

# glob() returns paths in an arbitrary, filesystem-dependent order. Sort them
# so the single sampled district file is the same on every run.
sample_paths = sorted(glob(os.path.join(base_address, 'engagement_data/*.csv')))[:1]

list_df = []
for x in sample_paths:
    df_engage = pd.read_csv(x)
    # The file name (without extension) is stored as the district id.
    df_engage['district_id'] = os.path.splitext(os.path.basename(x))[0]
    list_df.append(df_engage)
df_engage = pd.concat(list_df)

# Attach product metadata; the inner join drops rows whose lp_id is not in the catalogue.
df_engage = df_engage.set_index('lp_id', drop=True).join(
    df_product.set_index('lp_id', drop=True), how='inner').reset_index()

# Index by timestamp so the later cells can group by time.
df_engage.set_index('time', inplace=True, drop=True)
df_engage.index = pd.to_datetime(df_engage.index)

print('shape', df_engage.shape)
print('time frame', df_engage.index.min(), df_engage.index.max())
print(df_engage[['engagement_index', 'pct_access']].describe())

In [None]:
#df_engage.iplot(kind='line', y='engagement_index', colors='lp_id')
# is the uniformly sampled on a daily basis
#cf.help('line')

In [None]:
# Daily mean of the two engagement metrics per sector. The metric columns are
# selected BEFORE aggregating: averaging the whole frame would also try to
# average the text columns joined in from the product table, which raises on
# pandas >= 2.0.
df_engage_ = (
    df_engage.reset_index()
    .groupby(['time', 'Sector(s)'])[['engagement_index', 'pct_access']]
    .mean()
    .unstack(level=1)
)
# Flatten the (metric, sector) column MultiIndex into "metric_sector" labels.
df_engage_.columns = ['_'.join(x) for x in df_engage_.columns]
secondary_y = [x for x in df_engage_.columns if 'pct_access' in x]
df_engage_.iplot(kind='line', secondary_y=secondary_y, theme='white', yTitle='Average Engagement Index',
                secondary_y_title ='Access %', legend='bottom', title='Sectorwise Average Engagement')

In [None]:

# number of lpids for each district_id
#df_engage.groupby(['district_id'])['lp_id'].nunique().describe

# Mean of the two metrics per timestamp (index level 0). Columns are selected
# before .mean() so the text columns are never aggregated (required on pandas >= 2.0).
daily_mean = df_engage.groupby(level=0)[['engagement_index', 'pct_access']].mean()
daily_mean.iplot(kind='line', secondary_y='pct_access',
                 theme='white', yTitle='Average Engagement Index',
                 secondary_y_title ='Access %',
                 legend='top',
                 title='Average Engagement')

In [None]:
# Monthly mean of the two metrics, computed once and reused for both charts.
# Columns are selected before .mean() so the text columns are never aggregated
# (required on pandas >= 2.0).
monthly_mean = df_engage.groupby(pd.Grouper(freq='M'))[['engagement_index', 'pct_access']].mean()

monthly_mean.iplot(
    kind='line', secondary_y='pct_access',theme='white',yTitle='Average Engagement Index',
                secondary_y_title ='Access %', legend='bottom', title='Monthly Average Engagement')

# Month-over-month change of the same averages.
monthly_mean.sort_index().diff().iplot(
    kind='line', secondary_y='pct_access',theme='white',yTitle='Average Engagement Index',
                secondary_y_title ='Access %', legend='bottom', title='Monthly Average Engagement Diff')

In [None]:
# Monthly totals of the two metrics. Selecting the columns before .sum() avoids
# summing (i.e. concatenating) the text columns of every row in each month.
monthly_total = df_engage.groupby(pd.Grouper(freq='M'))[['engagement_index', 'pct_access']].sum()
monthly_total.iplot(
    kind='line', secondary_y='pct_access',theme='white', yTitle='Total Engagement Index',
                secondary_y_title ='Access %', legend='bottom', title='Monthly Total Engagement')

- average engagement index and pct_access show direct relationship
- second half of 2020 has higher engagement 
    - COVID influence due to strict lockdowns and schools being shut down
- daily pattern can be observed for engagement_index and pct_access
- holidays have an impact on the metrics
*****
- do exam periods have an impact on the metric?

In [None]:
# Access percentage against engagement index, with a logarithmic y axis.
px.scatter(
    df_engage,
    x='pct_access',
    y='engagement_index',
    log_y=True,
)

- Logarithmic relationship between engagement index and access percentage, especially in the lower regions

## Feature Extraction for a district and product

In [None]:
"""col = 'engagement_index'
df_engage['q95'] = df_engage.groupby('lp_id')[col].transform(lambda x: x.quantile(.95))    
df_engage['q05'] = df_engage.groupby('lp_id')[col].transform(lambda x: x.quantile(.05))
df_engage = df_engage[(df_engage[col] > df_engage['q05']) & (df_engage[col] < df_engage['q95'])]
del df_engage['q95']
del df_engage['q05']"""

In [None]:
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
def engage_feature_extraction(address):
    """Summarise one engagement CSV into one feature row per product.

    Reads the file at ``address``, keeps the first record for every
    (lp_id, time) pair, and aggregates ``engagement_index`` and ``pct_access``
    per ``lp_id`` with sum, mean, median, std and max. Products with a missing
    value in any statistic are dropped. The result is inner-joined with the
    module-level ``df_product`` (on ``lp_id``) and ``df_district`` (on the
    district id parsed as an integer from the file name).

    Returns an empty ``pd.DataFrame()`` when no product survives the dropna.
    """
    raw = pd.read_csv(address)
    deduplicated = raw.groupby(['lp_id', 'time']).first().reset_index()

    statistics = [np.nansum, np.nanmean, np.nanmedian, np.std, np.nanmax]
    metrics = ['engagement_index', 'pct_access']
    features = deduplicated.groupby(['lp_id'])[metrics].agg(statistics)
    features = features.dropna(how='any', axis=0)
    if not len(features):
        return pd.DataFrame()

    # Flatten (metric, statistic) column pairs into "metric_statistic" names.
    features.columns = ['_'.join(pair) for pair in features.columns]

    product_lookup = df_product.set_index('lp_id', drop=True)
    features = features.join(product_lookup, how='inner').reset_index()

    # The district id is the file name without its extension.
    file_stem = os.path.splitext(os.path.basename(address))[0]
    features['district_id'] = int(file_stem)

    district_lookup = df_district.set_index('district_id', drop=True)
    features = features.set_index('district_id', drop=True).join(
        district_lookup, how='inner').reset_index()

    for id_column in ('district_id', 'lp_id'):
        features[id_column] = features[id_column].astype('category')
    return features

In [None]:
# Extract per-product features from every district file and stack them.
engagement_files = glob(os.path.join(base_address, 'engagement_data/*.csv' ))
per_district_features = [engage_feature_extraction(path) for path in tqdm(engagement_files)]
df_engage_features = pd.concat(per_district_features).reset_index(drop=True)

print('shape', df_engage_features.shape)
# Number of distinct products recorded for the row's district.
products_by_district = df_engage_features.groupby('district_id')['lp_id']
df_engage_features['n_products_per_district'] = products_by_district.transform('nunique')

In [None]:
# utilization of products in districts
df_product_utilization = df_engage_features.groupby(['lp_id'])['district_id'].nunique()
df_product_utilization.sort_values()[::-1].head(10)
df_product_utilization.iplot(kind='hist', bins=100, title='In how many districts a product is being used?',
                            )

- A few outlier products can be seen in the above distribution; what kind of products are these?

## District Level Relationships

In [None]:
# Number of distinct products recorded for each district.
products_by_district = df_engage_features.groupby(['district_id'])['lp_id']
df_product_utilization = products_by_district.nunique()

df_product_utilization.iplot(
    kind='hist',
    bins=100,
    title='Number of products being used in a district',
)
#df_product_utilization.sort_values()[::-1].head(10)

In [None]:
# Summary statistics of the extracted feature table.
feature_summary = df_engage_features.describe()
feature_summary

In [None]:
# One row per district: product count, mean of the per-product median metrics,
# the district features and the district's locale.
df_district_engage = df_engage_features.groupby(['district_id']).agg(
    {'lp_id':'nunique', 'engagement_index_nanmedian':'mean', 'pct_access_nanmedian': 'mean', 'pct_black/hispanic':'mean',
    'pct_free/reduced':'mean', 'pp_total_raw':'mean', 'locale': 'first'})
df_district_engage[district_columns] = df_district_engage[district_columns].round(1)

# Trim outliers in two sequential passes: drop districts at or above the 95th
# percentile first, then those at or below the 2nd percentile of what remains.
df_district_engage = df_district_engage[
    df_district_engage['engagement_index_nanmedian'] < df_district_engage['engagement_index_nanmedian'].quantile(.95)]
df_district_engage = df_district_engage[
    df_district_engage['engagement_index_nanmedian'] > df_district_engage['engagement_index_nanmedian'].quantile(.02)]

# `locale` is categorical; correlate the numeric columns only, since
# DataFrame.corr() no longer skips non-numeric columns on pandas >= 2.0.
df_district_engage.select_dtypes(include='number').corr()

In [None]:
# Mean district metrics per minority-share bucket. numeric_only=True skips the
# categorical `locale` column, which cannot be averaged.
df_district_engage.groupby('pct_black/hispanic').mean(numeric_only=True)

In [None]:
# Mean district metrics per free/reduced-lunch bucket. numeric_only=True skips
# the categorical `locale` column, which cannot be averaged.
df_district_engage.groupby('pct_free/reduced').mean(numeric_only=True)

- engagement is negatively correlated with minority and reduced prices and positively correlated with expenditure

In [None]:
# Median engagement against each district feature, points coloured by locale.
fig = make_subplots(rows=1, cols=3)

col1 = 'engagement_index_nanmedian'
for i, col2 in enumerate(district_columns):
    panel = i + 1
    # Keep only the districts where both plotted columns are present.
    complete_rows = df_district_engage.dropna(subset=[col1, col2])
    for data in px.scatter(complete_rows, x=col1, y=col2, color='locale')['data']:
        fig.add_trace(data, row=1, col=panel)
    # Axis titles are set once per panel (not once per trace), and are still
    # set when a panel ends up without any trace.
    fig.update_xaxes(title_text=col1, row=1, col=panel)
    fig.update_yaxes(title_text=col2, row=1, col=panel)
fig

In [None]:
# One parallel-coordinates figure per district feature, each paired with the
# median engagement index. (The subplot grid created here previously was
# overwritten on the first iteration and never used, so it is gone.)
col1 = 'engagement_index_nanmedian'
for col2 in district_columns:
    fig = px.parallel_coordinates(
        df_district_engage.dropna(subset=[col1, col2]),
        dimensions=[col1, col2])

    fig.show()

In [None]:
# Parallel coordinates over the district-level table, using plotly's default dimensions.
px.parallel_coordinates(
    df_district_engage,
    #dimensions = ['engagement_index_nanmedian', 'pct_black/hispanic', 'pct_access_nanmedian']
)

- Rural and suburbs show positive relationship with engagement and investment, too little data for city and town to make a decision
- Scatter plots are a bit inconclusive for the minority relationship, but tree plots confirm that minority population and engagement are inversely related, which is also confirmed by the correlation value

### Product Ranking

In [None]:
def get_ranking(data):
    """Return up to ten rows of ``data`` ordered from highest to lowest
    ``engagement_index_nanmedian``.

    The frame is sorted in ascending order and then reversed, and the first
    ten rows of the reversed frame are returned.
    """
    ascending = data.sort_values('engagement_index_nanmedian')
    descending = ascending.iloc[::-1]
    return descending.head(10)

# Scratch loop: it stops after the first iteration, leaving `i` and `group`
# bound to the first (district_id, lp_id) key and its rows -- presumably for
# ad-hoc inspection while developing the ranking below.
for i, group in df_engage_features.groupby(['district_id', 'lp_id']):
    break

#df_district_produt_ranking = pd.concat([get_ranking(group) 
                                        #])
    

In [None]:
def add_meta_data_to_rank(data, engage_features=None, products=None):
    """Attach mean engagement and product metadata to a table indexed by ``lp_id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table indexed by ``lp_id``.
    engage_features : pandas.DataFrame, optional
        Frame with ``lp_id`` and ``engagement_index_nanmedian`` columns.
        Defaults to the module-level ``df_engage_features``.
    products : pandas.DataFrame, optional
        Product table with an ``lp_id`` column. Defaults to the module-level
        ``df_product``.

    Returns
    -------
    pandas.DataFrame
        ``data`` left-joined with the per-product mean of
        ``engagement_index_nanmedian`` and then with the product columns.
        Rows and their order come from ``data``.
    """
    if engage_features is None:
        engage_features = df_engage_features
    if products is None:
        products = df_product

    # Join onto the `data` argument (not a global table), keep the result of
    # the first join, and return the enriched frame to the caller.
    mean_engagement = engage_features.groupby('lp_id')['engagement_index_nanmedian'].mean()
    enriched = data.join(mean_engagement, how='left')
    return enriched.join(products.set_index('lp_id', drop=True), how='left')

In [None]:
# Number of distinct districts in which each product appears.
districts_by_product = df_engage_features.groupby('lp_id')['district_id']
df_engage_features['used_in_district'] = districts_by_product.transform('nunique')

# Within every district, order products by median engagement (highest first)
# and number them 1..n.
list_df = [
    district_rows
    .sort_values('engagement_index_nanmedian', ascending=False)
    .assign(rank_engagement=lambda ordered: range(1, len(ordered) + 1))
    for _, district_rows in df_engage_features.groupby('district_id')
]
df_product_ranking = pd.concat(list_df).reset_index(drop=True).set_index('lp_id', drop=True)

#df_product_ranking = df_product_ranking.join(df_product.set_index('lp_id', drop=True), how='left').reset_index()

In [None]:
# Average within-district rank and median engagement per product, best rank first.
rank_columns = ['rank_engagement', 'engagement_index_nanmedian']
mean_rank_per_product = df_product_ranking.groupby('Product Name')[rank_columns].mean()
mean_rank_per_product.sort_values('rank_engagement').head(20)


In [None]:
# Ten highest-engagement products of every district. Sorting once and taking
# groupby(...).head(10) keeps `district_id` as an ordinary column. The earlier
# groupby(...).apply(...) form produced a frame where `district_id` was both
# an index level and a column, and it relies on the grouping column being
# passed to the applied function. The stable sort makes ties at the cut-off
# deterministic.
df_district_product_ranking = (
    df_engage_features
    .sort_values('engagement_index_nanmedian', ascending=False, kind='stable')
    .groupby('district_id')
    .head(10)
)

# In how many districts does each product reach the top ten? Most frequent first.
df_topk_products = df_district_product_ranking.groupby('lp_id')[['district_id']].count().sort_values(by='district_id')[::-1]
# Add the product's mean (over districts) median engagement and its catalogue metadata.
df_topk_products = df_topk_products.join(df_engage_features.groupby('lp_id')['engagement_index_nanmedian'].mean(),
                                         how='left')
df_topk_products = df_topk_products.join(df_product.set_index('lp_id', drop=True), how='left')
df_topk_products['rank'] = range(1, len(df_topk_products)+1)



In [None]:
from IPython.core.display import display, HTML
from IPython.display import display_html 

def display_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space
    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions, matched to ``dfs`` by position

    Every frame in ``dfs`` is rendered. Captions may repeat; a frame without a
    matching caption (``captions`` shorter than ``dfs``) is rendered without one.
    """
    html_parts = []
    # Pair by position instead of building a {caption: frame} dict, which
    # silently dropped tables whenever two captions were equal or when fewer
    # captions than frames were supplied.
    for position, df in enumerate(dfs):
        styler = df.style.set_table_attributes("style='display:inline'")
        if position < len(captions):
            styler = styler.set_caption(captions[position])
        html_parts.append(styler._repr_html_())
    display_html(''.join(html_parts), raw=True)

In [None]:
# For each product attribute: how many top-k products carry each value (top 10
# values), next to how many catalogue products carry it.
list_df = []
captions = []
for col in ['Provider/Company Name', 'Product Name', 'Sector(s)', 'Primary Essential Function', 'function']:
    list_df.append(df_topk_products.groupby(col)[['district_id']].count().sort_values('district_id')[::-1].head(10).join(
        df_product.groupby(col)['lp_id'].count(), how='left'))
    # One distinct caption per table. Reassigning `captions` here left a single
    # caption for five tables, so only one table was rendered.
    captions.append(f'{col} (topk={df_topk_products.shape[0]})')
display_side_by_side(list_df, captions)

In [None]:

# Per attribute value: count of top-k products divided by 118, for the ten
# most frequent values.
# NOTE(review): 118 is a hard-coded denominator whose origin is not visible in
# this notebook -- confirm what it stands for and derive it from the data.
for col in ['Provider/Company Name', 'Product Name', 'Sector(s)', 'Primary Essential Function', 'function']:
    value_counts = df_topk_products.groupby(col)['district_id'].count()
    top_share = round(value_counts.sort_values()[::-1].head(10)/118, 2)
    print('\n',f'Top 10 {col}', top_share.to_dict())



In [None]:
# Show the top-k product table (counts, mean engagement, product metadata, rank).
df_topk_products

In [None]:
# Show the per-district, per-product feature table.
df_engage_features

- Topk learning tools for each district and how many of them are shared based on voting
- topk among minority dominated regions and their average engagement index and compare them with non minority regions
- relationship between minority population and engagement index and the spending