In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as po
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.pyplot as plt
import plotly.express as px
import random
import plotly.figure_factory as ff

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration


In [1]:
# Read the product and district lookup tables from the Kaggle input directory.
INPUT_DIR = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning"
product_df = pd.read_csv(INPUT_DIR + "/products_info.csv")
district_df = pd.read_csv(INPUT_DIR + "/districts_info.csv")

# First rows of each table; the "\n" argument puts a blank line between them.
print(product_df.head())
print("\n", district_df.head())

# Shapes first, then column labels - products before districts each time.
for attribute in ("shape", "columns"):
    for frame in (product_df, district_df):
        print(getattr(frame, attribute))

In [1]:
# Build one engagement table from every CSV file in the engagement_data folder.
path = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data"

# glob returns files in arbitrary, filesystem-dependent order; sorting makes the
# row order of engagement_df (and therefore head()/tail() below) reproducible.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # pd.concat on an empty list fails with an unhelpful "No objects to
    # concatenate"; name the real problem instead.
    raise FileNotFoundError(f"No engagement CSV files found in {path!r}")

# NOTE(review): rows keep no trace of which file they came from. If the file
# name carries information (e.g. an id) it is lost here - TODO confirm.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

engagement_df = pd.concat(li, axis=0, ignore_index=True)

In [1]:
# Print the (rows, columns) shape of each table with a bold caption.
BOLD, RESET = '\033[1m', '\033[0m'  # ANSI escape codes: bold on / reset
for caption, frame in (
    ("Shape of the Engagement File ", engagement_df),
    ("Shape of the District file", district_df),
    ("Shape of the Product File", product_df),
):
    print(f"{BOLD}{caption}{RESET}", frame.shape)

In [1]:
# Styled preview of the first ten engagement rows (green text on black cells).
engagement_cell_css = {
    'background-color': 'black',
    'color': 'lawngreen',
    'border': '1.5px  solid white',
}
engagement_preview = engagement_df.head(10).style.set_caption("Engagement Dataframe")
engagement_preview.set_properties(**engagement_cell_css)

In [1]:
# Styled preview of the first ten district rows (green text on black cells).
district_cell_css = {
    'background-color': 'black',
    'color': 'lawngreen',
    'border': '1.5px  solid white',
}
district_preview = district_df.head(10).style.set_caption("District Dataframe")
district_preview.set_properties(**district_cell_css)

In [1]:
# Styled preview of the first ten product rows (green text on black cells).
product_cell_css = {
    'background-color': 'black',
    'color': 'lawngreen',
    'border': '1.5px  solid white',
}
product_preview = product_df.head(10).style.set_caption("Product Dataframe")
product_preview.set_properties(**product_cell_css)

In [1]:
# Print the dtype of every column, one table at a time, under a bold caption.
BOLD, RESET = '\033[1m', '\033[0m'  # ANSI escape codes: bold on / reset
for table_name, frame in (
    ("district", district_df),
    ("product", product_df),
    ("engagement", engagement_df),
):
    print(f"{BOLD}Data types of each column in {table_name} data file\n{RESET}", frame.dtypes)

# Missing Value Treatment

In [1]:
# Missing-value report: first a per-column yes/no flag for each table, then the
# per-column count of missing cells for each table.
BOLD, RESET = '\033[1m', '\033[0m'  # ANSI escape codes: bold on / reset
tables = (
    ("district", district_df),
    ("product", product_df),
    ("engagement", engagement_df),
)
for table_name, frame in tables:
    print(f"{BOLD}Missing value present in each column of {table_name} data file\n{RESET}", frame.isna().any())
for table_name, frame in tables:
    print(f"{BOLD}Missing value count in each column of {table_name} data file\n{RESET}", frame.isna().sum())

# Visualisation 

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
print(msno.heatmap(district_df,figsize=(10,5)))
print(msno.heatmap(product_df,figsize=(10,5)))
print(msno.heatmap(engagement_df,figsize=(10,5)))


In [1]:
# Keep only districts whose 'state' is known, then renumber the rows from 0.
district_df = district_df.dropna(subset=['state']).reset_index(drop=True)


In [1]:
# Missing cells per column that remain after the 'state' filter.
district_df.isnull().sum()


In [1]:
# Plotly table showing the first five district rows.
first_district_rows = district_df.head(5)
fig = ff.create_table(first_district_rows, height_constant=50)
# Very wide canvas so all columns fit side by side.
fig.update_layout(width=3500, height=400)
fig.show()

In [1]:
# Plotly table showing the last five district rows, with custom colours and
# a larger font.
colorscale = [[0, 'red'], [.5, '#DCE775'], [1, '#C0CA33']]
font = ['white', '#212121', 'red']
fig = ff.create_table(
    district_df.tail(5),
    height_constant=50,
    colorscale=colorscale,
    font_colors=font,
)
# The table text is stored as layout annotations; enlarge each one.
for cell_annotation in fig.layout.annotations:
    cell_annotation.font.size = 17
fig.update_layout(width=4500, height=400)
fig.show()

In [1]:
# Horizontal bars: number of districts per state, most frequent state first.
state_order = district_df['state'].value_counts().index
plt.figure(figsize=(10, 12))
sns.countplot(data=district_df, y='state', order=state_order)
plt.show()

In [1]:
# Pie chart: share of districts per state.
# Labels come from the value_counts() index instead of a hand-typed list, so
# each slice always carries the name of the state whose count it shows. A
# hard-coded list is paired with the counts purely by position and mislabels
# slices as soon as the data, the filter above, or the tie order changes.
state_counts = district_df["state"].value_counts()
status = state_counts.index.tolist()
data = go.Pie(
    values=state_counts.values,
    labels=status,
)
layout = go.Layout(
    title=dict(text="State", x=0.46, y=0.95, font_size=20)
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [1]:
# Bars: number of districts per locale, most frequent locale first.
locale_order = district_df['locale'].value_counts().index
plt.figure(figsize=(10, 12))
sns.countplot(data=district_df, x='locale', order=locale_order)
plt.show()

In [1]:
# Pie chart: share of districts per locale.
# Fixes: the counts were taken from the 'state' column (copy-paste slip), so
# the chart paired state counts with locale names; the title also said "State".
# Labels are read from the counts' own index so they cannot drift out of sync.
locale_counts = district_df["locale"].value_counts()
status = locale_counts.index.tolist()
colors = ['#8BC34A','#D4E157','#FFB300','#FF7043']
data = go.Pie(
    values=locale_counts.values,
    labels=status,
    marker=dict(colors=colors),
    textinfo='label+value+percent'
)
layout = go.Layout(
    title=dict(text="Locale", x=0.46, y=0.95, font_size=20)
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [1]:
# Bars: number of districts per 'pct_black/hispanic' bin, most frequent first.
black_hispanic_order = district_df['pct_black/hispanic'].value_counts().index
plt.figure(figsize=(10, 12))
sns.countplot(data=district_df, x='pct_black/hispanic', order=black_hispanic_order)
plt.show()

In [1]:
# Pie chart: share of districts per 'pct_black/hispanic' bin.
# Fixes: value_counts() returns bins in descending-frequency order while the
# hand-typed labels were in ascending bin order, so slices could be mislabelled;
# labels now come from the counts' index. The title said "State".
black_hispanic_counts = district_df["pct_black/hispanic"].value_counts()
status = black_hispanic_counts.index.tolist()
colors = ['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd']
data = go.Pie(
    values=black_hispanic_counts.values,
    labels=status,
    marker=dict(colors=colors),
    textinfo='label+value+percent'
)
layout = go.Layout(
    title=dict(text="pct_black/hispanic", x=0.46, y=0.95, font_size=20)
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [1]:
# Bars: number of districts per 'pct_free/reduced' bin, most frequent first.
free_reduced_order = district_df['pct_free/reduced'].value_counts().index
plt.figure(figsize=(10, 12))
sns.countplot(data=district_df, x='pct_free/reduced', order=free_reduced_order)
plt.show()

In [1]:
# Pie chart: share of districts per 'pct_free/reduced' bin.
# Fixes: the counts were taken from 'pct_black/hispanic' (copy-paste slip), so
# this chart duplicated the previous one; the title said "State"; labels are
# now read from the counts' index rather than typed by hand.
free_reduced_counts = district_df["pct_free/reduced"].value_counts()
status = free_reduced_counts.index.tolist()
colors = ['#8c564b','#e377c2','#7f7f7f','#bcbd22','#17becf']
data = go.Pie(
    values=free_reduced_counts.values,
    labels=status,
    marker=dict(colors=colors),
    textinfo='label+value+percent'
)
layout = go.Layout(
    title=dict(text="pct_free/reduced", x=0.46, y=0.95, font_size=20)
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [1]:
# Pie chart: share of districts per 'county_connections_ratio' bin.
# Labels are read from the counts' index (a typed list is matched to the counts
# only by position and need not equal the strings stored in the data), and the
# title now names the plotted column instead of "State".
connection_counts = district_df["county_connections_ratio"].value_counts()
status = connection_counts.index.tolist()
colors = ['#17becf', '#E1396C']
data = go.Pie(
    values=connection_counts.values,
    labels=status,
    marker=dict(colors=colors),
    textinfo='label+value+percent'
)
layout = go.Layout(
    title=dict(text="county_connections_ratio", x=0.46, y=0.95, font_size=20)
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [1]:
# How many districts fall into each 'pp_total_raw' bin, most frequent first.
district_df.loc[:, "pp_total_raw"].value_counts()

In [1]:
# Bars: number of districts per 'pp_total_raw' bin, most frequent first.
pp_total_order = district_df['pp_total_raw'].value_counts().index
plt.figure(figsize=(10, 12))
sns.countplot(data=district_df, x='pp_total_raw', order=pp_total_order)
# Bin labels are long; turn them vertical so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [1]:
# Pie chart: share of districts per 'pp_total_raw' bin.
# Fixes: the label list was typed by hand in an assumed frequency order and is
# matched to the counts only by position; labels now come from the counts'
# index. The title said "State".
pp_total_counts = district_df["pp_total_raw"].value_counts()
status = pp_total_counts.index.tolist()
colors = ['#8c564b','#e377c2','#7f7f7f','#bcbd22','#17becf','#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#E1396C']
data = go.Pie(
    values=pp_total_counts.values,
    labels=status,
    marker=dict(colors=colors),
    textinfo='label+value+percent'
)
layout = go.Layout(
    title=dict(text="pp_total_raw", x=0.46, y=0.95, font_size=20)
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [1]:
# Draw the whole district table as one heatmap: every distinct cell value is
# replaced by an integer code and the codes are coloured with a discrete palette.
plt.figure(figsize=(20,40))
# Map each distinct value found anywhere in the frame to its own integer code.
# NOTE(review): missing values (NaN) are part of the unique values too - confirm
# that replace() handles the NaN key as intended.
value_to_int = {j:i for i,j in enumerate(pd.unique(district_df.values.ravel()))}
n = len(value_to_int)     
# discrete colormap (n samples from a given cmap)
cmap = sns.color_palette("Pastel2", n) 
ax = sns.heatmap(district_df.replace(value_to_int), cmap=cmap) 
# modify colorbar: split its range into n equal bands, put one tick at the
# centre of each band and label the ticks with the original values
colorbar = ax.collections[0].colorbar 
r = colorbar.vmax - colorbar.vmin 
colorbar.set_ticks([colorbar.vmin + r / n * (0.5 + i) for i in range(n)])
colorbar.set_ticklabels(list(value_to_int.keys()))                                          
plt.show()

In [1]:
# Bar chart: number of districts (non-null district_id) per state.
states = district_df.groupby(by ='state').count()[['district_id']]
# The theme is applied BEFORE the figure/axes are created. rc values such as
# axes.facecolor are read when an axes is created, so setting them after
# plt.title() (which creates the axes) left this chart unstyled and only
# affected the charts in later cells.
sns.set(rc={"axes.facecolor":"#283747", "axes.grid":False,'xtick.labelsize':14,'ytick.labelsize':14})
plt.figure(figsize=(15,10))
plt.title("States with most Districts Mentioned")
plt.ylabel('No of districts')
plt.xlabel('States')
sns.barplot(x=states.index,y=list(states['district_id']))
plt.xticks(rotation=90)
# Render explicitly so the cell does not echo the return value of plt.xticks.
plt.show()

In [1]:
# Bar chart: number of districts (non-null district_id) per locale.
locale = district_df.groupby(by='locale').count()[['district_id']]
district_totals = locale['district_id'].tolist()

plt.figure(figsize=(15, 10))
locale_axes = plt.gca()
locale_axes.set_title("Locale with most Districts Mentioned")
locale_axes.set_ylabel('No of districts')
locale_axes.set_xlabel('No. of locale mentioned')
sns.set(rc={"axes.facecolor": "#283747", "axes.grid": False, 'xtick.labelsize': 14, 'ytick.labelsize': 14})
sns.barplot(x=locale.index, y=district_totals)
plt.xticks(rotation=90)

In [1]:
# Bar chart: number of districts (non-null district_id) per
# 'pct_black/hispanic' bin.
black_hispanic = district_df.groupby(by='pct_black/hispanic').count()[['district_id']]
district_totals = black_hispanic['district_id'].tolist()

plt.figure(figsize=(15, 10))
black_hispanic_axes = plt.gca()
black_hispanic_axes.set_title("pct_black/hispanic with most Districts Mentioned")
black_hispanic_axes.set_ylabel('No of districts')
black_hispanic_axes.set_xlabel('No. of pct_black/hispanic mentioned')
sns.set(rc={"axes.facecolor": "#283747", "axes.grid": False, 'xtick.labelsize': 14, 'ytick.labelsize': 14})
sns.barplot(x=black_hispanic.index, y=district_totals)
plt.xticks(rotation=90)

In [1]:
# Bar chart: number of districts (non-null district_id) per
# 'pct_free/reduced' bin.
free_reduced=district_df.groupby(by ='pct_free/reduced').count()[['district_id']]
plt.figure(figsize=(15,10))
plt.title("pct_free/reduced with most Districts Mentioned")
plt.ylabel('No of districts')
# Fix: the x-axis label was copied from the pct_black/hispanic chart and named
# the wrong column.
plt.xlabel('No. of pct_free/reduced mentioned')
sns.set(rc={"axes.facecolor":"#283747", "axes.grid":False,'xtick.labelsize':14,'ytick.labelsize':14})
sns.barplot(x=free_reduced.index,y=list(free_reduced['district_id']))
plt.xticks(rotation=90)

In [1]:
# Bar chart: number of districts (non-null district_id) per
# 'county_connections_ratio' bin.
county_connection = district_df.groupby(by='county_connections_ratio').count()[['district_id']]
district_totals = county_connection['district_id'].tolist()

plt.figure(figsize=(10, 10))
connection_axes = plt.gca()
connection_axes.set_title("county_connections_ratio with most Districts Mentioned")
connection_axes.set_ylabel('No of districts')
connection_axes.set_xlabel('No. of county_connections_ratio mentioned')
sns.set(rc={"axes.facecolor": "#283747", "axes.grid": False, 'xtick.labelsize': 14, 'ytick.labelsize': 14})
sns.barplot(x=county_connection.index, y=district_totals)
plt.xticks(rotation=90)

In [1]:
# Swarm plot: one point per district, state on x, 'pct_black/hispanic' bin on y,
# coloured by locale.
plt.figure(figsize=(10, 10))
label_sizes = {'xtick.labelsize': 12, 'ytick.labelsize': 12, 'axes.labelsize': 12}
sns.set(rc=label_sizes)
sns.swarmplot(data=district_df, x="state", y="pct_black/hispanic", hue="locale")
# State names are long; turn them vertical.
plt.xticks(rotation=90)

In [1]:
# Histogram of districts per state, split by locale.
# displot is a figure-level function and always creates its own figure, so the
# former plt.figure(figsize=(10,10)) call only produced an extra empty figure
# in the output; the size is controlled by height/aspect instead.
sns.displot(data=district_df, x='state', hue= 'locale', height=8, aspect=3)
# The displot axes are the current axes here, so this rotates their labels.
plt.xticks(rotation=90)
plt.show()

In [1]:
# Turn the text column 'pct_black/hispanic' into numbers.
# The cells are split at the first comma into two parts and the '[' characters
# are stripped from each part.
# NOTE(review): the values look like half-open intervals such as '[0.2, 0.4[',
# in which case 'pct_black' holds the interval's lower bound and 'pct_hispanic'
# its upper bound (not two separate percentages), and
# 'pct_black_and_hispanic' is the interval midpoint. The column names are kept
# because later cells refer to them - confirm and consider renaming.
pct_black_hispanic = district_df['pct_black/hispanic'].str.split(",",n=1,expand=True)

# regex=False: '[' alone is not a valid regular expression. pandas < 2.0
# silently treated one-character patterns as literals even with regex=True;
# pandas >= 2.0 compiles them and raises re.error. A literal replace behaves
# the same on every version.
district_df['pct_black'] = pct_black_hispanic[0].str.replace('[','',regex=False)
district_df['pct_hispanic'] = pct_black_hispanic[1].str.replace('[','',regex=False)

# Convert the cleaned strings to numbers.
district_df['pct_black'] = pd.to_numeric(district_df['pct_black'])
district_df['pct_hispanic'] = pd.to_numeric(district_df['pct_hispanic'])

# Mean of the two parsed numbers.
district_df['pct_black_and_hispanic'] = (district_df['pct_black'] + district_df['pct_hispanic'])/2

In [1]:
# Stacked KDE of 'pct_black_and_hispanic', one layer per locale.
# height 8.27 with aspect 11.7/8.27 yields an 11.7 x 8.27 inch facet
# (presumably chosen as A4 landscape).
kde_options = dict(kind='kde', multiple="stack", height=8.27, aspect=11.7/8.27)
sns.displot(data=district_df, x='pct_black_and_hispanic', hue='locale', **kde_options)

In [1]:
# Histogram of 'pct_black_and_hispanic', coloured by state.
# height 8.27 with aspect 11.7/8.27 yields an 11.7 x 8.27 inch facet.
facet_size = dict(height=8.27, aspect=11.7/8.27)
sns.displot(data=district_df, x="pct_black_and_hispanic", hue='state', **facet_size)

In [1]:
# Count of districts per 'pct_hispanic' value, split by locale.
plt.figure(figsize=(10,10))
# Columns are passed by name with data=...: in seaborn >= 0.12 the first
# positional argument of countplot is `data`, so passing the Series
# positionally is no longer interpreted as x.
sns.countplot(x='pct_hispanic', hue='locale', data=district_df)
# Put the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.01, 1),
           borderaxespad=0)
plt.show()

In [1]:
# Count of districts per 'pct_black' value, split by locale.
plt.figure(figsize=(10,10))
# Columns are passed by name with data=...: in seaborn >= 0.12 the first
# positional argument of countplot is `data`, so passing the Series
# positionally is no longer interpreted as x.
sns.countplot(x='pct_black', hue='locale', data=district_df)
# Put the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.01, 1),
           borderaxespad=0)
plt.show()

In [1]:
# Count of districts per 'pct_hispanic' value, one bar colour per state.
plt.figure(figsize=(20, 15))
ax = sns.countplot(data=district_df, x="pct_hispanic", hue="state")
ax.set_title("Number of pct_hispanic based on state", fontsize=20)
ax.set_xlabel("pct_hispanic", fontsize=17)
ax.set_ylabel("count", fontsize=17)
# Legend goes just outside the right edge of the axes.
legend_placement = dict(bbox_to_anchor=(1.01, 1), borderaxespad=0)
plt.legend(**legend_placement)
plt.show()

In [1]:
# Count of districts per 'pct_black' value, one bar colour per state.
plt.figure(figsize=(20, 15))
ax = sns.countplot(data=district_df, x="pct_black", hue="state")
ax.set_title("Number of pct_black based on state", fontsize=20)
ax.set_xlabel("pct_black", fontsize=17)
ax.set_ylabel("count", fontsize=17)
# Legend goes just outside the right edge of the axes.
legend_placement = dict(bbox_to_anchor=(1.01, 1), borderaxespad=0)
plt.legend(**legend_placement)
plt.show()

In [1]:
# Turn the text column 'pct_free/reduced' into numbers.
# The cells are split at the first comma into two parts and the '[' characters
# are stripped from each part.
# NOTE(review): the values look like half-open intervals such as '[0.2, 0.4[',
# in which case 'pct_free' is the interval's lower bound and 'pct_reduced' its
# upper bound (not two separate percentages). The column names are kept because
# later cells refer to them - confirm and consider renaming.
pct_free_reduced = district_df['pct_free/reduced'].str.split(",",n=1,expand=True)

# regex=False: '[' alone is not a valid regular expression. pandas < 2.0
# silently treated one-character patterns as literals even with regex=True;
# pandas >= 2.0 compiles them and raises re.error.
district_df['pct_free'] = pct_free_reduced[0].str.replace('[','',regex=False)
district_df['pct_reduced'] = pct_free_reduced[1].str.replace('[','',regex=False)

# Convert the cleaned strings to numbers.
district_df['pct_free'] = pd.to_numeric(district_df['pct_free'])
district_df['pct_reduced'] = pd.to_numeric(district_df['pct_reduced'])

# Fill missing values with the column median. The result is assigned back
# instead of using fillna(..., inplace=True) on a column selection: that form
# is chained assignment, which warns in recent pandas and does not update the
# frame under copy-on-write.
district_df['pct_free'] = district_df['pct_free'].fillna(district_df['pct_free'].median())
district_df['pct_reduced'] = district_df['pct_reduced'].fillna(district_df['pct_reduced'].median())

# Mean of the two parsed numbers.
district_df['pct_free_and_reduced'] = (district_df['pct_free'] + district_df['pct_reduced'])/2

In [1]:
# Stacked KDE of 'pct_free_and_reduced', one layer per locale.
# height 8.27 with aspect 11.7/8.27 yields an 11.7 x 8.27 inch facet
# (presumably chosen as A4 landscape).
kde_options = dict(kind='kde', multiple="stack", height=8.27, aspect=11.7/8.27)
sns.displot(data=district_df, x='pct_free_and_reduced', hue='locale', **kde_options)

In [1]:
# Histogram of 'pct_free_and_reduced', coloured by state.
# height 8.27 with aspect 11.7/8.27 yields an 11.7 x 8.27 inch facet.
facet_size = dict(height=8.27, aspect=11.7/8.27)
sns.displot(data=district_df, x="pct_free_and_reduced", hue='state', **facet_size)

In [1]:
# Count of districts per 'pct_free' value, split by locale.
plt.figure(figsize=(10,10))
# Columns are passed by name with data=...: in seaborn >= 0.12 the first
# positional argument of countplot is `data`, so passing the Series
# positionally is no longer interpreted as x.
sns.countplot(x='pct_free', hue='locale', data=district_df)
# Put the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.01, 1),
           borderaxespad=0)
plt.show()

In [1]:
# Count of districts per 'pct_reduced' value, split by locale.
plt.figure(figsize=(10,10))
# Columns are passed by name with data=...: in seaborn >= 0.12 the first
# positional argument of countplot is `data`, so passing the Series
# positionally is no longer interpreted as x.
sns.countplot(x='pct_reduced', hue='locale', data=district_df)
# Put the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.01, 1),
           borderaxespad=0)
plt.show()

In [1]:
# Count of districts per 'pct_free' value, one bar colour per state.
plt.figure(figsize=(20, 15))
ax = sns.countplot(data=district_df, x="pct_free", hue="state")
ax.set_title("Number of pct_free based on state", fontsize=20)
ax.set_xlabel("pct_free", fontsize=17)
ax.set_ylabel("count", fontsize=17)
# Legend goes just outside the right edge of the axes.
legend_placement = dict(bbox_to_anchor=(1.01, 1), borderaxespad=0)
plt.legend(**legend_placement)
plt.show()

In [1]:
# Count of districts per 'pct_reduced' value, one bar colour per state.
plt.figure(figsize=(20, 15))
ax = sns.countplot(data=district_df, x="pct_reduced", hue="state")
ax.set_title("Number of pct_reduced based on state", fontsize=20)
ax.set_xlabel("pct_reduced", fontsize=17)
ax.set_ylabel("count", fontsize=17)
# Legend goes just outside the right edge of the axes.
legend_placement = dict(bbox_to_anchor=(1.01, 1), borderaxespad=0)
plt.legend(**legend_placement)
plt.show()

In [1]:
# Plotly table showing the first five engagement rows.
first_engagement_rows = engagement_df.head(5)
fig = ff.create_table(first_engagement_rows, height_constant=50)
fig.update_layout(width=1000, height=400)
fig.show()

In [1]:
# Plotly table showing the last five engagement rows with a custom colour scale.
colorscale = [[0, 'magenta'], [.5, '#DCE775'], [1, '#C0CA33']]
last_engagement_rows = engagement_df.tail(5)
fig = ff.create_table(
    last_engagement_rows,
    height_constant=50,
    colorscale=colorscale,
)
fig.update_layout(width=1000, height=400)
fig.show()