# 190030265 Kalyan
‘LearnPlatform’ COVID-19 Impact on Digital Learning

In [None]:
# Import Library 
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import glob

In [None]:
# Load the two reference tables: districts_info.csv and products_info.csv.
dt_district = pd.read_csv(
    '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
)
dt_product = pd.read_csv(
    '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
)

In [None]:
# Read every engagement CSV and stack them into a single frame.
# Each file's rows are tagged with a `district_id` column taken from the file
# name, so they can later be joined to the district table.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data' 
all_files = glob.glob(path + "/*.csv")

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when the directory is missing or empty.
if not all_files:
    raise FileNotFoundError(f"No engagement CSV files found under {path!r}")

li = []

for filename in all_files:
    # lp_id is kept as text at load time; it is converted to int in a later cell.
    df = pd.read_csv(filename, index_col=None, header=0, dtype={'lp_id': str})
    # Use the file's base name rather than a fixed position in the "/"-split
    # path, which breaks as soon as the directory depth or separator changes.
    district_id = os.path.basename(filename).split(".")[0]
    df["district_id"] = district_id
    li.append(df)

# ignore_index=True yields a fresh 0..n-1 index across all files.
dt_engagement = pd.concat(li, ignore_index=True)
dt_engagement.head()

In [None]:
# Preview the first rows of the district table.
dt_district.head()

In [None]:
# Print a summary of the district table (columns, dtypes, non-null counts).
dt_district.info()

In [None]:
# Count the rows in the district table and report the total.
index = dt_district.index
record_total = index.size
print("Total Record in disctrict_info", record_total)

In [None]:
# Descriptive statistics for every column (numeric and non-numeric alike).
dt_district.describe(include='all')

In [None]:
# Number of missing values in each column of the district table.
dt_district.isna().sum()

In [None]:
# Find the rows in which more than five columns are missing and show them.
missing_per_row = dt_district.isnull().sum(axis=1)
null_data = missing_per_row[missing_per_row > 5].index
dt_district.loc[null_data]

In [None]:
# Report how many rows were selected into null_data
# (rows with more than five missing columns).
print('Total Data that Contains Missing Value from district_info = ', len(null_data))

In [None]:
# Drop the rows that carry no district information, i.e. the rows in which
# more than five columns are missing (the same criterion used to build
# null_data). The previous dropna(thresh=6) kept only rows with at least six
# non-null values, which also removed rows that were merely missing a couple
# of columns -- more than the stated intent of dropping the all-missing rows.
rows_without_info = dt_district.isnull().sum(axis=1) > 5
dt_district.drop(index=dt_district[rows_without_info].index, inplace=True)

In [None]:
# Re-check the per-column missing-value counts after the drop.
dt_district.isna().sum()

In [None]:
# Preview the district table after the rows with missing values were dropped.
dt_district.head()

In [None]:
# Keep only the first occurrence of each fully duplicated row.
dt_district = dt_district[~dt_district.duplicated()]

In [None]:
# Number of rows per state in the district table.
dt_district['state'].value_counts()

In [None]:
# Horizontal bar chart of the number of rows per state.
plt.figure(figsize=(10, 10))
dt_district['state'].value_counts().plot.barh()
plt.title('Count Distribution for State')
plt.xlabel('count')
plt.ylabel('State')

In [None]:
# Pie chart of the ten most frequent states; the percentages are relative to
# those ten states only, not to the whole table.
dt_district["state"].value_counts().head(10).plot.pie(autopct='%1.1f%%', figsize=(10, 10)).legend()
plt.title('Pie Chart State')

In [None]:
# Number of rows per locale in the district table.
dt_district['locale'].value_counts()

In [None]:
# Horizontal bar chart of the number of rows per locale.
plt.figure(figsize=(10, 10))
dt_district['locale'].value_counts().plot.barh()
plt.title('Count of Locale')
plt.xlabel('Count')
plt.ylabel('Locale')

In [None]:
# Pie chart of the share of each locale in the district table.
# The title is set before plotting, as in the original cell order.
plt.title('Plot Percentage Distribution Locale')
dt_district['locale'].value_counts().plot.pie(autopct='%3.1f%%', figsize=(10, 10)).legend()

In [None]:
# Preview the first rows of the product table.
dt_product.head()

In [None]:
# Print a summary of the product table (columns, dtypes, non-null counts).
dt_product.info()

In [None]:
# Count the rows in the product table and report the total.
index = dt_product.index
record_total = index.size
print("Total Record in product info", record_total)

In [None]:
# Descriptive statistics for every column (numeric and non-numeric alike).
dt_product.describe(include='all')

In [None]:
# Number of missing values in each column of the product table.
dt_product.isna().sum()

In [None]:
# Keep only the first occurrence of each fully duplicated row.
dt_product = dt_product[~dt_product.duplicated()]

In [None]:
# Pie chart of the ten most frequent "Sector(s)" values in the product table.
# The title previously named the district table, but the data plotted here
# comes from dt_product.
plt.title('Distribution of Sector(s) in the Product Information Data')
dt_product["Sector(s)"].value_counts().head(10).plot(kind = 'pie', autopct='%1.1f%%', figsize=(10, 10)).legend()

In [None]:
# Preview the first rows of the combined engagement table.
dt_engagement.head()

In [None]:
# Print a summary of the engagement table (columns, dtypes, non-null counts).
dt_engagement.info()

In [None]:
# Discard rows that have no lp_id, then replace every remaining missing value
# (in any column) with 0.0.
dt_engagement = dt_engagement.dropna(subset=['lp_id']).fillna(0.0)

In [None]:
# Convert lp_id and district_id to int so they can be used as merge keys
# against the product and district tables.
# lp_id was loaded as text; going through pd.to_numeric first means a
# decimal-formatted id such as "93690.0" is parsed instead of making
# astype(int) raise ValueError.
dt_engagement["lp_id"] = pd.to_numeric(dt_engagement["lp_id"]).astype(int)
dt_engagement["district_id"] = dt_engagement["district_id"].astype(int)
# Rename the product key column so both tables share the name 'lp_id'.
dt_product.rename(columns = {'LP ID': 'lp_id'}, inplace = True)

In [None]:
# Join engagement rows to their district, then to their product.
# Both merges use the default inner join, so rows without a matching
# district_id or lp_id are not carried over.
dt_explore = (
    dt_engagement
    .merge(dt_district, on="district_id")
    .merge(dt_product, on="lp_id")
)
dt_explore

In [None]:
# Print a summary of the merged table (columns, dtypes, non-null counts).
dt_explore.info()

In [None]:
# Parse the 'time' column into datetimes so it can be used with time-based
# grouping (pd.Grouper with a frequency) in the cells that follow.
dt_explore['time']= pd.to_datetime(dt_explore['time'])

In [None]:
# Daily mean of the numeric columns, indexed by day.
# numeric_only=True is required because dt_explore also holds text columns
# (state, locale, product names, ...); without it pandas >= 2.0 raises
# TypeError instead of silently skipping them.
df_groupbydays = (
    dt_explore.set_index('time')
    .groupby(pd.Grouper(freq='D'))
    .mean(numeric_only=True)
)

In [None]:
# Display the table of daily means.
df_groupbydays

In [None]:
# Use a wide default figure for the seaborn plots, then draw the daily mean of
# pct_access over time ("time" is the index of df_groupbydays).
sns.set(rc={'figure.figsize': (15, 8)})
sns.lineplot(data=df_groupbydays, x="time", y="pct_access", marker='o')

In [None]:
# Daily mean of engagement_index over time.
sns.lineplot(data=df_groupbydays, x="time", y="engagement_index", marker='o')

In [None]:
# Monthly mean of the numeric columns, grouped on the 'time' column.
# numeric_only=True is required because dt_explore also holds text columns;
# without it pandas >= 2.0 raises TypeError. The redundant axis=0 argument of
# pd.Grouper (its default, and deprecated in recent pandas) is dropped.
# NOTE(review): newer pandas releases prefer the alias 'ME' over 'M' for
# month-end; 'M' is kept so the cell still runs on older versions.
dt_month = dt_explore.groupby(pd.Grouper(key='time', freq='M')).mean(numeric_only=True)

In [None]:
# Monthly mean of engagement_index over time.
sns.lineplot(data=dt_month, x="time", y="engagement_index", marker='o')

In [None]:
# Monthly mean of pct_access over time.
sns.lineplot(data=dt_month, x="time", y="pct_access", marker='o')

In [None]:
# Preview the first rows of the merged table.
dt_explore.head()

In [None]:
# Rank products by their mean engagement_index and chart the top fifteen.
best_product = (
    dt_explore.groupby('Product Name', as_index=False)['engagement_index']
    .mean()
    .sort_values('engagement_index', ascending=False)
)
sns.barplot(data=best_product.head(15), x='engagement_index', y='Product Name')
plt.xlabel('Engagement Index')
plt.ylabel('Product Name')
plt.title('Best 15 Product Name That Used')

In [None]:
# Rank the "Sector(s)" values by their mean engagement_index and chart them all.
best_sectors = (
    dt_explore.groupby('Sector(s)', as_index=False)['engagement_index']
    .mean()
    .sort_values('engagement_index', ascending=False)
)
sns.barplot(data=best_sectors, x='engagement_index', y='Sector(s)')
plt.xlabel('Engagement Index')
plt.ylabel('Sectors')
plt.title('Top Sectors')

In [None]:
# Rank providers by their mean engagement_index and chart the top five.
best_company = (
    dt_explore.groupby('Provider/Company Name', as_index=False)['engagement_index']
    .mean()
    .sort_values('engagement_index', ascending=False)
)
sns.barplot(data=best_company.head(5), x='engagement_index', y='Provider/Company Name')
plt.xlabel('Engagement Index')
plt.ylabel('Provider/Company Name')
plt.title('Best 5 Provider/Company')

In [None]:
# Rank primary essential functions by mean engagement_index; chart the top ten.
best_primary = (
    dt_explore.groupby('Primary Essential Function', as_index=False)['engagement_index']
    .mean()
    .sort_values('engagement_index', ascending=False)
)
sns.barplot(data=best_primary.head(10), x='engagement_index', y='Primary Essential Function')
plt.xlabel('Engagement Index')
plt.ylabel('Primary Essential Function')
plt.title('Best 10 Primary Essential Function')

In [None]:
# Mean engagement_index per state, sorted from highest to lowest.
dt_state = (
    dt_explore.groupby('state', as_index=False)['engagement_index']
    .mean()
    .sort_values('engagement_index', ascending=False)
)
sns.barplot(data=dt_state, x='engagement_index', y='state')
plt.xlabel('Engagement Index')
plt.ylabel('State')
plt.title('Engagement Index based on State')

In [None]:
# Mean engagement_index per locale, sorted from highest to lowest.
dt_locale = (
    dt_explore.groupby('locale', as_index=False)['engagement_index']
    .mean()
    .sort_values('engagement_index', ascending=False)
)
sns.barplot(data=dt_locale, x='engagement_index', y='locale')
plt.xlabel('Engagement Index')
plt.ylabel('Locale')
plt.title('Engagement Index based on Locale')

In [None]:
# Rows of the merged table whose locale is 'Rural'.
dt_rural = dt_explore.loc[dt_explore["locale"] == 'Rural']
dt_rural.head()

In [None]:
# Mean engagement_index per (locale, time) pair for the rural rows.
rural_engagement = (
    dt_rural.groupby(["locale", "time"], as_index=False)["engagement_index"]
    .mean()
    .reset_index(drop=True)
)
rural_engagement.head()

In [None]:
# Mean engagement_index over time for rural districts.
sns.lineplot(data=rural_engagement, x='time', y='engagement_index')
plt.title('Engagement Index based from Rural')

In [None]:
# Rows of the merged table whose locale is 'Suburb'.
dt_suburb = dt_explore.loc[dt_explore["locale"] == 'Suburb']
dt_suburb.head()

In [None]:
# Mean engagement_index per (locale, time) pair for the suburb rows.
suburb_engagement = (
    dt_suburb.groupby(["locale", "time"], as_index=False)["engagement_index"]
    .mean()
    .reset_index(drop=True)
)
suburb_engagement.head()

In [None]:
# Mean engagement_index over time for suburb districts.
sns.lineplot(data=suburb_engagement, x='time', y='engagement_index')
plt.title('Engagement Index based from Suburb')

In [None]:
# Rows of the merged table whose locale is 'Town'.
dt_town = dt_explore.loc[dt_explore["locale"] == 'Town']
dt_town.head()

In [None]:
# Mean engagement_index per (locale, time) pair for the town rows.
town_engagement = (
    dt_town.groupby(["locale", "time"], as_index=False)["engagement_index"]
    .mean()
    .reset_index(drop=True)
)
town_engagement.head()

In [None]:
# Mean engagement_index over time for town districts.
sns.lineplot(data=town_engagement, x='time', y='engagement_index')
plt.title('Engagement Index based from Town')

In [None]:
# Rows of the merged table whose locale is 'City'.
dt_city = dt_explore.loc[dt_explore["locale"] == 'City']
dt_city.head()

In [None]:
# Mean engagement_index per (locale, time) pair for the city rows.
city_engagement = (
    dt_city.groupby(["locale", "time"], as_index=False)["engagement_index"]
    .mean()
    .reset_index(drop=True)
)
city_engagement.head()

In [None]:
# Mean engagement_index over time for city districts.
sns.lineplot(data=city_engagement, x='time', y='engagement_index')
plt.title('Engagement Index based from City')

In [None]:
# Overlay the four per-locale engagement curves on one chart.
# Each line carries its own label, so the legend no longer depends on the
# order in which artists happen to be added to the axes (the previous
# plt.legend(labels=[...]) matched labels to artists purely by position).
for locale_frame, locale_name, line_colour in (
    (rural_engagement, 'Rural', 'blue'),
    (suburb_engagement, 'Suburb', 'green'),
    (city_engagement, 'City', 'red'),
    (town_engagement, 'Town', 'orange'),
):
    sns.lineplot(
        x='time',
        y='engagement_index',
        data=locale_frame,
        color=line_colour,
        label=locale_name,
    )
plt.legend()
plt.title(label = 'Engagement Index based groupby Locale')