In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Read every CSV file in the `engagement_data` directory and combine them into a single `.npy` file

In [None]:
# Locations of the input data: `path` is the dataset root and `eg_path` is the
# folder that is scanned below for per-district engagement CSV files.
eg_path = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/"
path = "../input/learnplatform-covid19-impact-on-digital-learning/"

# Keep only CSV files, in sorted order, so the row order of eg.npy is
# reproducible (os.listdir returns entries in arbitrary order).
files = sorted(name for name in os.listdir(eg_path) if name.lower().endswith(".csv"))

# Collect one object array per file and concatenate once at the end.
# np.append inside the loop would re-copy the whole accumulated array on every
# iteration and needs a dummy first row that has to be deleted afterwards.
chunks = []
for file in files:
    print(file," ",end="")
    _df = pd.read_csv(os.path.join(eg_path, file))
    # The file name without its extension becomes the district id (kept as a
    # string). splitext copes with names that contain more than one dot.
    _df['district_id'] = os.path.splitext(file)[0]
    chunks.append(np.array(_df, dtype=object))

# Five columns per row: the four columns read from each CSV plus district_id
# (presumably time, lp_id, pct_access, engagement_index — see the cell that loads eg.npy).
# An empty directory yields an empty (0, 5) array instead of raising.
x = np.concatenate(chunks, axis=0) if chunks else np.empty((0, 5), dtype=object)
np.save("eg.npy",x)



## Load eg.npy into a DataFrame

In [None]:
# Column meanings:
#   pct_access        Percentage of students in the district that have at least one page-load event of a given product on a given day
#   engagement_index  Total page-load events per one thousand students of a given product on a given day
#
# NOTE(review): eg.npy holds an object array, so loading it requires
# allow_pickle=True. Unpickling can execute arbitrary code; only load a file
# that this notebook wrote itself.
tmp = np.load("eg.npy",allow_pickle = True)
eg_df = pd.DataFrame(tmp, columns = ['time','lp_id','pct_access','engagement_index','district_id'])

# The object-array round trip leaves every column with dtype `object`.
# Restore float dtypes on the numeric columns so that the later groupby
# sums/means operate on real numbers (lp_id is float because it can be NaN).
eg_df = eg_df.astype({'lp_id': 'float64', 'pct_access': 'float64', 'engagement_index': 'float64'})

print(eg_df.dtypes)
print(eg_df.columns)
print(eg_df.shape)
eg_df

## Import districts_info.csv and products_info.csv into DataFrames

In [None]:
# Read the district metadata, print its dtypes / column index / shape, then display it.
d_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
for summary in (d_df.dtypes, d_df.columns, d_df.shape):
    print(summary)
d_df

In [None]:
# Read the product metadata, print its dtypes / column index / shape, then display it.
p_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")
for summary in (p_df.dtypes, p_df.columns, p_df.shape):
    print(summary)
p_df

## Check for NaN values

In [None]:
# Print the per-column count of missing values for each of the three tables.
for frame in (eg_df, d_df, p_df):
    print(frame.isnull().sum())

## Check for duplicates

In [None]:
# For each table, print how many rows repeat an earlier row on the listed key columns.
key_columns_by_frame = [
    (eg_df, ['time','lp_id','district_id']),
    (d_df, ['district_id']),
    (p_df, ['LP ID']),
]
for frame, key_columns in key_columns_by_frame:
    print(frame.duplicated(subset=key_columns).sum())

## Drop rows containing NaN

In [None]:
# Drop every row that is missing engagement_index, pct_access or lp_id,
# printing the shape before and after so the number of removed rows is visible.
print(eg_df.shape)
required_columns = ['engagement_index', 'pct_access', 'lp_id']
eg_df = eg_df.dropna(axis=0, subset=required_columns)
print(eg_df.shape)

## Group by `lp_id` and save the result to the file `lp_id.csv`

In [None]:
# Sum both metrics per product, then divide each by 366 (the year 2020 has 366 days)
# and write the result to lp_id.csv.
df = eg_df.drop(columns=['time', 'district_id'])
tmp = df.groupby(['lp_id']).sum()
for metric in ('pct_access', 'engagement_index'):
    tmp[metric] = tmp[metric] / 366
tmp.to_csv('lp_id.csv')

## Find the top 20 products

In [None]:
# Reload the per-product aggregates (columns: lp_id, pct_access, engagement_index)
# and print the 20 highest rows for each of the two metrics.
lpid_df = pd.read_csv('lp_id.csv')
lp = lpid_df.sort_values(by=['pct_access'], ascending=False).head(20)
print(lp)
eg = lpid_df.sort_values(by=['engagement_index'], ascending=False).head(20)
print(eg)

## `lp_id` 26488 is missing from products_info.csv, so append one placeholder row for `lp_id` 26488

In [None]:
# Build p_df2: the product table plus a placeholder "Unknown" row for
# LP ID 26488, so that later merges on the product id keep that product.
columns = ["LP ID","URL","Product Name","Provider/Company Name","Sector(s)","Primary Essential Function"]
row = pd.DataFrame([(26488,"Unknown","Unknown","Unknown","Unknown","Unknown")],columns = columns)
if (p_df["LP ID"] == 26488).any():
    # The id is already present: adding the placeholder would duplicate the
    # key and therefore duplicate rows in every later merge.
    p_df2 = p_df.copy()
else:
    # ignore_index=True gives the new row a fresh index label instead of a
    # second label 0 that would collide with the first row of p_df.
    p_df2 = pd.concat([p_df, row], axis=0, ignore_index=True)

## Find the product names

In [None]:
# Scale the per-product aggregates by 233, take the top 20 for each metric and
# attach the product metadata from p_df2.
lpid_df = pd.read_csv('lp_id.csv')  # columns: lp_id, pct_access, engagement_index
for metric in ('pct_access', 'engagement_index'):
    # 233 is the number of districts according to the original author
    lpid_df[metric] = lpid_df[metric] / 233
pc = lpid_df.sort_values(by=['pct_access'], ascending=False).head(20)
eg = lpid_df.sort_values(by=['engagement_index'], ascending=False).head(20)
pct = pc.merge(p_df2, left_on="lp_id", right_on="LP ID").drop(columns='LP ID')
print(pct.head())
eng = eg.merge(p_df2, left_on="lp_id", right_on="LP ID").drop(columns='LP ID')
print(eng.head())

## Top 20 products

In [None]:
# Horizontal bar chart of the top-20 products by pct_access.
sns.set_palette("bright")
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y vectors.
sns.barplot(data=pct, x='pct_access', y='Product Name')
plt.xlabel('Average percentage of students have at least one page-load event of a given product and on a day')
plt.ylabel('Product Name')

In [None]:
# Horizontal bar chart of the top-20 products by engagement_index.
sns.set_palette("bright")
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y vectors.
sns.barplot(data=eng, x='engagement_index', y='Product Name')
plt.xlabel('Average total page-load events per one thousand students of a given product and on a day')
plt.ylabel('Product Name')

## Group by `district_id` and save the result to the file `district_id.csv`

In [None]:
# Sum both metrics per district, divide each by 366 and write the result to district_id.csv.
df = eg_df.drop(columns=['time', 'lp_id'])
tmp = df.groupby(['district_id']).sum()
for metric in ('pct_access', 'engagement_index'):
    tmp[metric] = tmp[metric] / 366
tmp.to_csv('district_id.csv')

## Group by state name

In [None]:
# Join the per-district aggregates with the district metadata, discard districts
# without a state, and write the per-state mean of both metrics to state.csv.
dis_df = pd.read_csv('district_id.csv')  # columns: district_id, pct_access, engagement_index
pct = dis_df.merge(d_df, on="district_id").drop(columns='district_id')
print(pct.isnull().sum())
pct = pct.dropna(subset=['state'])
unused_columns = ['locale','pct_black/hispanic','pct_free/reduced','county_connections_ratio','pp_total_raw']
df = pct.drop(columns=unused_columns)
# mean() divides by the number of districts in each state
df.groupby(['state']).mean().to_csv('state.csv')

In [None]:
# Reload the per-state means (columns: state, pct_access, engagement_index)
# and print them ordered from highest to lowest for each metric.
dis_df = pd.read_csv('state.csv')
lp = dis_df.sort_values(by='pct_access', ascending=False)
print(lp)
eg = dis_df.sort_values(by='engagement_index', ascending=False)
print(eg)

## Relationship between state and pct_access

In [None]:
# Horizontal bar chart of pct_access per state, in the order of `lp`.
sns.set_palette("bright")
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y vectors.
sns.barplot(data=lp, x='pct_access', y='state')
plt.xlabel('average percentage of students have at least one page-load event of all product and on a day')
plt.ylabel('State')

## Relationship between state and engagement_index

In [None]:
# Horizontal bar chart of engagement_index per state, in the order of `eg`.
sns.set_palette("bright")
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y vectors.
sns.barplot(data=eg, x='engagement_index', y='state')
plt.ylabel('State')
plt.xlabel('Average total page-load events per one thousand students of all product and on a day')

## Group by `district_id` and `lp_id`, and save the result to the file `re.csv`

In [None]:
# Sum both metrics per (district, product) pair, divide each by 366
# (the year 2020 has 366 days) and write the result to re.csv.
df = eg_df.drop(columns=['time'])
tmp = df.groupby(['district_id', 'lp_id']).sum()
for metric in ('pct_access', 'engagement_index'):
    tmp[metric] = tmp[metric] / 366
tmp.to_csv('re.csv')

## Compute per-state, per-product statistics and save the result to the file `re1.csv`

In [None]:
# Attach product names and district metadata to the (district, product)
# aggregates, then write the mean of both metrics per (state, product) to re1.csv.
re = pd.read_csv('re.csv')  # columns: district_id, lp_id, pct_access, engagement_index
print(re.shape)
re = re.merge(p_df2, left_on="lp_id", right_on="LP ID").drop(columns='LP ID')
re = re[['district_id','pct_access','engagement_index','Product Name']]
print(re.shape)
dis_df = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')
re = dis_df.merge(re, on="district_id").drop(columns='district_id')
re = re[['state','pct_access','engagement_index','Product Name']]
# mean() divides by the number of districts in the state
re.groupby(['state','Product Name']).mean().to_csv('re1.csv')

## Products in Arizona

In [None]:
# Top 20 products in Arizona ranked by engagement_index.
re1 = pd.read_csv('re1.csv')  # columns: state, Product Name, pct_access, engagement_index
p = re1[re1['state']=='Arizona'].sort_values(by=['engagement_index'], ascending=False)[:20]
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y vectors.
sns.barplot(data=p, x='engagement_index', y='Product Name')
plt.title("Arizona")
plt.ylabel('Product Name')
plt.xlabel('Average total page-load events per one thousand students of a given product  and on a day')

## Products in Illinois

In [None]:
# Top 20 products in Illinois ranked by engagement_index.
re1 = pd.read_csv('re1.csv')  # columns: state, Product Name, pct_access, engagement_index
p = re1[re1['state']=='Illinois'].sort_values(by=['engagement_index'], ascending=False)[:20]
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y vectors.
sns.barplot(data=p, x='engagement_index', y='Product Name')
plt.title("Illinois")
plt.xlabel('Average total page-load events per one thousand students of a given product and on a day')
plt.ylabel('Product Name')

## Google Docs 

In [None]:
# Top 20 states for the product 'Google Docs' ranked by engagement_index.
re1 = pd.read_csv('re1.csv')  # columns: state, Product Name, pct_access, engagement_index
p = re1[re1['Product Name']=='Google Docs'].sort_values(by=['engagement_index'], ascending=False)[:20]
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y vectors.
sns.barplot(data=p, x='engagement_index', y='state')
plt.title('Google Docs')
plt.xlabel('Average total page-load events per one thousand students of a given product and on a day')
plt.ylabel('State')

## 

## Google Classroom

In [None]:
# Top 20 states for the product 'Google Classroom' ranked by engagement_index.
re1 = pd.read_csv('re1.csv')  # columns: state, Product Name, pct_access, engagement_index
p = re1[re1['Product Name']=='Google Classroom'].sort_values(by=['engagement_index'], ascending=False)[:20]
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y vectors.
sns.barplot(data=p, x='engagement_index', y='state')
plt.title('Google Classroom')
plt.ylabel('State')
plt.xlabel('Average total page-load events per one thousand students of a given product and on a day')

## Find the statistics between the locale and software usage

In [None]:
# Join the per-district aggregates with the district metadata, discard districts
# without a locale, and write the per-locale mean of both metrics to locale.csv.
# `df` (one row per district, with its locale) is left in scope after this cell.
dis_df = pd.read_csv('district_id.csv')  # columns: district_id, pct_access, engagement_index
d_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
pct = dis_df.merge(d_df, on="district_id").drop(columns='district_id')
pct = pct.dropna(subset=['locale'])
unused_columns = ['state','pct_black/hispanic','pct_free/reduced','county_connections_ratio','pp_total_raw']
df = pct.drop(columns=unused_columns)
# mean() divides by the number of districts in the same locale
df.groupby(['locale']).mean().to_csv('locale.csv')

In [None]:
# One bar chart per metric with locale on the x axis; `df` holds one row per
# district, so seaborn aggregates the districts of each locale into a bar.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y vectors.
plt.figure(figsize=(15,5))
sns.barplot(data=df, x='locale', y='pct_access')
plt.figure(figsize=(15,5))
sns.barplot(data=df, x='locale', y='engagement_index')

In [None]:
# Attach product names and district metadata to the (district, product)
# aggregates, then write per-(locale, product) statistics to re2.csv.
re = pd.read_csv('re.csv')  # columns: district_id, lp_id, pct_access, engagement_index
re = pd.merge(re, p_df2, left_on="lp_id", right_on="LP ID").drop('LP ID',axis=1)
re = re[['district_id','pct_access','engagement_index','Product Name']]
dis_df = pd.read_csv('../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')
re = pd.merge(dis_df, re, left_on="district_id", right_on="district_id").drop('district_id',axis=1)
# Districts without a locale cannot be assigned to a group.
re = re.dropna(subset=['locale'])
re = re[['locale','pct_access','engagement_index','Product Name']]
print(re)
# Use mean(), not sum(): the per-state and per-locale aggregations elsewhere in
# this notebook average over districts, and the charts built from re2.csv are
# labelled "Average ... per one thousand students". Summing the per-district
# values would scale each bar with the number of districts in the locale.
re.groupby(['locale','Product Name']).mean().to_csv('re2.csv')

## Top 20 products in each locale

In [None]:
re2 = pd.read_csv('re2.csv')  # columns: locale, Product Name, pct_access, engagement_index


def _plot_top_products_for_locale(usage, locale, top_n=20):
    """Draw a horizontal bar chart of the top products of one locale.

    Parameters
    ----------
    usage : pandas.DataFrame
        Frame with 'locale', 'Product Name' and 'engagement_index' columns.
    locale : str
        Value of the 'locale' column to select; also used as the plot title.
    top_n : int, default 20
        Number of products, ranked by engagement_index, to show.

    Returns
    -------
    pandas.DataFrame
        The rows that were plotted, sorted by engagement_index descending.
    """
    top = usage[usage['locale']==locale].sort_values(by=['engagement_index'], ascending=False)[:top_n]
    plt.figure(figsize=(15,10))
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally and raises TypeError for positional x/y vectors.
    sns.barplot(data=top, x='engagement_index', y='Product Name')
    plt.title(locale)
    plt.ylabel('Product Name')
    plt.xlabel('Average total page-load events per one thousand students of a given product and on a day')
    return top


# One figure per locale, in the same order as before; `p` ends up holding the
# rows of the last locale plotted.
for locale_name in ('Suburb', 'Rural', 'City', 'Town'):
    p = _plot_top_products_for_locale(re2, locale_name)