In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn import datasets,linear_model

# Split data into training and test sets
from sklearn.model_selection import train_test_split
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import engagement data, district data, and product data

In [None]:

# Root of the competition data set and the folder holding one engagement CSV
# per district.
base_path = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning"
engagement_path = os.path.join(base_path, 'engagement_data')

# Read every engagement CSV into its own frame. The file stem (the file name
# without its extension) is stored in a `district_id` column so each row can
# be traced back to the file it came from.
engagement_frames = []
for root, dirs, files in os.walk(engagement_path):
    # Sort so the row order of the concatenated frame does not depend on the
    # order in which the filesystem happens to list the files.
    for file in sorted(files):
        district_id, ext = os.path.splitext(file)
        if ext.lower() != '.csv':
            # Skip anything that is not a CSV instead of handing it to read_csv.
            continue
        # Join with `root` (the directory os.walk is currently visiting), not
        # with `engagement_path`; the two differ for files in sub-directories.
        path = os.path.join(root, file)
        df = pd.read_csv(path)
        df["district_id"] = district_id
        engagement_frames.append(df)

if not engagement_frames:
    # pd.concat([]) would fail with a message that does not mention the path.
    raise FileNotFoundError(f"No engagement CSV files found under {engagement_path!r}")

engagement_data = pd.concat(engagement_frames)
engagement_data.head()

In [None]:
# Load the district table that sits next to the engagement folder
# and preview its first rows.
dis_path = os.path.join(base_path, 'districts_info.csv')
districts_info = pd.read_csv(filepath_or_buffer=dis_path)
districts_info.head(5)

In [None]:
# Load the product table that sits next to the engagement folder
# and preview its first rows.
pro_path = os.path.join(base_path, 'products_info.csv')
products_info = pd.read_csv(filepath_or_buffer=pro_path)
products_info.head(5)

## Distribution of districts

In [None]:
plt.style.use('ggplot')

# Keep only the districts that have a value in every column.
# NOTE(review): this drops rows missing *any* field, not just `state`, and it
# rebinds `districts_info` for every later cell — confirm that is intended.
districts_info = districts_info.dropna()

# Number of remaining districts per state.
districts_by_state = districts_info.groupby('state')
districts_info_state = districts_by_state["district_id"].count()

# Share of districts per state as a pie chart with percentage labels.
districts_info_state.plot(kind='pie', autopct='%.2f%%', figsize=(10, 10))



## Distribution of sector

In [None]:
# Number of products for each distinct value of the "Sector(s)" column.
products_by_sector = products_info.groupby("Sector(s)")
products_info_sector = products_by_sector["Sector(s)"].count()

# Share of products per sector value as a pie chart with percentage labels.
products_info_sector.plot(kind='pie', autopct='%.2f%%', figsize=(8, 8))

## Top 20 products by mean pct_access

In [None]:
# Mean pct_access of every product id across all engagement rows.
mean_pct_access = engagement_data.groupby('lp_id')['pct_access'].mean()

# Restrict to product ids that have a row in products_info BEFORE taking the
# top 20. The inner merge below drops every lp_id without a catalogue row, so
# ranking first and merging afterwards could leave fewer than 20 bars.
in_catalogue = mean_pct_access.index.isin(products_info['LP ID'])
lp_id_pct_access = mean_pct_access[in_catalogue].sort_values(ascending=False).head(20)

# Series -> DataFrame with columns `lp_id` and `pct_access`.
lp_id_pct_access_df = lp_id_pct_access.reset_index()

# Attach the product metadata (notably `Product Name`) to the ranked ids.
lp_id_pct_access_product = pd.merge(lp_id_pct_access_df, products_info, how='inner', left_on='lp_id', right_on='LP ID')

# Bar chart of the ranked products, labelled by product name.
lp_id_pct_access_product[['Product Name', 'pct_access']].set_index('Product Name').plot(kind='bar', figsize=(10, 10))
plt.title('Top 20 products by mean pct_access')
plt.ylabel('mean pct_access')
plt.xticks(rotation=60)
plt.show()

## Top 20 products by mean engagement_index

In [None]:
# Mean engagement_index of every product id across all engagement rows.
mean_engagement_index = engagement_data.groupby('lp_id')['engagement_index'].mean()

# Restrict to product ids that have a row in products_info BEFORE taking the
# top 20. The inner merge below drops every lp_id without a catalogue row, so
# ranking first and merging afterwards could leave fewer than 20 bars.
in_catalogue = mean_engagement_index.index.isin(products_info['LP ID'])
engagement_index_mean = mean_engagement_index[in_catalogue].sort_values(ascending=False).head(20)

# Series -> DataFrame with columns `lp_id` and `engagement_index`.
engagement_index_mean_df = engagement_index_mean.reset_index()

# Attach the product metadata (notably `Product Name`) to the ranked ids.
engagement_index_mean_product_df = pd.merge(engagement_index_mean_df, products_info, how='inner', left_on='lp_id', right_on='LP ID')

# Bar chart of the ranked products, labelled by product name.
# (Author's note: this could be treated as a long-tail distribution.)
engagement_index_mean_product_df[['Product Name', 'engagement_index']].set_index('Product Name').plot(kind='bar', figsize=(10, 10))
plt.title('Top 20 products by mean engagement_index')
plt.ylabel('mean engagement_index')
plt.xticks(rotation=45)
plt.show()

## mean of engagement data per month

In [None]:
def get_month(time):
    """Return the second '-'-separated field of ``time``.

    For a date string such as ``'2020-03-15'`` this is the month, ``'03'``.
    The value is returned as a string exactly as it appears in the input.
    Raises ``IndexError`` when ``time`` contains no ``'-'``.
    """
    fields = time.split('-')
    month_field = fields[1]
    return month_field


# Tag every engagement row with the second '-'-separated field of `time`
# (assumed to be the month of a YYYY-MM-DD date — TODO confirm).
engagement_data['month'] = engagement_data.loc[:, 'time'].apply(get_month)

# Per-month mean of the numeric columns. `time` and `district_id` hold strings,
# for which a mean is undefined; numeric_only=True leaves them out explicitly,
# because recent pandas raises TypeError instead of dropping them silently.
engagement_data_month = engagement_data.groupby('month', as_index=False).mean(numeric_only=True)

sns.set(style="darkgrid")

# Line chart of the monthly mean engagement_index.
sns.relplot(data=engagement_data_month, x='month', y='engagement_index', kind="line", markers=True)


#### Engagement_index has been rising steadily since January

#### It reached the highest point in April and September and reached the lowest point in July

## mean of percentage access per month

In [None]:
# Line chart of the monthly mean pct_access, one point per month.
sns.relplot(
    x='month',
    y='pct_access',
    data=engagement_data_month,
    kind="line",
    markers=True,
)

#### Percentage access has been declining steadily since January

#### It reached the lowest point in July and reached the highest point in September

#### The epidemic outbreak caused percentage access to decline

## explore relationship between pct_access and engagement_index

In [None]:
# Simple linear regression of engagement_index on pct_access.
# Rows with a missing value in any column are excluded.
data = engagement_data.dropna()
X, y = data[['pct_access']].values, data[['engagement_index']]

# Hold out 20% of the rows as a test set. The fixed random_state makes the
# split, and therefore the scores and the plot, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# Fit the model on the training rows only.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

# Report the coefficient of determination (R^2) on both splits. The test
# score is the one that says how well the fit holds on unseen rows; a bare
# `regr.score(...)` in the middle of a cell is never displayed.
train_r2 = regr.score(X_train, y_train)
test_r2 = regr.score(X_test, y_test)
print(f"R^2 (train): {train_r2:.4f}   R^2 (test): {test_r2:.4f}")

font = {'family':'FangSong', 'size':20}
plt.figure(figsize=(20,5))
plt.rc('font',**font)

# Training data.
plt.scatter(X_train, y_train, color='r')

# Fitted line. The model is linear in a single feature, so its predictions at
# the smallest and largest training x define the same line as predicting
# every training point, at a fraction of the drawing cost.
x_line = np.array([[X_train.min()], [X_train.max()]])
plt.plot(x_line, regr.predict(x_line), color='b')

plt.xlabel('pct_access')
plt.ylabel('engagement_index')
plt.show()