In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its
# full path, one per line.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import the analysis and plotting libraries used throughout the notebook.
# numpy and pandas are already imported in the first cell; they are repeated
# here so this cell lists everything the analysis needs.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the two lookup tables of the dataset: one row per school district and
# one row per product.
districts_csv = '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
products_csv = '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
districts_df, products_df = map(pd.read_csv, (districts_csv, products_csv))

## Districts data view

| Name | Description |
| :--- | :---|
| district_id | The unique identifier of the school district |
| state | The state where the district resides in |
| locale | NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural |
| pct_black/hispanic | Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data |
| pct_free/reduced | Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data |
| county_connections_ratio | Ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county-level data from FCC Form 477 (December 2018 version). See FCC data for more information. |
| pp_total_raw | Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district|

In [None]:
# Preview the first rows of the district metadata as loaded from the CSV.
districts_df.head()

In [None]:
# (rows, columns) of the district metadata table.
districts_df.shape

In [None]:
# Column dtypes and non-null counts, to see which district fields have missing values.
districts_df.info()

### Convert the two percentage columns ('pct_black/hispanic' and 'pct_free/reduced') into labelled ranges so they are easier to read

In [None]:
# Replace the two interval-string percentage columns with labelled range
# buckets (stored in new '<column>_range' columns), then drop the raw columns.
#
# NOTE(review): the raw values are presumably strings such as "[0.2, 0.4[";
# the code only relies on "[" being the sole non-numeric character and on ","
# separating the lower bound from the upper bound -- confirm against the CSV.
columns = ['pct_black/hispanic', 'pct_free/reduced']
# Bin edges sit halfway between the expected lower bounds (0, 0.2, ..., 0.8)
# so that each lower bound falls strictly inside exactly one bin.
ranges = [-0.1, 0.1, 0.3, 0.5, 0.7, 0.9]
group_name = ['0%-20%', '20%-40%', '40%-60%', '60%-80%', '80%-100%']
for c in columns:
    lower_bound = (
        districts_df[c]
        # regex=False: "[" must be removed as a literal character. On its own
        # it is not a valid regular expression, and the default of `regex`
        # differs between pandas versions.
        .str.replace('[', '', regex=False)
        # Only the text before the first comma (the lower bound) is needed.
        .str.split(',', expand=True)[0]
        .astype('float')
    )
    # Missing percentages stay NaN in the resulting categorical column.
    districts_df[c + '_range'] = pd.cut(lower_bound, bins=ranges, labels=group_name)
# Rebind instead of dropping in place so the raw columns are never left
# half-modified on the frame.
districts_df = districts_df.drop(columns=columns)

In [None]:
# Re-inspect the frame: the two raw percentage columns should now be replaced
# by their '*_range' counterparts.
districts_df.head()

## EDA in districts data

In [None]:
#group by state
# Horizontal bars: number of districts per state, most frequent state first.
state_order = districts_df['state'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 10))
_ = sns.countplot(data=districts_df, y='state', order=state_order, ax=ax)
ax.set_xlabel('count of number')
ax.set_title('The number of Districts group by state', fontsize=20)

In [None]:
#group by locale
# Count the non-null values of every column per locale; the district_id
# counts give the number of districts in each locale type.
group = districts_df.groupby('locale').count()
fig, ax = plt.subplots()
_ = sns.barplot(x=group.index, y=group['district_id'], ax=ax)
ax.set_ylabel('count of number')
ax.set_title('The number of Districts group by locale', fontsize=12)

## products data view

|Name|Description|
|:--- |:--- |
| LP ID | The unique identifier of the product |
|URL | Web Link to the specific product |
| Product Name | Name of the specific product |
| Provider/Company Name | Name of the product provider |
| Sector(s) | Sector of education where the product is used |
| Primary Essential Function | The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories has multiple sub-categories with which the products were labeled |

In [None]:
# Preview the first rows of the product metadata as loaded from the CSV.
products_df.head()

### Split 'Primary Essential Function' to 'Category' and 'Sub-Category'

In [None]:
# Split 'Primary Essential Function' at its FIRST hyphen into a top-level
# 'Category' and a 'Sub-Category' holding everything after that hyphen.
# n=1 keeps sub-categories that themselves contain hyphens intact, so the
# result does not depend on how many hyphen-separated parts the data has.
# Rows with a missing function stay missing in both new columns.
#
# NOTE: surrounding whitespace is deliberately preserved (e.g. the category
# keeps its trailing space), matching the values this cell produced before.
function_parts = products_df['Primary Essential Function'].str.split('-', n=1, expand=True)

# Relabel the combined 'LC/CM/SDO ' category as 'Others'. The replacement
# result is assigned explicitly: an in-place replace on a column selected
# from the frame is a chained assignment that pandas warns about and that
# has no effect under Copy-on-Write.
products_df['Category'] = function_parts[0].replace('LC/CM/SDO ', 'Others')
products_df['Sub-Category'] = function_parts[1]
products_df = products_df.drop(columns='Primary Essential Function')

In [None]:
# Re-inspect the frame: 'Primary Essential Function' should now be replaced
# by the 'Category' and 'Sub-Category' columns.
products_df.head()

In [None]:
# (rows, columns) of the product metadata table.
products_df.shape

In [None]:
# Column dtypes and non-null counts, to see which product fields have missing values.
products_df.info()

In [None]:
# Number of distinct values in each product column.
products_df.nunique()

## Engagement data view

|Name|Description|
|:---|:---|
| time | date in "YYYY-MM-DD" | 
| lp_id | The unique identifier of the product |
| pct_access | Percentage of students in the district who have at least one page-load event of a given product on a given day |
| engagement_index | Total page-load events per one thousand students of a given product and on a given day |

In [None]:
# Load the per-district engagement files (one '<district_id>.csv' per district
# listed in districts_df), tag every row with the district it came from, and
# stack them into a single frame.
#
# The frames are collected first and concatenated ONCE: calling pd.concat
# inside the loop recopies the whole accumulated frame on every iteration,
# and seeding it with an empty DataFrame goes through pandas' deprecated
# handling of empty entries when result dtypes are determined.
engagement_dir = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'
district_frames = [
    pd.read_csv(engagement_dir + str(d) + '.csv').assign(district_id=d)
    for d in districts_df.district_id
]
# Each file keeps its own row index (no ignore_index), as before. With no
# districts at all the result is an empty frame rather than a concat error.
Engagement_df = pd.concat(district_frames) if district_frames else pd.DataFrame()

In [None]:
# Preview the first rows of the stacked engagement data, including the added district_id column.
Engagement_df.head()

In [None]:
# (rows, columns) of the combined engagement table across all districts.
Engagement_df.shape

In [None]:
# Column dtypes and memory footprint of the combined engagement table.
Engagement_df.info()

In [None]:
#change data type
# Parse 'time' into real datetimes, derive the calendar month from it, and
# plot how many engagement records fall into each month.
parsed_time = pd.to_datetime(Engagement_df['time'])
Engagement_df['time'] = parsed_time
Engagement_df['month'] = parsed_time.dt.month
fig, ax = plt.subplots()
_ = sns.countplot(data=Engagement_df, x='month', ax=ax)

In [None]:
#combine three tables together
# Records without a product id cannot be joined to the product table, so they
# are removed before lp_id is cast to an integer key.
Engagement_df = Engagement_df.dropna(subset=['lp_id'])
Engagement_df = Engagement_df.astype({'lp_id': 'int'})
# Left joins keep every engagement record and attach the district attributes
# (via district_id) and the product attributes (lp_id matched to 'LP ID').
full_data = (
    Engagement_df
    .merge(districts_df, how='left', on='district_id')
    .merge(products_df, how='left', left_on='lp_id', right_on='LP ID')
)

In [None]:
# Preview the merged table: engagement records joined with district and product attributes.
full_data.head()

## What is the picture of digital connectivity and engagement in 2020?

In [None]:
#datetime plot
# Number of engagement records (non-null lp_id values) per calendar month,
# drawn as a line with one tick for each of the twelve months.
monthly_counts = Engagement_df.groupby('month')['lp_id'].count()
fig, ax = plt.subplots()
_ = sns.lineplot(x=monthly_counts.index, y=monthly_counts, marker='.', linestyle=None, ax=ax)
ax.set_xticks(range(1, 13))
ax.set_ylabel('count of number(million)')

In [None]:
#group by state
# Horizontal bars: number of merged engagement records per state, largest first.
state_order = full_data['state'].value_counts().index
fig, ax = plt.subplots()
_ = sns.countplot(data=full_data, y='state', order=state_order, ax=ax)
ax.set_xlabel('count of number (million)')

In [None]:
#group by locale
# Share of the summed engagement_index contributed by each locale type,
# largest slice first.
locale_engagement = (
    full_data
    .groupby('locale')['engagement_index']
    .sum()
    .sort_values(ascending=False)
)
colors = sns.color_palette('pastel')[0:4]
fig, ax = plt.subplots()
_ = ax.pie(locale_engagement, labels=locale_engagement.index, colors=colors, autopct='%.0f%%')
ax.set_title('percentage of locale', fontsize=15)

In [None]:
#group by Provider/Company Name
# The ten providers with the most engagement records, as horizontal bars.
df = (
    full_data
    .groupby('Provider/Company Name')[['lp_id']]
    .count()
    .sort_values(by='lp_id', ascending=False)
    .head(10)
)
fig, ax = plt.subplots()
_ = sns.barplot(x=df['lp_id'], y=df.index, ax=ax)
ax.set_xlabel('count of number(million)')

### How does student engagement with online learning platforms relate to different geography? Demographic context 
### (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?

In [None]:
# Engagement record counts per Black/Hispanic percentage range, with each bar
# split by the free/reduced-price-lunch percentage range.
fig, ax = plt.subplots()
_ = sns.countplot(
    data=full_data,
    x='pct_black/hispanic_range',
    hue='pct_free/reduced_range',
    ax=ax,
)
ax.set_xlabel('Percentage Range')
ax.set_ylabel('count of number(million)')
ax.set_title('Percentage of Black/Hispanic')
ax.legend(loc='upper right', frameon=False)