In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import string as str
warnings.filterwarnings('ignore')

In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

We start by looking into products_info.csv and analyse different products available in the market.
As per the overview about the file:

The product file products_info.csv includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by our team. Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

* Name-----Description
* LP ID------The unique identifier of the product
* URL-------Web Link to the specific product
* Product Name------Name of the specific product
* Provider/Company Name------Name of the product provider
* Sector(s)------Sector of education where the product is used
* Primary Essential Function------The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: 
* LC = Learning & Curriculum, 
* CM = Classroom Management, 
* SDO = School & District Operations. Each of these categories has multiple sub-categories with which the products were labeled.

In [None]:
# Load the product catalogue and preview its first rows.
products_info = pd.read_csv('/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv')
products_info.head()

In [None]:
# Column names, dtypes and non-null counts of the product catalogue.
products_info.info()

In [None]:
# Number of missing values in each column.
products_info.isna().sum()

In [None]:
# Distinct values of the sector column.
products_info['Sector(s)'].unique()

In [None]:
# Of the 372 entries, 20 have no sector / primary essential function; they are
# left out of the grouped counts below and can be ignored.
sectors = products_info.groupby('Sector(s)').agg({'LP ID': 'count'})
sectors

In [None]:
# Draw a labelled count plot (used below for the sector and PEF columns).

def plotcount(data):
    """Draw a bar chart of the value counts of `data`, most frequent first.

    Parameters
    ----------
    data : pandas.Series
        Column whose values are counted; it must provide ``value_counts()``.

    Every bar is annotated with its count in white text. The leading newline
    in the label pushes the text below the anchor point at the top of the bar.
    """
    plt.figure(figsize=(12,6))
    ax = sns.countplot(x=data, order=data.value_counts().index)
    for item in ax.get_xticklabels():
        item.set_rotation(90)
    for p in ax.patches:
        height = p.get_height()
        if not np.isfinite(height):
            # Nothing to label on a patch without a finite height.
            continue
        # Anchor the label on the horizontal centre of the bar (a fixed +0.2
        # offset sits left of centre) and print the count as an integer
        # rather than a float such as "330.0".
        ax.annotate(f'\n{int(height)}', (p.get_x() + p.get_width() / 2, height),
                    color='white', size=18, ha='center', va='center')

In [None]:
# Number of products per sector.
plotcount(products_info['Sector(s)'])

**FINDINGS:**

In general, PreK-12 is the sector with the most digital-learning products.

What is PreK-12?
The PreK-12 initiative works to ensure that all children attending public elementary and secondary schools have access to and receive high-quality educational experiences, with a particular emphasis on improving equity and outcomes for traditionally underserved students.

K to 12 (also K-12) is an education system under the Department of Education that aims to enhance learners' basic skills, produce more competent citizens, and prepare graduates for lifelong learning and employment.

In [None]:
# Split the "Primary Essential Function" column on ' - ' into the top-level
# category (pef_cat) and the first sub-category (pef).
# The split is computed once, and each part is assigned with a plain string
# key: assigning a 1-D Series through a list key (df[['col']] = series) is
# fragile across pandas versions.
pef_parts = products_info['Primary Essential Function'].str.split(' - ', expand=True)
products_info['pef_cat'] = pef_parts[0]
products_info['pef'] = pef_parts[1]
products_info.head()

LC = Learning & Curriculum,
CM = Classroom Management,
SDO = School & District Operations. Each of these categories has multiple sub-categories with which the products were labeled.

In [None]:
# Count plot of the top-level PEF category (pef_cat).
plotcount(products_info['pef_cat'])

In [None]:
# Pie chart of the primary essential functions: one slice per pef value,
# sized by the number of products (LP IDs) carrying that value.
pef = (
    products_info.groupby('pef')['LP ID']
    .count()
    .rename('value_count')
    .reset_index()
)

plt.figure(figsize=(10,10))
plt.pie(pef['value_count'], labels=pef['pef'], autopct='%1.1f%%')
plt.title('Different learning platforms')
plt.show()

In [None]:
# Plot the PEF category counts for each sector, one subplot per sector.
pef_sector = products_info.groupby(['Sector(s)', 'pef_cat'])['LP ID'].count()
pef_sector = pef_sector.reset_index()

# Size the grid from the data: a fixed 2x2 grid combined with zip() silently
# drops every sector beyond the fourth.
sector_names = pef_sector['Sector(s)'].unique()
n_cols = 2
n_rows = max(1, int(np.ceil(len(sector_names) / n_cols)))
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(8, 5 * n_rows), squeeze=False)

for cat, ax in zip(sector_names, axes.flat):
    sns.barplot(data=pef_sector[pef_sector['Sector(s)'] == cat], x='pef_cat', y='LP ID', ax=ax).set_title(cat)

# Hide the axes that are left over when the number of sectors is odd.
for unused_ax in axes.flat[len(sector_names):]:
    unused_ax.set_visible(False)

plt.show()

**OBSERVATION**:
* The Corporate sector has SDO (School & District Operations) as its main primary essential function.
* The PreK-12 sector has LC (Learning & Curriculum) as its main primary essential function.

In [None]:
# Plot the PEF sub-categories of the products, one subplot per PEF category.
pef_cat_subcat = products_info.groupby(['pef_cat', 'pef'])['LP ID'].count().reset_index()
pef_cat_subcat = pef_cat_subcat.sort_values(['LP ID'], ascending=False)

# One row per category actually present, instead of a hard-coded row count
# that would drop categories (too few rows) or leave empty axes (too many).
pef_categories = pef_cat_subcat['pef_cat'].unique()
n_rows = max(1, len(pef_categories))
# squeeze=False keeps `axes` an array even when there is a single row.
fig, axes = plt.subplots(ncols=1, nrows=n_rows, figsize=(8, 5 * n_rows), squeeze=False)

for cat, ax in zip(pef_categories, axes.flat):
    sns.barplot(data=pef_cat_subcat[pef_cat_subcat['pef_cat'] == cat], y='pef', x='LP ID', ax=ax).set_title(cat)

plt.show()

**DISTRICT INFO**
* district_id ----- The unique identifier of the school district
* state ---- The state where the district resides in
* locale ----- NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See Locale Boundaries User's Manual for more information.
* pct_black/hispanic ----- Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data
* pct_free/reduced ----- Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data
* countyconnectionsratio ----- ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version). See FCC data for more information.
* pptotalraw ----- Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district.

In [None]:
# Load the district information file and preview its first rows.
district_info = pd.read_csv('/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')
district_info.head()

In [None]:
# Column names, dtypes and non-null counts of the district data.
district_info.info()

**From the above, a lot of the data is NaN, so we drop the rows in which every column except `district_id` is NaN (for example, index 0, 3, 4, etc.).**

In [None]:
# thresh=6 is the minimum number of NON-NaN values a row must have to be kept;
# rows with fewer than 6 non-NaN values are dropped.
# .copy() makes the result an independent frame, so columns can be added or
# overwritten later without SettingWithCopyWarning.
district_info_mod = district_info.dropna(thresh=6).copy()
district_info_mod.head()

In [None]:
# Replace missing range values with the placeholder string '0,0'. It has the
# same "low,high" shape that splitValues parses, so a missing range yields a
# mean of 0.
# A single dict-based fillna rebinds district_info_mod to a fresh frame
# instead of assigning column by column onto the result of dropna, which
# triggers SettingWithCopyWarning.
district_info_mod = district_info_mod.fillna({
    'pct_free/reduced': '0,0',
    'county_connections_ratio': '0,0',
    'pp_total_raw': '0,0',
})

In [None]:
# Number of districts per state.
plotcount(district_info_mod['state'])

**From the above graph, it shows that**
More information is available for Utah, Connecticut, Illinois, Massachusetts and California than for the other states.

In [None]:
# Split the range strings and use the mean of the two bounds instead of the
# range for the columns: pct_black/hispanic, pct_free/reduced,
# county_connections_ratio, pp_total_raw
def splitValues(column):
    """Return the midpoint of every range string in `column`.

    Each value is expected to look like ``'[0.2,0.4['`` or ``'0,0'``: leading
    and trailing square brackets are stripped, the text is split on commas,
    and the first two parts are converted to numbers and averaged.

    Parameters
    ----------
    column : iterable
        Range strings. Missing values (NaN / None) are accepted.

    Returns
    -------
    list
        One entry per input value, in input order. Missing values, strings
        with fewer than two comma-separated parts and non-numeric bounds all
        produce NaN instead of raising.
    """
    mean_val = []
    for val in column:
        if pd.isna(val):
            # A missing range has no midpoint.
            mean_val.append(float('nan'))
            continue
        bounds = val.strip('[]').split(',')
        if len(bounds) < 2:
            # No upper bound to average with.
            mean_val.append(float('nan'))
            continue
        low = pd.to_numeric(bounds[0], errors='coerce')
        high = pd.to_numeric(bounds[1], errors='coerce')
        mean_val.append((low + high) / 2)
    return mean_val

In [None]:
# Add one "*_mean" column per range column, holding the value that
# splitValues computes for each row.
mean_column_sources = {
    'black/hisp_pct_mean': 'pct_black/hispanic',
    'free/red_pct_mean': 'pct_free/reduced',
    'county_connec_ratio_mean': 'county_connections_ratio',
    'pp_total_raw_mean': 'pp_total_raw',
}
for mean_col, source_col in mean_column_sources.items():
    district_info_mod[mean_col] = splitValues(district_info_mod[source_col])
district_info_mod.head()

In [None]:
def barplotforDistrict(column):
    """Bar chart of the per-state mean of `column`, highest mean first.

    Reads the module-level ``district_info_mod`` frame; `column` names the
    column of that frame to average within each state.
    """
    state_means = district_info_mod.groupby('state')[column].mean().reset_index()
    ranked_states = state_means.sort_values(column, ascending=False)['state']

    plt.figure(figsize=(12,6))
    ax = sns.barplot(x=state_means['state'], y=state_means[column], order=ranked_states)
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.title('Plotting values at state level for ' + column, fontsize=15)
    plt.xlabel('state', fontsize=12)
    plt.ylabel(column, fontsize=12)
    plt.show()

In [None]:
# One state-level bar chart per derived mean column.
barplotforDistrict('black/hisp_pct_mean')
barplotforDistrict('free/red_pct_mean')
barplotforDistrict('county_connec_ratio_mean')
barplotforDistrict('pp_total_raw_mean')

**From above graph, it is clear that:**

* District of Columbia has the highest Black/Hispanic percentage and also the highest per-pupil expenditure.
* Minnesota has the highest share of students eligible for free or reduced-price lunch.
* North Dakota has a higher county connections ratio than any other state.

In [None]:
# Load every per-district engagement CSV and stack them into one frame.
path = '/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'
joined_files = os.path.join(path, "*.csv")

# All matching files, sorted so the row order of the result does not depend
# on the order in which the filesystem lists them.
joined_list = sorted(glob.glob(joined_files))

# Add a district_id column taken from the file name.
engagement_files = []
for csv in joined_list:
    frame = pd.read_csv(csv)
    # Take the base name first and then drop the extension: splitting the
    # whole path on '.' breaks as soon as a directory name contains a dot.
    frame['district_id'] = os.path.splitext(os.path.basename(csv))[0]
    engagement_files.append(frame)

# Finally, the files are joined. ignore_index gives the combined frame a
# unique 0..n-1 index instead of repeating each file's own row numbers.
engagement_df = pd.concat(engagement_files, ignore_index=True)

**ENGAGEMENT DATA**
* time --> date in "YYYY-MM-DD"
* lp_id --> The unique identifier of the product
* pct_access --> Percentage of students in the district that have at least one page-load event of a given product on a given day
* engagement_index --> Total page-load events per one thousand students of a given product and on a given day
* district_id --> which can be used to link to district information in district_info.csv

In [None]:
# Column names and dtypes of the combined engagement data.
engagement_df.info()

In [None]:
# district_id was taken from the file name, so it is text; convert it to int64.
engagement_df['district_id'] = engagement_df['district_id'].astype('int64')

In [None]:
# Preview the combined engagement data.
engagement_df.head()

In [None]:
# Clean up the engagement frame.
# Missing engagement_index / pct_access values are treated as 0. Any row that
# still contains a NaN afterwards (for example a missing lp_id, which carries
# no information about product engagement) is dropped.
# The result is assigned back instead of calling fillna(inplace=True) on a
# selected column: that is chained assignment, which is deprecated in recent
# pandas and does not update the frame under copy-on-write.
engagement_df = engagement_df.fillna({'engagement_index': 0, 'pct_access': 0}).dropna()

In [None]:
# Number of missing values per column after the clean-up.
engagement_df.isna().sum()

In [None]:
# Remove the decimal place from the lp_id column by casting it to int.
engagement_df['lp_id'] = engagement_df['lp_id'].astype(int)
# Parse the time column into datetime values.
engagement_df['time'] = pd.to_datetime(engagement_df['time'])

In [None]:
# Column names and dtypes after the type conversions.
engagement_df.info()

In [None]:
# Which products are engaged with most: mean engagement index per product,
# joined with the product catalogue (only products present in both are kept).
df_prod_engagement = (
    engagement_df.groupby('lp_id')['engagement_index']
    .mean()
    .reset_index()
)
df_prod_eng_join = pd.merge(
    df_prod_engagement,
    products_info,
    how='inner',
    left_on='lp_id',
    right_on='LP ID',
)
df_prod_eng_join.head()

In [None]:
# Plot the relation between the engagement index and a column of the products frame.
def prodEngagement(column):
    """Bar chart of the mean engagement index per value of `column`.

    Reads the module-level ``df_prod_eng_join`` frame, averages
    ``engagement_index`` within each value of `column`, and draws the bars
    from highest to lowest mean. Every bar is labelled with its value.

    Parameters
    ----------
    column : label
        Name of the ``df_prod_eng_join`` column to group by; it is also
        concatenated into the plot title.
    """
    df = df_prod_eng_join['engagement_index'].groupby(df_prod_eng_join[column]).mean().reset_index()
    df = df.sort_values(['engagement_index'], ascending=False).reset_index(drop=True)
    plt.figure(figsize=(12,6))
    ax = sns.barplot(x=df[column], y=df['engagement_index'])
    for item in ax.get_xticklabels():
        item.set_rotation(90)
    # Iterate over the values directly: Series.iteritems() was removed in pandas 2.0.
    for i, value in enumerate(df['engagement_index']):
        ax.text(i, value, "{:,}".format(value), color='m', va='bottom', rotation=45)
    plt.title('Plot for engagement index and ' + column)
    # Lay out and show the figure as part of every call rather than once when
    # the function is defined.
    plt.tight_layout()
    plt.show()

In [None]:
# Mean engagement index by sector, PEF category and PEF sub-category.
prodEngagement('Sector(s)')
prodEngagement('pef_cat')
prodEngagement('pef')

**From the above graphs:**
* The most engaged sector is "PreK-12; Higher Ed; Corporate", followed by PreK-12.
* The most preferred and engaged primary essential function category is SDO = School & District Operations.
* The most preferred sub-category of PEF (primary essential function) is Learning Management System, which falls in the SDO category.

In [None]:
# Check the increase/decrease in online engagement over time.
# Rank the products by mean engagement index, highest first. Sorting the rows
# directly replaces the previous approach of grouping lp_id by the
# engagement value and averaging the ids, which could produce a non-existent
# id whenever two products shared the same value.
df_lp_id = df_prod_eng_join.drop_duplicates('lp_id').sort_values('engagement_index', ascending=False)

# Empty placeholder frame; it is not used in this cell.
df_time_lp_id = pd.DataFrame()

# Pick the top 5 products to see the trend in online engagement.
# The loop variable is named product_id so the builtin id() is not shadowed.
for product_id in df_lp_id['lp_id'][:5]:
    df = engagement_df[engagement_df.lp_id == product_id][['lp_id', 'time', 'engagement_index']]
    plt.figure(figsize=(12,5))
    sns.kdeplot(data=df, x=df.time, y=df.engagement_index)
    prod_name = df_prod_eng_join[df_prod_eng_join.lp_id == product_id]['Product Name'].unique()
    plt.title('Plot for the product:  '+ prod_name[0])
    plt.show()

**From the above graph, it shows that:**
* The five education technologies with the highest engagement are Google Docs, Google Classroom, YouTube, Canvas and Meet.
* Online classroom engagement appears to have increased during the COVID-19 peaks, around March 2020 and November 2020. At other times, engagement with digital learning is low.
* In 2021, digital learning engagement is very low.

In [None]:
# Re-check the engagement frame's columns and dtypes before joining it with the district data.
engagement_df.info()

In [None]:
# Join engagement and district data: mean engagement index per district,
# matched to the cleaned district frame on district_id (inner join).
df_dist_engagement = (
    engagement_df.groupby('district_id')['engagement_index']
    .mean()
    .reset_index()
)
df_dist_eng_join = pd.merge(
    df_dist_engagement,
    district_info_mod,
    how='inner',
    on='district_id',
)
df_dist_eng_join.head()

In [None]:
# Plot the relation between the engagement index and a column of the district frame.
def districtEngagement(column):
    """Bar chart of the mean engagement index per value of `column`.

    Reads the module-level ``df_dist_eng_join`` frame, averages
    ``engagement_index`` within each value of `column`, and draws the bars
    from highest to lowest mean. Every bar is labelled with its value.

    Parameters
    ----------
    column : label
        Name of the ``df_dist_eng_join`` column to group by; it is also
        concatenated into the plot title.
    """
    df = df_dist_eng_join['engagement_index'].groupby(df_dist_eng_join[column]).mean().reset_index()
    df = df.sort_values(['engagement_index'], ascending=False).reset_index(drop=True)
    plt.figure(figsize=(15,9))
    ax = sns.barplot(x=df[column], y=df['engagement_index'])
    for item in ax.get_xticklabels():
        item.set_rotation(90)
    # Iterate over the values directly: Series.iteritems() was removed in pandas 2.0.
    for i, value in enumerate(df['engagement_index']):
        ax.text(i, value, "{:,}".format(value), color='m', va='bottom', rotation=45)
    plt.title('Plot for engagement index and ' + column)
    # Lay out and show the figure as part of every call rather than once when
    # the function is defined.
    plt.tight_layout()
    plt.show()

In [None]:
# Mean engagement index per state.
districtEngagement('state')

**From the above:**
* The engagement index is very high for New York, North Dakota, District of Columbia, etc.

In [None]:
# Plot the engagement index split by a state-level share (e.g. Black/Hispanic
# percentage or broadband ratio).
def districtEngagement(real_col, new_col1, new_col2):
    """Line plot of the state-level engagement index split by a share column.

    For every state, the mean of ``engagement_index`` and of `real_col` is
    taken from the module-level ``df_dist_eng_join`` frame. The engagement
    index is then divided proportionally into ``real_col * engagement_index``
    (stored as `new_col1`) and the remainder (stored as `new_col2`), and both
    are drawn as lines. This is an arithmetic split of the state average, not
    a separately measured engagement for each group.

    NOTE: this function shares its name with the one-argument bar-plot helper
    defined earlier in this notebook; once this cell has run, the earlier
    helper can no longer be called under that name.

    Parameters
    ----------
    real_col : label
        Column of ``df_dist_eng_join`` that is multiplied with the engagement index.
    new_col1 : label
        Name (and legend label) for ``real_col * engagement_index``.
    new_col2 : label
        Name (and legend label) for ``engagement_index - new_col1``.
    """
    df = df_dist_eng_join.groupby(df_dist_eng_join['state'])[['engagement_index', real_col]].mean().reset_index()
    # drop=True avoids adding a leftover "index" column to the frame.
    df = df.sort_values(['engagement_index'], ascending=False).reset_index(drop=True)
    df[new_col1] = df[real_col]*df['engagement_index']
    df[new_col2] = df['engagement_index'] - df[new_col1]
    plt.figure(figsize=(12,6))
    ax = plt.gca()

    sns.lineplot(data=df, x='state', y=new_col1, ax=ax, legend='brief', label=new_col1)
    sns.lineplot(data=df, x='state', y=new_col2, ax=ax, legend='brief', label=new_col2)
    for item in ax.get_xticklabels():
        item.set_rotation(90)
    plt.ylabel('Engagement Index')
    # plt.plot() with no arguments draws nothing; show the finished figure instead.
    plt.show()

In [None]:
# Engagement split by the Black/Hispanic student percentage
districtEngagement('black/hisp_pct_mean', 'black_engageement_index_pct', 'non_black_engageement_index_pct')

# Engagement split by the county broadband connections ratio
districtEngagement('county_connec_ratio_mean', 'high_broadband_conn_engageement_index_pct', 'avg_broadband_conn_engageement_index_pct')

# Engagement split by the free/reduced percentage
districtEngagement('free/red_pct_mean', 'free_reduced_engageement_index_pct', 'paid_engageement_index_pct')

**From the above graph, it shows that:**
* Engagement of Black/Hispanic students is higher only in District of Columbia; everywhere else, non-Black/Hispanic students are engaged more.
* Broadband speed in general does not have much effect on digital engagement.
* Free/reduced-price engagement is higher than paid engagement.