# Kaggle competition: Using digital learning data to analyze the impact of COVID-19 on student learning

In [None]:
!pip install openpyxl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import statistics
from statistics import mode
from sklearn import preprocessing
import seaborn as sns
import os
import glob
import openpyxl
import csv

### Data

In this competition there are three main files we will use:
   1- districts_info.csv
   2- products_info.csv
   3- engagement_data files
and the additional data source:
   1- COVID-19 US state policy database 10_26_2020.xlsx

In [None]:
# Competition tables: districts_info -> Df1, products_info -> Df4.
districts_csv = '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
products_csv = '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
Df1, Df4 = (pd.read_csv(csv_path) for csv_path in (districts_csv, products_csv))

# Additional data: COVID-19 US state policy database -> Df3.
# Source: https://www.openicpsr.org/openicpsr/project/119446/version/V75/view;jsessionid=851ECB80E6CB42252D396C29564184DC
policy_workbook = "../input/new-data/COVID-19 US state policy database 10_26_2020.xlsx"
Df3 = pd.read_excel(policy_workbook, engine="openpyxl")

In [None]:
# Engagement data: collect the path of every CSV in the engagement_data folder.
path = os.getcwd()  # current working directory, read again by a later cell
engagement_pattern = os.path.join(
    "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data",
    "*.csv",
)
csv_files = glob.glob(engagement_pattern)

### Assessing Data

An example of what the engagement_data files look like. Note: the engagement_data folder contains 233 CSV files.

In [None]:
# Load one engagement file (1000.csv) as a sample and preview it.
sample_engagement_csv = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/1000.csv'
Df5 = pd.read_csv(sample_engagement_csv)
Df5.head()

Districts_info file:

In [None]:
# Preview the first rows of the districts_info table.
Df1.head()

More information about districts_info file

In [None]:
# Print the column summary of the districts_info table.
Df1.info()

Products_info file:

In [None]:
# Preview the first rows of the products_info table.
Df4.head()

More information about products_info file:

In [None]:
# Print the column summary of the products_info table.
Df4.info()

COVID-19 US state policy database 10_26_2020 file:

In [None]:
# Preview the first rows of the state policy database.
Df3.head()

More information about COVID-19 US state policy database 10_26_2020 file:

In [None]:
# Print the column summary of the state policy database.
Df3.info()

### Data analysis and visualizations:

Districts_info Analysis:

In the next figure, we will see the state representation based on the number of locales in it. Connecticut contains 19 suburbs, 9 rural and 2 cities so the total number of Connecticut appearances = 30. and so on for each state.
      
       

In [None]:
# Bar chart of how many rows of districts_info belong to each state.
# Note: df1 ends up holding the matplotlib Axes returned by .plot(), not a DataFrame.
rows_per_state = Df1["state"].value_counts()
df1 = rows_per_state.plot(kind='bar', figsize=(10, 10), color='royalblue')

From the next visualization, we can see the total number of each locale in 23 states: Number of suburbs = 104 suburbs, Number of rural = 33 rural, Number of Cities = 29 cities, and Number of towns = 10 towns.

In [None]:
# Bar chart of how many rows of districts_info fall into each locale type.
# Note: df holds the matplotlib Axes returned by .plot(), not a DataFrame.
locale_counts = Df1['locale'].value_counts()
df = locale_counts.plot(kind="bar", figsize=(5, 5))
# Spelling of the title and axis label fixed ('Destribution', 'Totel', 'state').
df.set_title('Distribution of locales')
df.set_ylabel('Total number of each locale in 23 states')

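Now we will see a pie chart for the percentage of students identified as Black or Hispanic: each slice is the number of districts in a state that report this value, so it shows how those districts are distributed across the states. We can see that Connecticut has the biggest slice.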

In [None]:
# Pie chart: number of districts per state that report a 'pct_black/hispanic' value.
# The states are taken from the data instead of a hand-typed list: the old list
# held only 22 names and contained 'New York ' with a trailing space, so some
# states could never match a row of Df1.
reported_per_state = Df1.groupby('state')['pct_black/hispanic'].count()  # count() skips missing values
reported_per_state = reported_per_state[reported_per_state > 0].sort_values(ascending=False)
lis = reported_per_state.index.tolist()  # state names, largest slice first
df2 = reported_per_state.tolist()        # matching district counts
fig1, ax1 = plt.subplots()
ax1.pie(df2, labels=lis, autopct='%1.1f%%', radius=2.8)
plt.show()


This pie chart does the same for the percentage of students eligible for free or reduced-price lunch: each slice is the number of districts in a state that report this value.

In [None]:
# Pie chart: number of districts per state that report a 'pct_free/reduced' value.
# The states are taken from the data instead of a hand-typed list: the old list
# held only 22 names and contained 'New York ' with a trailing space, so some
# states could never match a row of Df1.
reported_per_state = Df1.groupby('state')['pct_free/reduced'].count()  # count() skips missing values
reported_per_state = reported_per_state[reported_per_state > 0].sort_values(ascending=False)
lis = reported_per_state.index.tolist()  # state names, largest slice first
df3 = reported_per_state.tolist()        # matching district counts
fig1, ax1 = plt.subplots()
ax1.pie(df3, labels=lis, autopct='%1.1f%%', radius=3)
plt.show()

The average expenses for 8 states [ Utah,Massachusetts,Illinois, New Jersey ,
Indiana, Florida , Texas ,Tennessee ]

In [None]:
# Per-pupil expenditure buckets, written exactly as they are looked up in 'pp_total_raw'.
li =['[6000, 8000[','[8000, 10000[','[10000, 12000[','[12000, 14000[','[14000, 16000[','[16000, 18000[','[18000, 20000[','[20000, 22000[','[22000, 24000[','[32000, 34000[']

# Number of Utah districts in each bucket.
# Fixes over the previous if/elif chain:
#   * the branch for '[16000, 180000[' was a typo and could never match;
#   * only four of the ten buckets were tallied;
#   * indexing a bucket that Utah does not have raised KeyError (.get returns 0 instead).
utah_bucket_counts = Df1.loc[Df1['state'] == 'Utah', 'pp_total_raw'].value_counts()
utah_districts_per_bucket = {bucket: int(utah_bucket_counts.get(bucket, 0)) for bucket in li}

# Keep the original per-bucket variable names available (same order as `li`).
(expenditure_6000_8000, expenditure_8000_10000, expenditure_10000_12000,
 expenditure_12000_14000, expenditure_14000_16000, expenditure_16000_18000,
 expenditure_18000_20000, expenditure_20000_22000, expenditure_22000_24000,
 expenditure_32000_34000) = (utah_districts_per_bucket[bucket] for bucket in li)

# NOTE(review): these figures are hard-coded; nothing in this cell derives them
# from the bucket counts above. Confirm how they were obtained before relying on them.
average_expenses = {"Utah":225000,"Massachusetts":199100,"Illinois":210000,"New Jersey":34000,
                   "Indiana":119000,"Florida":7000,"Texas":34000,"Tennessee":34000}
for key in average_expenses:
    print('Average expenses of',key, '=', average_expenses[key])


Now we will see the residential fixed high-speed connections over 200 kbps in at least one direction/households for each state 

In [None]:
# Districts per state, split by 'county_connections_ratio' value.
# The previous call, Df1['state'].value_counts('county_connections_ratio'), passed the
# column name as the `normalize` argument, so it plotted each state's share of the
# districts and never read the connections column at all.
connections_by_state = pd.crosstab(Df1['state'], Df1['county_connections_ratio'])
ax_connections = connections_by_state.plot(
    kind='bar',
    stacked=True,
    title='the residential fixed high-speed connections for each state',
    figsize=(10, 15),
)
ax_connections.set_ylabel('Number of districts')

###  Merging

In this part, I will merge the products_info.csv file (columns that are called: Product Name, Provider/Company Name, Sector(s), and Primary Essential Function) with all CSV files in the engagement_data folder.

In [None]:
# Add the product metadata columns to every engagement CSV and save the result.
#
# Fixes over the previous version:
#   * inputs are read from the Kaggle input folder, not a hard-coded C:/Users/... path;
#   * each output keeps the name of its input file (names used to come from
#     os.listdir(cwd), which is unrelated to the inputs and could overwrite files);
#   * the row-by-row double loop with chained `.loc` assignment is replaced by a
#     vectorised lookup on the product id.
ENGAGEMENT_DIR = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data"
MERGED_DIR = "engagement_data_after_merging"
PRODUCT_COLUMNS = ["Product Name", "Provider/Company Name", "Sector(s)", "Primary Essential Function"]

csv_files = sorted(glob.glob(os.path.join(ENGAGEMENT_DIR, "*.csv")))
os.makedirs(MERGED_DIR, exist_ok=True)

# One row per product id; if an id is repeated the last row wins, as in the old loop.
product_lookup = (
    Df4.dropna(subset=["LP ID"])
       .drop_duplicates(subset="LP ID", keep="last")
       .set_index("LP ID")[PRODUCT_COLUMNS]
)

for f in csv_files:
    df = pd.read_csv(f)
    known_product = df["lp_id"].isin(product_lookup.index)
    # Insert at positions 4..7, as before; rows whose lp_id is not in products_info
    # keep the " " placeholder.
    for position, column in enumerate(PRODUCT_COLUMNS, start=4):
        merged_values = df["lp_id"].map(product_lookup[column]).where(known_product, " ")
        df.insert(position, column, merged_values)
    # The index is still written as the first CSV column, as before.
    df.to_csv(os.path.join(MERGED_DIR, os.path.basename(f)))

An example of what the engagement data looks like after merging. Note: the engagement_data_after_merging folder contains 233 CSV files.

In [None]:
# Load one merged engagement file (1000.csv) as a sample and preview it.
merged_sample_csv = '../input/kaggle-covid19/engagement_data_after_merging/1000.csv'
Df6 = pd.read_csv(merged_sample_csv)
Df6.head()

#### -What is the picture of digital connectivity and engagement in 2020?


First, we need the total percentage of students in the district who have at least one page-load event of a given product on a given day (pct_access) for each state.
Note: Texas = 357.76.

In [None]:
# Total pct_access per state: sum over every file in the state's folder, divided by 100.
#
# Fixes over the previous version:
#   * `total = ...` overwrote the running total, so only the last file counted;
#   * the label came from filename[count] (starting at -1), which relied on os.walk
#     and os.listdir returning folders in the same order and printed a spurious
#     line for the root folder. The label is now the state folder's own name.
my_directory = '../input/covid-19/states'
for state in sorted(os.listdir(my_directory)):
    state_dir = os.path.join(my_directory, state)
    if not os.path.isdir(state_dir):
        continue
    total = 0
    for folder, sub_folders, files in os.walk(state_dir):
        for special_file in files:
            df = pd.read_csv(os.path.join(folder, special_file))
            total += df['pct_access'].sum()/100
    print(state,'= ',total)
    print('______________________________________________________________________________')


Second, we will calculate the total page-load events per one thousand students of a given product on a given day
(engagement_index). Note: Texas = 10697.75.

In [None]:
# Total engagement_index per state: sum over every file in the state's folder, divided by 1000.
#
# Fixes over the previous version:
#   * `total = ...` overwrote the running total, so only the last file counted;
#   * the label came from filename[count] (starting at -1), which relied on os.walk
#     and os.listdir returning folders in the same order and printed a spurious
#     line for the root folder. The label is now the state folder's own name.
my_directory = '../input/covid-19/states'
for state in sorted(os.listdir(my_directory)):
    state_dir = os.path.join(my_directory, state)
    if not os.path.isdir(state_dir):
        continue
    total = 0
    for folder, sub_folders, files in os.walk(state_dir):
        for special_file in files:
            df = pd.read_csv(os.path.join(folder, special_file))
            total += (df['engagement_index'].sum())/1000
    print(state,'= ',total)
    print('______________________________________________________________________________')

#### -What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?

We need to know which sectors of users use the learning products in each state.

In [None]:
def most_frequent2(my_directory):
    """Collect one 'Sector(s)' value per CSV file found under ``my_directory``.

    Every file reached by ``os.walk`` is read with pandas; the value at
    position 1 of its 'Sector(s)' ``value_counts()`` index is appended to the
    result (value_counts sorts by descending frequency, so this is the second
    most frequent value).
    NOTE(review): position 1 rather than 0 is presumably meant to skip a
    placeholder value that dominates the column; confirm.

    Returns a list with one entry per file, in ``os.walk`` order.
    Raises IndexError if a file has fewer than two distinct 'Sector(s)' values.
    """
    # Same call as the original; its result is unused, but it raises early
    # when the directory cannot be listed.
    os.listdir(my_directory)
    return [
        pd.read_csv(os.path.join(folder, name))['Sector(s)'].value_counts().index.tolist()[1]
        for folder, _sub_folders, names in os.walk(my_directory)
        for name in names
    ]
# Pie chart of how often each sector is returned by most_frequent2 (one value per file).
my_directory = '../input/covid-19/states'
sector_per_file = most_frequent2(my_directory)

# Every count comes from the data. The previous version hard-coded
# 'PreK-12': 189 and zipped against range(254), which would silently drop
# entries whenever the list is longer than 254.
sector_counts = {}
for sector in sector_per_file:
    sector_counts[sector] = sector_counts.get(sector, 0) + 1

# Largest count first, for both the pie and the displayed dict.
hg = sorted(sector_counts.items(), key=lambda x: x[1], reverse=True)
dic = dict(hg)

fig1, ax1 = plt.subplots()
ax1.pie(dic.values(), labels=dic.keys(), autopct='%1.1f%%',radius =2.8)
plt.show()
dic

In the previous cell, we can see that most of the learning products are used in PreK-12 and Higher Ed,
so in the future these products will likely be used in more education sectors, and their use in the Corporate sector will grow.

Now we will illustrate how frequently each digital learning product appears across the states.

In [None]:
def most_frequent(my_directory):
    """Collect one 'Product Name' value per CSV file found under ``my_directory``.

    Every file reached by ``os.walk`` is read with pandas; the value at
    position 1 of its 'Product Name' ``value_counts()`` index is appended to
    the result (value_counts sorts by descending frequency, so this is the
    second most frequent value).
    NOTE(review): position 1 rather than 0 is presumably meant to skip a
    placeholder value that dominates the column; confirm.

    Returns a list with one entry per file, in ``os.walk`` order.
    Raises IndexError if a file has fewer than two distinct 'Product Name' values.
    """
    # Same call as the original; its result is unused, but it raises early
    # when the directory cannot be listed.
    os.listdir(my_directory)
    return [
        pd.read_csv(os.path.join(folder, name))['Product Name'].value_counts().index.tolist()[1]
        for folder, _sub_folders, names in os.walk(my_directory)
        for name in names
    ]
# Pie chart of how often each product is returned by most_frequent (one value per file).
my_directory = '../input/covid-19/states'
product_per_file = most_frequent(my_directory)

# Every count comes from the data. The previous version hard-coded
# 'Google Docs': 65 and zipped against range(235), which would silently drop
# entries whenever the list is longer than 235.
product_counts = {}
for product in product_per_file:
    product_counts[product] = product_counts.get(product, 0) + 1

# Largest count first.
dic2 = dict(sorted(product_counts.items(), key=lambda x: x[1], reverse=True))

fig1, ax1 = plt.subplots()
ax1.pie(dic2.values(), labels=dic2.keys(), autopct='%1.1f%%',radius =3)
plt.show()

From the previous code, we can see that the most frequent learning products in all states are: ('Google Docs', 65), ('Google Drive', 28), ('Google Classroom', 20), ('Google Calendar', 15), ('Google Sheets', 14), ('Disney+', 12), ('Clever', 12).

#### -How does student engagement with different types of education technology change over the course of the pandemic?

For this question, I will divide 2020 into two sections (from 2020-01-01 to 2020-06-30) and
(from 2020-07-01 to 2020-12-31) to evaluate the total page-load events per one thousand students of a given product on a given day for the two sections in all states.
(Texas = 11315.65)

In [None]:
# engagement_index per state for rows dated up to 2020-06-30, divided by 100.
#
# Fixes over the previous version:
#   * `total = ...` overwrote the running total, so only the last file counted;
#   * the label came from filename[count] (starting at -1), which relied on os.walk
#     and os.listdir returning folders in the same order and printed a spurious
#     line for the root folder. The label is now the state folder's own name.
# NOTE(review): this cell divides by 100 while the full-year total above divides
# by 1000, so the two sets of numbers are not on the same scale. Confirm intent.
my_directory = '../input/covid-19/states'
for state in sorted(os.listdir(my_directory)):
    state_dir = os.path.join(my_directory, state)
    if not os.path.isdir(state_dir):
        continue
    total = 0
    for folder, sub_folders, files in os.walk(state_dir):
        for special_file in files:
            df = pd.read_csv(os.path.join(folder, special_file))
            # 'time' is compared as text against an ISO-style date string.
            total += df.loc[df['time'] <= '2020-06-30','engagement_index'].sum()/100
    print(state,'= ',total)
    print('______________________________________________________________________________')

In [None]:
# engagement_index per state for rows dated from 2020-07-01 onwards, divided by 100.
#
# Fixes over the previous version:
#   * `total = ...` overwrote the running total, so only the last file counted;
#   * the label came from filename[count] (starting at -1), which relied on os.walk
#     and os.listdir returning folders in the same order and printed a spurious
#     line for the root folder. The label is now the state folder's own name.
my_directory = '../input/covid-19/states'
for state in sorted(os.listdir(my_directory)):
    state_dir = os.path.join(my_directory, state)
    if not os.path.isdir(state_dir):
        continue
    total = 0
    for folder, sub_folders, files in os.walk(state_dir):
        for special_file in files:
            df = pd.read_csv(os.path.join(folder, special_file))
            # 'time' is compared as text against an ISO-style date string.
            total += df.loc[(df['time'] >= '2020-07-01'),'engagement_index'].sum()/100
    print(state,'= ',total)
    print('______________________________________________________________________________')

#### How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Socioeconomic status?

This question is composed of some parts; first, we can see from the next code that each state 
differs from each other.

In [None]:
# Total engagement_index per state: sum over every file in the state's folder, divided by 100.
#
# Fixes over the previous version:
#   * `total = ...` overwrote the running total, so only the last file counted;
#   * the label came from filename[count] (starting at -1), which relied on os.walk
#     and os.listdir returning folders in the same order and printed a spurious
#     line for the root folder. The label is now the state folder's own name.
# NOTE(review): other cells in this notebook divide engagement_index totals by
# 1000 rather than 100. Confirm which scale is intended.
my_directory = '../input/covid-19/states'
for state in sorted(os.listdir(my_directory)):
    state_dir = os.path.join(my_directory, state)
    if not os.path.isdir(state_dir):
        continue
    total = 0
    for folder, sub_folders, files in os.walk(state_dir):
        for special_file in files:
            df = pd.read_csv(os.path.join(folder, special_file))
            total += df['engagement_index'].sum()/100
    print(state,'= ',total)
    print('______________________________________________________________________________')

Second, from a previous figure, we know that the largest slices of the pie chart for students identified as Black or Hispanic belong to Connecticut (18.2%), Utah (17.6%) and Massachusetts (12.7%), so we will evaluate the student engagement with online learning for these three states.

In [None]:
# Total engagement_index (divided by 1000) for Connecticut, Utah and Massachusetts.
#
# The state is now picked by its folder name. The previous version compared the
# full walked path against literal strings and printed filename[count], which
# relied on os.walk and os.listdir returning folders in the same order.
my_directory = '../input/covid-19/states'
selected_states = ('Connecticut', 'Utah', 'Massachusetts')
for state in sorted(os.listdir(my_directory)):
    state_dir = os.path.join(my_directory, state)
    if state not in selected_states or not os.path.isdir(state_dir):
        continue
    total = 0
    print(state)
    for folder, sub_folders, files in os.walk(state_dir):
        for special_file in files:
            df = pd.read_csv(os.path.join(folder, special_file))
            total += df['engagement_index'].sum()/1000
    print(total)
    print('________________________________________________________________')

From a previous figure, we know that the largest slices of the pie chart for students eligible for free or reduced-price lunch belong to Connecticut = 21.9%,
Utah = 21.2% and
Illinois = 13.1%, so we will evaluate the student engagement with online learning for these three states.

In [None]:
# Total engagement_index (divided by 1000) for Connecticut, Utah and Illinois.
#
# The state is now picked by its folder name. The previous version compared the
# full walked path against literal strings and printed filename[count], which
# relied on os.walk and os.listdir returning folders in the same order.
my_directory = '../input/covid-19/states'
selected_states = ('Connecticut', 'Utah', 'Illinois')
for state in sorted(os.listdir(my_directory)):
    state_dir = os.path.join(my_directory, state)
    if state not in selected_states or not os.path.isdir(state_dir):
        continue
    total = 0
    print(state)
    for folder, sub_folders, files in os.walk(state_dir):
        for special_file in files:
            df = pd.read_csv(os.path.join(folder, special_file))
            total += df['engagement_index'].sum()/1000
    print(total)
    print('________________________________________________________________')

Third, from a previous cell, we know that Utah, Massachusetts, and Illinois have the highest average expenses of all states
(Average expenses of Utah = 225000,
Average expenses of Massachusetts = 199100,
Average expenses of Illinois = 210000), so we will evaluate the student engagement with online learning for these three states.

In [None]:
# Total engagement_index (divided by 1000) for Utah, Massachusetts and Illinois.
#
# The state is now picked by its folder name. The previous version compared the
# full walked path against literal strings and printed filename[count], which
# relied on os.walk and os.listdir returning folders in the same order.
my_directory = '../input/covid-19/states'
selected_states = ('Utah', 'Massachusetts', 'Illinois')
for state in sorted(os.listdir(my_directory)):
    state_dir = os.path.join(my_directory, state)
    if state not in selected_states or not os.path.isdir(state_dir):
        continue
    total = 0
    print(state)
    for folder, sub_folders, files in os.walk(state_dir):
        for special_file in files:
            df = pd.read_csv(os.path.join(folder, special_file))
            total += df['engagement_index'].sum()/1000
    print(total)
    print('________________________________________________________________')

#### Do certain state interventions, practices or policies (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?

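From the previous analysis, the chart of residential fixed high-speed connections over 200 kbps in at least one direction/households showed the biggest values for
Connecticut = 0.170455,
Utah = 0.164773,
Massachusetts = 0.119318.
Note that these three figures equal each state's share of the districts in districts_info (for example, 30 of 176 districts ≈ 0.170455 for Connecticut), because that chart was produced by a normalized `value_counts` on the `state` column; they are not connection ratios.
The distribution of the percentage of students identified in the districts eligible for free or reduced-price lunch is the biggest in
Connecticut = 21.9%, Utah = 21.2% and Illinois = 13.1%,
so yes, it correlates with the increase or decrease in online engagement.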