In [None]:
!pip install prophet

In [None]:
!pip install xlrd

In [None]:
!pip install openpyxl

In [None]:
import logging 
import glob
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np

import plotly as py
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline

from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly

from plotly.offline import plot, iplot, init_notebook_mode 
init_notebook_mode(connected=True)

In [None]:
# Locations of the Kaggle input datasets used by this notebook.
LEARNPLATFORM_DIR = '../input/learnplatform-covid19-impact-on-digital-learning'
STATE_POLICY_XLSX = '../input/us-state-policy/COVID-19 US state policy database 9_2_2021.xlsx'

# Reference tables: one row per district / per product.
districts = pd.read_csv(f'{LEARNPLATFORM_DIR}/districts_info.csv')
products = pd.read_csv(f'{LEARNPLATFORM_DIR}/products_info.csv')

# State policy workbook (Excel).
state_policy = pd.read_excel(STATE_POLICY_XLSX)

# Paths of all CSV files in the engagement folder; they are read in the next cell.
eng_path = f'{LEARNPLATFORM_DIR}/engagement_data'
eng_files = glob.glob(f'{eng_path}/*.csv')

In [None]:
# Read every engagement CSV and tag its rows with the district id, which is
# the file name without its extension.
files = []

for file in eng_files:
    df = pd.read_csv(file, index_col = None, header = 0)
    # Use the LAST path component rather than a fixed position ([4]) so the
    # id is still extracted correctly if the input directory depth changes.
    district_id = file.split('/')[-1].split('.')[0]
    df['district_id'] = district_id
    files.append(df)

# Stack all districts into one frame with a fresh 0..n-1 index.
engagement = pd.concat(files, ignore_index = True)
engagement['time'] = pd.to_datetime(engagement['time'])

logging.info('Loading data sets done.')

## Covid19 impact on Digital Learning
- Current research shows educational outcomes are far from equitable. The imbalance was exacerbated by the COVID-19 pandemic. There's an urgent need to better understand and measure the scope and impact of the pandemic on these inequities.

- Education technology company LearnPlatform was founded in 2014 with a mission to expand equitable access to education technology for all students and teachers. LearnPlatform’s comprehensive edtech effectiveness system is used by districts and states to continuously improve the safety, equity, and effectiveness of their educational technology. LearnPlatform does so by generating an evidence basis for what’s working and enacting it to benefit students, teachers, and budgets.

- In this analytics competition, I work to uncover trends in digital learning by analysing how engagement with digital learning relates to factors like district demographics, broadband access, and state/national level policies and events.

## Table of Contents
- [Basic Information and Preprocessing](#BasicInformationandPreprocessing)
     1. [Districts](#Districts)
     2. [Products](#Products)
     3. [Engagement](#Engagement)
- [Products Visuals](#ProductsVisuals)
- [Students Activity](#StudentActivity)

## Basic Information and Preprocessing

## Districts 

The district file includes information about the characteristics of school districts:
- **district_id**- The unique identifier of the school district
- **state**- The state where the district resides in
- **locale**- NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See Locale Boundaries User's Manual for more information.
- **pct_black/hispanic**- Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data
- **pct_free/reduced**- Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data
- **county_connections_ratio**- ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version)
- **pp_total_raw**- Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district.


In [None]:
# Preview the first three rows of the districts table.
districts.head(3)

In [None]:
# Summary of the districts table (columns, dtypes, non-null counts).
districts.info()

In [None]:
# Percentage of districts whose `state` is missing, rounded to one decimal.
round(districts['state'].isna().mean() * 100, 1)

About 24.5% of the districts have no state information.

In [None]:
# Drop, in place, every district row that has a missing value in any column.
districts.dropna(inplace = True)

In [None]:
def _first_bound(interval_text):
    """Return the text before the first comma, without its leading character."""
    return interval_text.split(',')[0][1:]


# Presumably the raw values are interval strings such as "[0.2, 0.4[" (TODO confirm);
# each is replaced by its first bound plus a fixed offset.
percent_columns = ['pct_black/hispanic', 'pct_free/reduced']
for column in percent_columns:
    districts[column] = districts[column].apply(lambda text: float(_first_bound(text)) + 0.1)

districts['pp_total_raw'] = districts['pp_total_raw'].apply(lambda text: int(_first_bound(text)) + 1000)

# The connections ratio is not used in the rest of the analysis.
districts.drop(columns = ['county_connections_ratio'], inplace = True)

districts.head(3)

## Products 

The products file includes information about the characteristics of the top 372 products with the most users in 2020:
- **LP ID**- The unique identifier of the product
- **URL**- Web Link to the specific product
- **Product Name**- Name of the specific product
- **Provider/Company**- Name of the product provider
- **Sector(s)**- Sector of education where the product is used
- **Primary Essential Function**- The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories have multiple sub-categories with which the products were labeled


In [None]:
# Preview the first three rows of the products table.
products.head(3)

In [None]:
# Summary of the products table (columns, dtypes, non-null counts).
products.info()

In [None]:
# Derive the top-level category from 'Primary Essential Function': the text
# before the first '-' with surrounding whitespace removed. Products with no
# function get the placeholder 'x'.
# Vectorized on purpose: the previous row-by-row `products[col][i] = ...` was a
# chained assignment, which pandas does not guarantee to write back to `products`.
products['Basic_category'] = (
    products['Primary Essential Function']
    .str.split('-')
    .str[0]
    .str.strip()
    .fillna('x')
)

products.head(3)

## Engagement

The engagement data are aggregated at school district level, and each file in the folder `engagement_data` represents data from one school district. The 4-digit file name represents `district_id` which can be used to link to district information in `district_info.csv`. The `lp_id` can be used to link to product information in `product_info.csv`.

- **time**- date in "YYYY-MM-DD" 
- **lp_id**- The unique identifier of the product
- **pct_access**- Percentage of students in the district have at least one page-load event of a given product and on a given day
- **engagement_index**- Total page-load events per one thousand students of a given product and on a given day


In [None]:
# Preview the first rows of the combined engagement table.
engagement.head()

In [None]:
# Summary of the engagement table (columns, dtypes, non-null counts).
engagement.info()

## Districts

In [None]:
# State name -> two-letter abbreviation lookup. This code has been taken from:
# https://www.kaggle.com/dmitryuarov/eda-covid-19-impact-on-digital-learning/notebook
state_abb = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District Of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

districts['state_abb'] = districts['state'].map(state_abb)

# Count districts per state abbreviation once and read locations/values straight
# from the Series. The previous `.to_frame().reset_index()['index']` lookup
# depended on how a particular pandas version names the reset columns.
state_abb_counts = districts['state_abb'].value_counts()

fig = go.Figure()
layout = dict(
    title_text = "Count of districts in the available States",
    title_font = dict(size = 25),
    geo_scope = 'usa'
)

fig.add_trace(
    go.Choropleth(
        locations = state_abb_counts.index,
        zmax = 1,
        z = state_abb_counts.values,
        locationmode = 'USA-states',
        marker_line_color = 'white',
        geo = 'geo',
        colorscale = "viridis",
    )
)

fig.update_layout(layout)
fig.show()




In [None]:
# District count per state as a two-column frame: 'index' holds the state name
# and 'state' holds the count. Naming both columns explicitly keeps the plot
# independent of the column names `value_counts().reset_index()` produces in a
# given pandas version.
state_count_frame = (
    districts['state']
    .value_counts()
    .rename_axis('index')
    .reset_index(name = 'state')
)

# Bar labels come from the same frame as the bar heights, so they cannot get
# out of step with the bars.
fig = px.bar(state_count_frame, x = 'index', y = 'state',
             text = 'state',
             labels = {'state':'# of districts','index':'State'},
             title = "Number of Districts in the Available States")
fig.show()

**Conclusion**
- Utah has the largest number of districts in the dataset.
- Minnesota and Florida have the fewest districts.

In [None]:
# District counts per state (rows) and locale (columns), plus each locale's
# share of the state's total in percent.
temp = pd.crosstab(districts.state, districts.locale)
temp["summation"] = temp.sum(axis=1)

# share column -> locale column it is computed from
share_sources = {
    "city_percent": "City",
    "rural_percent": "Rural",
    "suburb_percent": "Suburb",
    "town_percent": "Town",
}
for share_name, locale_name in share_sources.items():
    temp[share_name] = temp[locale_name] * 100 / temp["summation"]

# Stacked bar chart: one trace per locale share.
fig = go.Figure()
for share_name, locale_name in share_sources.items():
    fig.add_trace(go.Bar(x=temp.index, y=temp[share_name], name=f"Percentage {locale_name}"))

legend_above_plot = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
fig.update_layout(
    title=dict(text="Locale Distribution per State", font_size=20, x=0.5),
    barmode='stack',
    legend=legend_above_plot,
)


**Conclusion**
- Florida, Michigan, Minnesota, New Jersey and Wisconsin are 100% suburb.
- Most of the data is about the suburb locale.

In [None]:
# District count per locale: 'index' holds the locale and 'count' the number of
# districts. Columns are named explicitly because the names produced by
# `value_counts().reset_index()` differ between pandas versions.
locale_count_frame = (
    districts['locale']
    .value_counts()
    .rename_axis('index')
    .reset_index(name = 'count')
)

fig = px.bar(locale_count_frame,
             x = 'index', y = 'count',
             text = 'count',
             title = "Number of Districts in Each Type of Area")
fig.show()

**Conclusion**
- 52 Districts have suburb locale.
- Most of the data that we have is about suburb

In [None]:
# Side-by-side density plots of the two percentage characteristics.
# The figure size is passed to `subplots` directly: setting it through
# `sns.set` after the figure exists only affects figures created later.
fig, axes = plt.subplots(1, 2, figsize = (15, 9))

for ax, column in zip(axes, ["pct_black/hispanic", "pct_free/reduced"]):
    # `fill` replaces the deprecated `shade` argument of kdeplot.
    sns.kdeplot(data=districts, x=column, color='blue', fill=True, alpha=0.9,
                linewidth=1.5, edgecolor='black', ax=ax)

# Kept so that figures created after this cell still get the seaborn theme
# and the same default size as before.
sns.set(rc={'figure.figsize':(15,9)})
plt.suptitle("Distribution of Characteristics of school districts");


**Conclusion**
- The average percentage of students who identified themselves as Black or Hispanic is 23.2%, though the most common value is 10%.

- The average percentage of students eligible for free or reduced-price lunch is 38%, though the most common values are ~30% and ~60%.

In [None]:
# Density plot of per-pupil expenditure. Only one variable is plotted, so a
# single axis is used instead of a 1x2 grid with an empty second panel.
fig, ax = plt.subplots(figsize = (15, 9))

# `fill` replaces the deprecated `shade` argument of kdeplot.
sns.kdeplot(data=districts, x="pp_total_raw", color='blue', fill=True, alpha=0.9,
            linewidth=1.5, edgecolor='black', ax=ax)

# Kept so that figures created after this cell still get the seaborn theme
# and the same default size as before.
sns.set(rc={'figure.figsize':(15,9)})
plt.suptitle("Distribution of Characteristics of school districts");



**Conclusion**
- The average Per-pupil total expenditure (sum of local and federal expenditure) is \\$11 204.50

- The most common value is \\$10 000

In [None]:
# Mean of each district characteristic per locale, shown as one bar chart each.
locale_metrics = ['pct_black/hispanic', 'pct_free/reduced', 'pp_total_raw']

dist_area_group = districts.groupby('locale')[locale_metrics].mean().reset_index()

for metric in locale_metrics:
    fig = px.bar(dist_area_group, x = 'locale', y = metric)
    fig.show()

**Conclusion**
- Most students who identified themselves as Black or Hispanic reside in cities. A small percentage reside in rural areas or towns.
- In cities and towns, 50% of the students are eligible for free or reduced-price lunch.
- The highest total expenditure per student is in rural areas.

## Products Visuals

In [None]:
# Ten providers with the most products: 'index' holds the provider name and
# 'Provider/Company Name' holds the product count. Columns are named explicitly
# because the names produced by `value_counts().reset_index()` differ between
# pandas versions.
top_providers = (
    products['Provider/Company Name']
    .value_counts()
    .rename_axis('index')
    .reset_index(name = 'Provider/Company Name')
    .head(10)
)

fig = px.bar(top_providers, x = 'Provider/Company Name',
             y = 'index', title = 'Top 10 of learning Providers/Companys',
             labels = {'index':'Provider/Company Name','Provider/Company Name':''},
             width = 700, height = 700)
fig.show()

**Conclusion**
- Google LLC has the most number of products

In [None]:
# Product count per sector: 'index' holds the sector and 'Sector(s)' the count.
# Columns are named explicitly because the names produced by
# `value_counts().reset_index()` differ between pandas versions.
sector_count_frame = (
    products['Sector(s)']
    .value_counts()
    .rename_axis('index')
    .reset_index(name = 'Sector(s)')
)

# Bar labels are read from the same frame as the bar heights.
fig = px.bar(sector_count_frame, x = 'index', y = 'Sector(s)',
             text = 'Sector(s)',
             labels = {'Sector(s)':'# of products','index':'Sector(s)'},
             title = "Number of Products used in Different Sectors")
fig.show()

**Conclusion**
- There are a lot of products that are used in PreK-12.
- Corporate and Higher Ed;Corporate have the fewest products.

In [None]:
# Product count per primary essential function (horizontal bars): 'index' holds
# the function label and 'Primary Essential Function' the count. Columns are
# named explicitly because the names produced by `value_counts().reset_index()`
# differ between pandas versions.
function_count_frame = (
    products['Primary Essential Function']
    .value_counts()
    .rename_axis('index')
    .reset_index(name = 'Primary Essential Function')
)

fig = px.bar(function_count_frame, y = 'index',
             x = 'Primary Essential Function',
             labels = {'index':'Primary Essential Function','Primary Essential Function':
                       'Number of Products'}, title = "Count of Products by Subcategory")
fig.show()

**Conclusion**
- The most common primary function among the products is 'Digital Learning Platforms'.

In [None]:
# Share of products per basic category, ignoring the 'x' placeholder given to
# products without a category. 'index' holds the category and 'count' the
# number of products; columns are named explicitly because the names produced
# by `value_counts().reset_index()` differ between pandas versions.
category_count_frame = (
    products.query("Basic_category != 'x'")['Basic_category']
    .value_counts()
    .rename_axis('index')
    .reset_index(name = 'count')
)

fig = px.pie(category_count_frame,
             values = 'count', names = 'index', width = 700, height = 700,
             title = "Count of Products by Category")
fig.show()

## Student Activity

In [None]:
# Join products -> engagement (on product id) -> districts (on district id).
# `district_id` in the engagement rows is cast to int64 before it is matched
# against the districts table.
merged_data = (
    products
    .merge(engagement, left_on = 'LP ID', right_on = 'lp_id')
    .astype({'district_id': 'int64'})
    .merge(districts, on = 'district_id')
    .drop(columns = ['URL', 'lp_id', 'state_abb'])
)
merged_data.head(3)

In [None]:
def _daily_mean(group_column, metric):
    """Mean of `metric` in merged_data for every (group_column, time) pair."""
    return merged_data.groupby([group_column, 'time']).agg({metric: 'mean'}).reset_index()


# NOTE(review): the "с" in the *_acсess names appears to be a Cyrillic letter,
# not a Latin "c". It is kept as-is because other cells use these exact names.
st_acсess = _daily_mean('state', 'pct_access')
st_eng = _daily_mean('state', 'engagement_index')
loc_acсess = _daily_mean('locale', 'pct_access')
loc_eng = _daily_mean('locale', 'engagement_index')
cat_acсess = _daily_mean('Basic_category', 'pct_access')
cat_eng = _daily_mean('Basic_category', 'engagement_index')

# Add the weekday number (Monday = 0) to every aggregate.
for daily_frame in (st_acсess, st_eng, loc_acсess, loc_eng, cat_acсess, cat_eng):
    daily_frame['day_of_week'] = daily_frame['time'].dt.dayofweek

st_acсess.head(3)

## What is the picture of digital connectivity and engagement in 2020?
- In 2020, from January to the beginning of March, student engagement with digital learning was steady for most of the states. A change can be noticed from 11 March, when the WHO declared Covid-19 a pandemic: student engagement with digital learning rose for most of the states.

- Engagement with digital learning also dropped from June to August, because most schools had closed for the summer holidays. This is a generalization: different schools, in the same or in different states, start their summer holidays at different times.

- Engagement with digital learning rose again after the summer holidays. Covid-19 was still an issue across the world and students were still using digital learning platforms.

- New York has the highest digital engagement throughout the year. This may be because NY was one of the states hit hardest by Covid-19, and students were forced to use digital learning.

- The plots below provide more detail on digital connectivity and engagement in 2020.

## What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?

- On 11 March 2020, the World Health Organization declared Covid-19 a pandemic, and student activity dropped across the states. The 'pct_access' (percentage of students in the district who have at least one page-load event of a given product on a given day) dropped for every state during the first week after the declaration. It started to rise during the second week in some states. This sudden drop during the first week can be attributed to panic.

- After 3 weeks, student activity picked up for most of the states and remained almost stable for the rest of the year, with a drop during the summer holidays.

- Generally, student activity was higher after Covid-19 was declared a pandemic than before it was declared.

- The plots below show in detail how the pandemic affected online education for each state,locale and products that are used in digital learning.

## Impact of covid19 on digital learning on States
- **Florida:** Student activity was stable at the start of 2020, from January to 3rd March. After 3rd March there was a sharp drop in student engagement, and there was very little student activity in March. Activity picked up at the beginning of April, but started to decrease again in May. It picked up again in September and remained stable for the rest of the year, with a few drops here and there.

- **Illinois:** Student activity was stable at the start of 2020, from January to mid-February. It increased in mid-February and remained stable for the rest of the year, with a drop during the summer holiday season.

- **Michigan:** Student activity was stable at the start of 2020, from January to the beginning of March. From March to August there is no or very little student activity. Activity can be noticed again at the beginning of September, and it remained stable for the rest of the year.

- **Minnesota:** Student activity was stable at the start of 2020, from January to the beginning of March. It dropped at the end of March, rose again in April and dropped at the end of April. No student activity can be noticed from then until the end of the year.

- **Missouri:** Student activity was stable at the start of 2020, from January to mid-February. It increased in mid-February and remained stable for the rest of the year, with a drop during the summer holiday season.

- **New Jersey:** Student activity was stable at the start of the year. It increased after Covid-19 was declared a pandemic by the WHO and remained stable for the rest of the year, with a drop during the summer holiday season.

- **New York:** Student activity was stable at the start of the year. It increased after Covid-19 was declared a pandemic by the WHO and remained stable for the rest of the year, with a drop during the summer holiday season. It had the highest student activity throughout the year.

- **North Carolina:** Student activity was stable from the start of the year until mid-March. Very little student activity can be noticed from mid-March to the end of September. From the end of September, student activity remained stable until the end of the year.

- **Texas:** Student activity was stable from the start of the year until the end of February. There is no data for Texas from March to mid-May. From mid-May, student activity dropped during the summer holidays, and after the holidays it remained stable for the rest of the year.

- **Utah:** Student activity was stable throughout 2020. It only dropped during the summer holidays and continued to be stable for the rest of the year.

- **Virginia:** Student activity was stable at the start of the year, from January to the beginning of March. It started dropping at the beginning of March and continued to do so until the summer holidays. It rose after the summer holidays and remained stable for the rest of the year.

- **Washington:** Student activity was stable at the start of the year, from January until mid-March. It dropped in mid-March and rose at the start of April. During April it started dropping again, and this continued through the summer holidays. It rose after the summer holidays and remained stable for the rest of the year.

- **Wisconsin:** Student activity was stable throughout 2020. It only dropped during the summer holidays and continued to be stable for the rest of the year.

- The plots below show how the student activity (pct_access and engagement index) were for each state in the year 2020. 
- To filter the plotly chart,click twice on the 'state' that you want to see.

In [None]:

# Daily mean pct_access per state, with the WHO pandemic declaration and the
# summer holidays marked. This code has been taken from:
# https://www.kaggle.com/dmitryuarov/eda-covid-19-impact-on-digital-learning/notebook
pandemic_date = '2020-03-11'
grid_colour = '#f5f2f2'

# Axis styling shared by the x and y axes.
axis_style = dict(showline = True, linecolor = grid_colour, linewidth = 2, tickfont_size = 12)

fig = px.line(st_acсess, x = "time", y = "pct_access", color = "state", line_group = "state")

fig.update_layout(
    plot_bgcolor = 'white',
    title = 'Dynamics of pct_access of all products by states',
    title_font_size = 20,
    title_x = 0.5,
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(showgrid = True, gridwidth = 1, gridcolor = grid_colour, **axis_style)

# Red vertical line plus an arrow label at the pandemic declaration.
fig.add_vline(x = pandemic_date, line_width = 3, line_color = "red")
pandemic_label = dict(
    x = pandemic_date,
    y = 2.7,
    text = "WHO has declared Covid-19 a pandemic",
    showarrow = True,
    font = dict(family = "monospace", size = 11, color = "black"),
    arrowhead = 2,
    arrowsize = 1,
    arrowwidth = 2,
    arrowcolor = "#636363",
    ax = 130,
    ay = 1,
)
fig.add_annotation(**pandemic_label)

# Shaded band and label for the summer holidays.
fig.add_vrect(x0 = "2020-06-01", x1 = "2020-08-31", fillcolor = "yellow", opacity = 0.25, line_width = 0)
holiday_label = dict(x = '2020-07-15', y = 2.25, text = "Summer holidays",
                     showarrow = False, font = dict(size = 11))
fig.add_annotation(**holiday_label)

fig.update_traces(line_width = 1)

fig.show()

In [None]:
# Daily mean engagement index per state, with the WHO pandemic declaration and
# the summer holidays marked.
pandemic_date = '2020-03-11'
grid_colour = '#f5f2f2'

# Axis styling shared by the x and y axes.
axis_style = dict(showline = True, linecolor = grid_colour, linewidth = 2, tickfont_size = 12)

fig = px.line(st_eng, x = "time", y = "engagement_index", color = "state", line_group = "state")

fig.update_layout(
    plot_bgcolor = 'white',
    title = 'Dynamics of engagement index of all products by states',
    title_font_size = 20,
    title_x = 0.5,
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(showgrid = True, gridwidth = 1, gridcolor = grid_colour, **axis_style)

# Red vertical line plus an arrow label at the pandemic declaration.
fig.add_vline(x = pandemic_date, line_width = 3, line_color = "red")
pandemic_label = dict(
    x = pandemic_date,
    y = 1150,
    text = "WHO has declared Covid-19 a pandemic",
    showarrow = True,
    font = dict(size = 11),
    arrowhead = 2,
    arrowsize = 1,
    arrowwidth = 2,
    arrowcolor = "#636363",
    ax = 130,
    ay = 1,
)
fig.add_annotation(**pandemic_label)

# Shaded band and label for the summer holidays.
fig.add_vrect(x0 = "2020-06-01", x1 = "2020-08-31", fillcolor = "yellow", opacity = 0.25, line_width = 0)
holiday_label = dict(x = '2020-07-15', y = 900, text = "Summer holidays",
                     showarrow = False, font = dict(size = 11))
fig.add_annotation(**holiday_label)

fig.update_traces(line_width = 1)

fig.show()

In [None]:
# Animated US map of the monthly mean of each state-level metric.
months_map = {1:"January",2:"February",3:"March",4:"April",
              5:"May",6:"June",7:"July",8:"August",9:"September",
              10:"October",11:"November",12:"December"}

# Calendar order for the animation frames. Without it the frames follow the
# alphabetical order of the month names produced by the groupby below.
month_order = list(months_map.values())

for i in [st_acсess, st_eng]:
    # The metric is the third column of each aggregate
    # ('pct_access' or 'engagement_index').
    metric = i.columns[2]

    i['state_abb'] = i['state'].map(state_abb)
    i['month'] = i.time.dt.month.map(months_map)

    # Compute the monthly means once and reuse them for the plot.
    monthly = i.groupby(['state', 'state_abb', 'month']).agg({metric: 'mean'}).reset_index()

    fig = px.choropleth(data_frame = monthly, locations = "state_abb", locationmode = "USA-states",
                        color = metric, scope = "usa",
                        color_continuous_scale = "viridis", animation_frame = "month",
                        hover_name = "state",
                        category_orders = {"month": month_order})

    fig.update_layout(title_text = f'Monthly dynamics of {metric}', title_font = dict(size = 25,color = "black"))

    fig.show()

In [None]:
cov_imp = pd.DataFrame(st_acсess['state'].unique().tolist()).rename(columns = {0: 'state'})

# We have no information about Texas during the start of pandemic
cov_imp = cov_imp.query("state != 'Texas'").reset_index()
cov_imp.drop('index', axis = 1, inplace = True)

for i in ['mean_access', '1w_acess_change%', '2w_acess_change%', 'mean_eng', '1w_eng_change%', '2w_eng_change%']:
    cov_imp[i] = 0.0

states = cov_imp['state'].unique().tolist()

for i in states:
    cov_imp['mean_access'][states.index(i)] = round(st_acсess.query("time >= '2020-03-09' & time <= '2020-03-13' & state == @i")['pct_access'].mean(), 2)
    cov_imp['1w_acess_change%'][states.index(i)] = round((st_acсess.query("time >= '2020-03-16' & time <= '2020-03-20' & state == @i")['pct_access'].mean() / cov_imp['mean_access'][states.index(i)] - 1) * 100, 1)
    cov_imp['2w_acess_change%'][states.index(i)] = round((st_acсess.query("time >= '2020-03-23' & time <= '2020-03-27' & state == @i")['pct_access'].mean() / st_acсess.query("time >= '2020-03-16' & time <= '2020-03-20' & state == @i")['pct_access'].mean() - 1) * 100, 1)
    cov_imp['mean_eng'][states.index(i)] = round(st_eng.query("time >= '2020-03-09' & time <= '2020-03-13' & state == @i")['engagement_index'].mean(), 1)
    cov_imp['1w_eng_change%'][states.index(i)] = round((st_eng.query("time >= '2020-03-16' & time <= '2020-03-20' & state == @i")['engagement_index'].mean() / cov_imp['mean_eng'][states.index(i)] - 1) * 100, 1)
    cov_imp['2w_eng_change%'][states.index(i)] = round((st_eng.query("time >= '2020-03-23' & time <= '2020-03-27' & state == @i")['engagement_index'].mean() / st_eng.query("time >= '2020-03-16' & time <= '2020-03-20' & state == @i")['engagement_index'].mean() - 1) * 100, 1)

def color_values(val):
    """Return the CSS text colour for a table cell: red for negative values, green otherwise."""
    if val < 0:
        return 'color: red'
    return 'color: green'

# Column groups used when styling the impact tables:
#   slice_  - the %-change columns whose text is coloured by color_values,
#   slice_2 - the access columns, slice_3 - the engagement columns (one background shade each).
slice_ = ['1w_acess_change%', '2w_acess_change%', '1w_eng_change%', '2w_eng_change%']
slice_2 = ['mean_access', '1w_acess_change%', '2w_acess_change%']
slice_3 = ['mean_eng', '1w_eng_change%', '2w_eng_change%']

impact_style = cov_imp.style
# Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever this pandas provides.
color_cells = getattr(impact_style, 'map', None) or impact_style.applymap
(
    color_cells(color_values, subset = slice_)
    # Styler.set_precision no longer exists in pandas 2.x; an explicit one-decimal
    # formatter on the numeric columns renders the same and works on every version.
    .format('{:.1f}', subset = slice_2 + slice_3)
    .set_properties(**{'background-color': '#fafafa'}, subset = slice_2)
    .set_properties(**{'background-color': '#f7f7f7'}, subset = slice_3)
)

## Impact of covid19 on digital learning by Locale

**General Overview**
- pct_access and engagement index are the metrics that I am using to measure the student activity on digital learning.

- Based on the data, students from rural areas were the most engaged in digital learning, while students from cities were the least engaged.

- During the first week after the WHO declared Covid19 a pandemic, the indicators of student activity (pct_access and engagement index) decreased by 74.3% and 60%, respectively, for students in cities.

- The pct_access in the first week decreased by 32.4% in rural areas compared to the 74.3% in cities.

- Students in rural areas might be more engaged in digital learning because of the very long distances to educational institutions, hence the need to settle for digital learning.

- Educational institutions in cities were closed to limit the spread of the pandemic, so we would expect an increase in the indicators of student activity on digital learning, but that is not the case in this data. This issue cannot be explained using this dataset alone.

- The graphs below show the effect of the covid19 pandemic on the student activity metrics in each locale.

- To filter the plotly chart, click twice on the 'locale' that you want to see.

In [None]:
# pct_access over time, one line per locale.
fig = px.line(loc_acсess, x = "time", y = "pct_access", color = "locale", line_group = "locale")

# Settings common to both axes; each axis adds its own extras below.
axis_style = dict(showline = True, linecolor = '#f5f2f2', linewidth = 2,
                  tickfont_color = '#221f1f', tickfont_size = 12)
note_font = dict(size = 11)

fig.update_layout(
    plot_bgcolor = 'white',
    title = 'Dynamics of pct_access of all products by locale',
    title_font_color = '#221f1f',
    title_font_size = 20,
    title_x = 0.5,
)
fig.update_xaxes(tickfont_family = 'monospace', **axis_style)
fig.update_yaxes(showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2', **axis_style)

# Red vertical marker plus a labelled arrow at 2020-03-11.
fig.add_vline(x = '2020-03-11', line_width = 3, line_color = "red")
fig.add_annotation(x = '2020-03-11', y = 2, text = "WHO has declared Covid-19 a pandemic",
                   showarrow = True, font = note_font,
                   arrowhead = 2, arrowsize = 1, arrowwidth = 2, arrowcolor = "#636363",
                   ax = 130, ay = 1)

# Yellow band over June-August with a text label in the middle of it.
fig.add_vrect(x0 = "2020-06-01", x1 = "2020-08-31", fillcolor = "yellow", opacity = 0.25, line_width = 0)
fig.add_annotation(x = '2020-07-15', y = 1.75, text = "Summer holidays", showarrow = False, font = note_font)

fig.update_traces(line_width = 1)

fig.show()

In [None]:
# engagement_index over time, one line per locale.
fig = px.line(loc_eng, x = "time", y = "engagement_index", color = "locale", line_group = "locale")

# Settings common to both axes; each axis adds its own extras below.
axis_style = dict(showline = True, linecolor = '#f5f2f2', linewidth = 2,
                  tickfont_color = '#221f1f', tickfont_size = 12)
note_font = dict(size = 11)

fig.update_layout(
    plot_bgcolor = 'white',
    title = 'Dynamics of engagement index of all products by locale',
    title_font_color = '#221f1f',
    title_font_size = 20,
    title_x = 0.5,
)
fig.update_xaxes(tickfont_family = 'monospace', **axis_style)
fig.update_yaxes(showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2', **axis_style)

# Red vertical marker plus a labelled arrow at 2020-03-11.
fig.add_vline(x = '2020-03-11', line_width = 3, line_color = "red")
fig.add_annotation(x = '2020-03-11', y = 700, text = "WHO has declared Covid-19 a pandemic",
                   showarrow = True, font = note_font,
                   arrowhead = 2, arrowsize = 1, arrowwidth = 2, arrowcolor = "#636363",
                   ax = 130, ay = 1)

# Yellow band over June-August with a text label in the middle of it.
fig.add_vrect(x0 = "2020-06-01", x1 = "2020-08-31", fillcolor = "yellow", opacity = 0.25, line_width = 0)
fig.add_annotation(x = '2020-07-15', y = 610, text = "Summer holidays", showarrow = False, font = note_font)

fig.update_traces(line_width = 1)

fig.show()

In [None]:
# Per-locale summary of how pct_access and engagement_index moved around 2020-03-11:
# a baseline week, then the week-over-week % change for the following two weeks.
cov_imp2 = pd.DataFrame(loc_acсess['locale'].unique().tolist()).rename(columns = {0: 'locale'})

for col in ['mean_access', '1w_acess_change%', '2w_acess_change%', 'mean_eng', '1w_eng_change%', '2w_eng_change%']:
    cov_imp2[col] = 0.0

locales = cov_imp2['locale'].unique().tolist()

# Monday-Friday windows: the week containing 2020-03-11 and the two weeks after it.
week_0 = "time >= '2020-03-09' & time <= '2020-03-13'"
week_1 = "time >= '2020-03-16' & time <= '2020-03-20'"
week_2 = "time >= '2020-03-23' & time <= '2020-03-27'"

# `locales` is unique and comes from cov_imp2 itself, so the enumerate position is the row label.
for row, locale_name in enumerate(locales):
    locale_access = loc_acсess.query("locale == @locale_name")
    locale_eng = loc_eng.query("locale == @locale_name")

    # NOTE(review): the baseline means are rounded *before* being used as the denominator of
    # the first-week change (kept as-is so the reported figures do not move), whereas the
    # second-week change divides by the unrounded week-1 mean.
    access_w0 = round(locale_access.query(week_0)['pct_access'].mean(), 2)
    access_w1 = locale_access.query(week_1)['pct_access'].mean()
    access_w2 = locale_access.query(week_2)['pct_access'].mean()

    eng_w0 = round(locale_eng.query(week_0)['engagement_index'].mean(), 1)
    eng_w1 = locale_eng.query(week_1)['engagement_index'].mean()
    eng_w2 = locale_eng.query(week_2)['engagement_index'].mean()

    # Write with .loc: chained `cov_imp2[col][row] = value` assigns through an intermediate
    # Series and is not guaranteed to update cov_imp2 (it is a no-op under Copy-on-Write).
    cov_imp2.loc[row, 'mean_access'] = access_w0
    cov_imp2.loc[row, '1w_acess_change%'] = round((access_w1 / access_w0 - 1) * 100, 1)
    cov_imp2.loc[row, '2w_acess_change%'] = round((access_w2 / access_w1 - 1) * 100, 1)
    cov_imp2.loc[row, 'mean_eng'] = eng_w0
    cov_imp2.loc[row, '1w_eng_change%'] = round((eng_w1 / eng_w0 - 1) * 100, 1)
    cov_imp2.loc[row, '2w_eng_change%'] = round((eng_w2 / eng_w1 - 1) * 100, 1)

impact_style = cov_imp2.style
# Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever this pandas provides.
color_cells = getattr(impact_style, 'map', None) or impact_style.applymap
(
    color_cells(color_values, subset = slice_)
    # Styler.set_precision no longer exists in pandas 2.x; an explicit one-decimal
    # formatter on the numeric columns renders the same and works on every version.
    .format('{:.1f}', subset = slice_2 + slice_3)
    .set_properties(**{'background-color': '#fafafa'}, subset = slice_2)
    .set_properties(**{'background-color': '#f7f7f7'}, subset = slice_3)
)


## Impact of covid19 on digital learning by Product Category

**General Overview**
- There is a decline in L & C (Learning and Curriculum), which is aimed at teaching school children. The school children might have seen the pandemic as a time to relax, hence the decline.

- There is an increase in CM (Classroom Management). Adults might have seen digital learning as an opportunity to get additional education. CM had an increase of 8% in the first week and 12% in the second week.

- The graphs below show the effect of the covid19 pandemic on each product category.

- To filter the plotly chart, click twice on the product category that you want to see.

In [None]:
# pct_access over time, one line per product category (rows with category 'x' are left out).
fig = px.line(cat_acсess.query("Basic_category != 'x'"), x = "time", y = "pct_access",
              color = "Basic_category", line_group = "Basic_category")

# Settings common to both axes; each axis adds its own extras below.
axis_style = dict(showline = True, linecolor = '#f5f2f2', linewidth = 2,
                  tickfont_color = '#221f1f', tickfont_size = 12)

fig.update_layout(
    plot_bgcolor = 'white',
    title = 'Dynamics of pct_access of all products by product category',
    title_font_color = '#221f1f',
    title_font_size = 20,
    title_x = 0.5,
)
fig.update_xaxes(tickfont_family = 'monospace', **axis_style)
fig.update_yaxes(showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2', **axis_style)

# Red vertical marker plus a labelled arrow at 2020-03-11.
fig.add_vline(x = '2020-03-11', line_width = 3, line_color = "red")
fig.add_annotation(x = '2020-03-11', y = 4.5, text = "WHO has declared Covid-19 a pandemic",
                   showarrow = True, font = dict(size = 11),
                   arrowhead = 2, arrowsize = 1, arrowwidth = 2, arrowcolor = "#636363",
                   ax = 130, ay = 1)

# Yellow band over June-August with a text label in the middle of it.
fig.add_vrect(x0 = "2020-06-01", x1 = "2020-08-31", fillcolor = "yellow", opacity = 0.25, line_width = 0)
fig.add_annotation(x = '2020-07-15', y = 3.8, text = "Summer holidays", showarrow = False,
                   font = dict(family = "monospace", size = 11, color = "black"))

fig.update_traces(line_width = 1)

fig.show()

In [None]:
# engagement_index over time, one line per product category (rows with category 'x' are left out).
fig = px.line(cat_eng.query("Basic_category != 'x'"), x = "time", y = "engagement_index",
              color = "Basic_category", line_group = "Basic_category")

# Settings common to both axes; each axis adds its own extras below.
axis_style = dict(showline = True, linecolor = '#f5f2f2', linewidth = 2,
                  tickfont_color = '#221f1f', tickfont_size = 12)
note_font = dict(size = 11)

fig.update_layout(
    plot_bgcolor = 'white',
    title = 'Dynamics of engagement index of all products by product category',
    title_font_color = '#221f1f',
    title_font_size = 20,
    title_x = 0.5,
)
fig.update_xaxes(tickfont_family = 'monospace', **axis_style)
fig.update_yaxes(showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2', **axis_style)

# Red vertical marker plus a labelled arrow at 2020-03-11.
fig.add_vline(x = '2020-03-11', line_width = 3, line_color = "red")
fig.add_annotation(x = '2020-03-11', y = 1900, text = "WHO has declared Covid-19 a pandemic",
                   showarrow = True, font = note_font,
                   arrowhead = 2, arrowsize = 1, arrowwidth = 2, arrowcolor = "#636363",
                   ax = 130, ay = 1)

# Yellow band over June-August with a text label in the middle of it.
fig.add_vrect(x0 = "2020-06-01", x1 = "2020-08-31", fillcolor = "yellow", opacity = 0.25, line_width = 0)
fig.add_annotation(x = '2020-07-15', y = 1600, text = "Summer holidays", showarrow = False, font = note_font)

fig.update_traces(line_width = 1)

fig.show()

In [None]:
# Per-product-category summary of how pct_access and engagement_index moved around
# 2020-03-11: a baseline week, then the week-over-week % change for the following two weeks.
# Rows whose category is 'x' are left out.
cov_imp3 = pd.DataFrame(cat_eng.query("Basic_category != 'x'")['Basic_category'].unique().tolist()).rename(columns = {0: 'Basic_category'})

for col in ['mean_access', '1w_acess_change%', '2w_acess_change%', 'mean_eng', '1w_eng_change%', '2w_eng_change%']:
    cov_imp3[col] = 0.0

categories = cov_imp3['Basic_category'].unique().tolist()

# Monday-Friday windows: the week containing 2020-03-11 and the two weeks after it.
week_0 = "time >= '2020-03-09' & time <= '2020-03-13'"
week_1 = "time >= '2020-03-16' & time <= '2020-03-20'"
week_2 = "time >= '2020-03-23' & time <= '2020-03-27'"

# `categories` is unique and comes from cov_imp3 itself, so the enumerate position is the row label.
for row, category_name in enumerate(categories):
    category_access = cat_acсess.query("Basic_category == @category_name")
    category_eng = cat_eng.query("Basic_category == @category_name")

    # NOTE(review): the baseline means are rounded *before* being used as the denominator of
    # the first-week change (kept as-is so the reported figures do not move), whereas the
    # second-week change divides by the unrounded week-1 mean.
    access_w0 = round(category_access.query(week_0)['pct_access'].mean(), 2)
    access_w1 = category_access.query(week_1)['pct_access'].mean()
    access_w2 = category_access.query(week_2)['pct_access'].mean()

    eng_w0 = round(category_eng.query(week_0)['engagement_index'].mean(), 1)
    eng_w1 = category_eng.query(week_1)['engagement_index'].mean()
    eng_w2 = category_eng.query(week_2)['engagement_index'].mean()

    # Write with .loc: chained `cov_imp3[col][row] = value` assigns through an intermediate
    # Series and is not guaranteed to update cov_imp3 (it is a no-op under Copy-on-Write).
    cov_imp3.loc[row, 'mean_access'] = access_w0
    cov_imp3.loc[row, '1w_acess_change%'] = round((access_w1 / access_w0 - 1) * 100, 1)
    cov_imp3.loc[row, '2w_acess_change%'] = round((access_w2 / access_w1 - 1) * 100, 1)
    cov_imp3.loc[row, 'mean_eng'] = eng_w0
    cov_imp3.loc[row, '1w_eng_change%'] = round((eng_w1 / eng_w0 - 1) * 100, 1)
    cov_imp3.loc[row, '2w_eng_change%'] = round((eng_w2 / eng_w1 - 1) * 100, 1)

impact_style = cov_imp3.style
# Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever this pandas provides.
color_cells = getattr(impact_style, 'map', None) or impact_style.applymap
(
    color_cells(color_values, subset = slice_)
    # Styler.set_precision no longer exists in pandas 2.x; an explicit one-decimal
    # formatter on the numeric columns renders the same and works on every version.
    .format('{:.1f}', subset = slice_2 + slice_3)
    .set_properties(**{'background-color': '#fafafa'}, subset = slice_2)
    .set_properties(**{'background-color': '#f7f7f7'}, subset = slice_3)
)


## Predicting Student Activity for 2021

- Used Facebook Prophet to predict the student activity (pct_access and engagement index).

In [None]:

# Preview the first three rows of the access frame that the forecast input is built from.
st_acсess.head(3)

In [None]:
# Build the frame that is fitted with Prophet below: the time and pct_access columns,
# renamed to 'ds' and 'y'.
# NOTE(review): st_acсess is filtered by both 'state' and 'time' elsewhere in this notebook,
# so it presumably holds several rows per date; 'ds' would then contain repeated dates.
# Confirm that is intended, or aggregate per day before fitting.
pct_access = (
    st_acсess[['time', 'pct_access']]
    .rename(columns = {'time': 'ds', 'pct_access': 'y'})
    # Store the converted column: a bare pd.to_datetime(...) call only returns a new
    # Series and leaves the frame untouched.
    .assign(ds = lambda frame: pd.to_datetime(frame['ds']))
)

pct_access

In [None]:
# Check the dtypes of the 'ds' and 'y' columns before fitting.
pct_access.dtypes

In [None]:
# Fit a Prophet model, created with no arguments (library defaults), on the ds/y frame.
m = Prophet()
m.fit(pct_access)

In [None]:
# Build the frame of dates to predict on, extended by 365 periods, and show its last rows.
future = m.make_future_dataframe(periods=365)
future.tail()

In [None]:
# Predict over the extended frame; show the last rows of the forecast value (yhat)
# together with its lower and upper bounds.
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# Plotly figure of the fitted model's forecast.
plot_plotly(m, forecast)

In [None]:
# Plotly figure of the forecast's components.
plot_components_plotly(m, forecast)