In [None]:
import pandas as pd
import numpy as np
from datetime import date
import datetime as dt
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
pd.options.display.max_columns = None
import os
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import wandb
import plotly.graph_objects as go

In [None]:
# Dataset root folder; the districts/products CSV loads below are joined onto this path.
data_root = '../input/learnplatform-covid19-impact-on-digital-learning'

In [None]:
# Load districts_info.csv from the dataset root.
districts_info = pd.read_csv(os.path.join(data_root, 'districts_info.csv'))

In [None]:
# Use the camelCase key name that the engagement frame is later merged on.
districts_info = districts_info.rename(columns={'district_id': 'districtId'})

In [None]:
# Preview the first rows of the district table.
districts_info.head()

In [None]:
# Load products_info.csv from the dataset root.
products_info = pd.read_csv(os.path.join(data_root, 'products_info.csv'))

In [None]:
# Use the camelCase key name that the engagement frame is later merged on.
products_info = products_info.rename(columns={'LP ID': 'lpId'})

In [None]:
# Preview the first rows of the product table.
products_info.head()

In [None]:
def majorCsv(path='../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'):
    """Load every per-district engagement CSV into a single DataFrame.

    Files are expected to be named ``<number>.csv``. The number is stored in a
    new ``districtId`` column of the rows read from that file. Files whose
    stem is not purely numeric, or whose extension is not ``.csv``, are ignored.

    Parameters
    ----------
    path : str, optional
        Directory containing the engagement CSV files. Defaults to the
        location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked in ascending ``districtId`` order. Each file
        keeps its own row labels, so the index is NOT unique. An empty
        DataFrame is returned when the directory holds no matching file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    pandas.errors.ParserError, pandas.errors.EmptyDataError
        If a matching file cannot be parsed; unreadable files are no longer
        skipped silently.
    """
    # Discover the files that exist instead of probing ~109k candidate names
    # and swallowing every failure.
    numbered_files = sorted(
        (int(stem), file_name)
        for file_name in os.listdir(path)
        for stem, extension in [os.path.splitext(file_name)]
        if extension == '.csv' and stem.isdecimal()
    )

    frames = []
    for district_id, file_name in numbered_files:
        frame = pd.read_csv(os.path.join(path, file_name))
        frame['districtId'] = district_id
        frames.append(frame)

    if not frames:
        return pd.DataFrame()
    # Concatenate once: growing a DataFrame inside the loop is quadratic, and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.concat(frames)

In [None]:
# Build the combined engagement frame; each row carries the districtId of its source file.
df = majorCsv()

In [None]:
# Preview the combined engagement rows.
df.head()

In [None]:
# Align the product key with products_info before merging.
df = df.rename(columns={'lp_id': 'lpId'})

In [None]:
# Attach product attributes; a left join keeps engagement rows with no matching product.
df = df.merge(products_info, how='left', on=['lpId'])

In [None]:
# Attach district attributes; a left join keeps engagement rows with no matching district.
df = df.merge(districts_info, how='left', on=['districtId'])

In [None]:
# Month number of each observation; used as the time axis by the later cells.
df['month'] = pd.to_datetime(df['time']).dt.month

# Rename every column with an awkward name (slashes, spaces, snake_case) to
# camelCase in a single pass.
df = df.rename(columns={
    "pct_black/hispanic": "pctBlackHispanic",
    "pct_free/reduced": "pctFreeReduced",
    "Provider/Company Name": "providerCompany",
    "Sector(s)": "sector",
    "Product Name": "productName",
    "pct_access": "percentAccess",
    "county_connections_ratio": "countyConnRatio",
    "engagement_index": "engagementIndex",
    "Primary Essential Function": "primaryEssentialFunction",
    "pp_total_raw": "pupilExpenditure",
})

# '[' is a regex metacharacter, so request a literal replacement explicitly
# instead of relying on the pandas-version-dependent default of `regex`.
df['countyConnRatio'] = df['countyConnRatio'].str.replace('[', '', regex=False)

Keep a random 50 percent of the data to make the analysis more manageable

In [None]:
# Draw half of the rows at random and keep the other half as the working frame.
# The seed is fixed so the split, and every figure derived from it, is
# reproducible across Restart & Run All.
part_50 = df.sample(frac=0.5, random_state=42)
df1 = df.drop(part_50.index)

In [None]:
# Preview the working half of the data.
df1.head()

#### For the columns <u>Pupil Expenditure</u> & <u>Percent Free reduced</u> we are removing square brackets and taking average of range

In [None]:
# Turn the "[low, high[" range strings into two float columns (0 = low, 1 = high).
tempdf = (
    df1['pupilExpenditure']
    .str.replace('[', '', regex=False)  # literal bracket, not a regex
    .str.split(', ', expand=True)
    .astype({0: float, 1: float})
)
# Midpoint of the range. The sum must be parenthesised: `low + high / 2`
# would compute low + (high / 2), not the average.
tempdf['pupilExpenditure'] = (tempdf[0] + tempdf[1]) / 2
df1['pupilExpenditure'] = tempdf['pupilExpenditure']

In [None]:
# Strip the bracket characters, then split the "low, high" text into two float columns.
df1['pctFreeReduced'] = df1['pctFreeReduced'].str.replace('[', '')
tempdf = df1['pctFreeReduced'].str.split(', ', expand=True)
tempdf = tempdf.astype({0: float, 1: float})
# Store the midpoint of the range back on the working frame.
tempdf['pctFreeReduced'] = tempdf[[0, 1]].sum(axis=1, skipna=False) / 2
df1['pctFreeReduced'] = tempdf['pctFreeReduced']

### Nan Values in Data

In [None]:
# Missing-value bar chart over the full merged frame `df` (not the sampled `df1`).
msno.bar(df,color='#FD4848', sort="ascending", figsize=(10,5), fontsize=12)
# plt.title targets pyplot's current axes, i.e. whichever axes msno.bar left active.
plt.title("Nan Value Ratio",font="Serif", size=20)
plt.show()

A lot of <b>missing data</b> is observed in columns like Sector, Primary Essential Function, Product Name, State and Locale, which might slightly affect the answers. We'll try to reach the best conclusions and propose the best-fit hypothesis for the given situation in each question.

## Question 1.
#### What is the picture of digital connectivity and engagement in 2020?
we'll create temporary dataframe with question specific columns

In [None]:
# Mean engagement index per month, as a flat two-column frame for seaborn.
Engdf = df1.groupby('month', as_index=False)['engagementIndex'].mean()

plt.figure(figsize=(12, 4))
sns.lineplot(data=Engdf, x="month", y="engagementIndex")
plt.grid()
title_font = {'family': 'Tahoma','weight': 'normal','size': 20,}
plt.title("Figure 1. - Picture of Average Engagement Index over the year", font=title_font)

In [None]:
# Mean percent access per month, as a flat two-column frame for seaborn.
Perdf = df1.groupby('month', as_index=False)['percentAccess'].mean()

plt.figure(figsize=(12, 4))
sns.lineplot(data=Perdf, x="month", y="percentAccess")
plt.grid()
plt.title(
    "Figure 2. - Picture of Average Percent Access over the year",
    font='Tahoma',
    size=20,
)
plt.show()

### Answer 1:
To answer the question, Figures 1 & 2 give us a good overview of digital connectivity and engagement in 2020.

#### Our hypothesis:
From Fig. 1, we can interpret the engagement index of 2020 as follows: 
- Engagement was low at the start of January, which can be directly linked to the post-Christmas 2019 vacation time.
- Then a constant rise is observed from January to April, as engagement recovers after the vacation period.
- We observe a dip in engagement which starts in April and reaches its lowest point in July, which is exactly when Covid-19 was on the rise: lockdowns were imposed, offices were shut, and the overall engagement of students and offices was in decline.
- Then comes the July to September period: people settled in after the chaos Covid initially created, offices started working from home, schools started distance-learning programs, and people adapted quickly to the new norms, so a sharp increase in engagement is observed in this period.
- Afterwards, for the rest of the year, engagement was stable at the upper end, as people had fully switched to online learning and work-from-home programs.

For the digital  connectivity part:
- Percent access in Fig. 2 averaged around 0.6 initially; this reflects access by people who were already dependent on online learning platforms for their education.
- The dip in percent access from March to July is likely due to school shutdowns and the rise of Covid.
- Then percent access rises again, as the new norms of distance learning move people to online platforms for their education.

## Question2.
#### What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?

Get a dataframe with only the question-specific columns, then keep only the first layer of the essential function from the primary essential function column (i.e. LC, SDO, CM etc.), to better categorise the data and perform the analysis needed for our answer.

In [None]:
# Work on an explicit copy: assigning a column into a slice of df1 is chained
# assignment, which pandas warns about and which is ambiguous about what gets written.
mydf = df1[['month','sector','engagementIndex','productName','percentAccess','primaryEssentialFunction']].copy()
# Keep only the text before the first '-'. The value is not stripped, so any
# whitespace that preceded the '-' stays on the code; later cells compare
# against 'LC ', 'CM ', ... with that trailing space.
mydf['primaryEssentialFunction'] = mydf['primaryEssentialFunction'].str.split('-').str[0]

In [None]:
# Preview the question-specific frame.
mydf.head()

Most popular <b> Product Function </b> over the year

In [None]:
# Row count for each top-level essential-function code (compared with its trailing space),
# followed by the number of rows where the function is missing.
function_col = mydf['primaryEssentialFunction']
for caption, code in (
    ("Count of Learning & Curriculum (LC) : ", 'LC '),
    ("Count of School & District Operations. (SDO) : ", 'SDO '),
    ("Count Of LC/CM/SDO  : ", 'LC/CM/SDO '),
    ("Count Of Classroom Management (CM) : ", 'CM '),
):
    print(caption + str(len(mydf[function_col == code])))
print("Count Of nan : " + str(len(mydf[function_col.isnull()])))

In [None]:
# Select the metric BEFORE aggregating: a frame-wide .mean() would also try to
# average the string columns (sector, productName), which raises TypeError on
# pandas >= 2.0 and wastes work on older versions.
mydf1 = mydf.groupby(['month','primaryEssentialFunction'])['percentAccess'].mean()
# One line per essential function, months on the x-axis.
mydf1.unstack().plot(figsize=(12, 8) ,title="Figure 3. - Avg. Percent Access in different different primary essention functions of product over the year")
plt.ylabel('Percent Access')
plt.grid()
plt.show()

In [None]:
# Select the metric BEFORE aggregating: a frame-wide .mean() would also try to
# average the string columns (sector, productName), which raises TypeError on
# pandas >= 2.0 and wastes work on older versions.
mydf2 = mydf.groupby(['month','primaryEssentialFunction'])['engagementIndex'].mean()
# One line per essential function, months on the x-axis.
mydf2.unstack().plot(figsize=(12, 8) ,title="Figure 4. - Avg. Engagement index in different primary essention functions of product over the year")
plt.ylabel('Engagement Index')
plt.grid()
plt.show()

In [None]:
# Select the metric BEFORE aggregating: a frame-wide .mean() would also try to
# average the remaining string columns, which raises TypeError on pandas >= 2.0.
mydf3 = mydf.groupby(['month','sector'])['engagementIndex'].mean()
# One line per sector, months on the x-axis.
mydf3.unstack().plot(figsize=(12, 8),title="Figure 5. - Avg. Engagement index by different Sectors over the Year")
plt.ylabel('engagement Index')
plt.grid()
plt.show()

In [None]:
# Select the metric BEFORE aggregating: a frame-wide .mean() would also try to
# average the remaining string columns, which raises TypeError on pandas >= 2.0.
mydf4 = mydf.groupby(['month','sector'])['percentAccess'].mean()
# One line per sector, months on the x-axis.
mydf4.unstack().plot(figsize=(12, 8) ,title="Figure 6. - Avg. Percent Access in different Sectors over the Year")
plt.ylabel('Percent Access')
plt.grid()
plt.show()

In [None]:
# Rows whose top-level essential function is the "CM " code (trailing space included).
is_classroom_mgmt = mydf['primaryEssentialFunction'].eq("CM ")
cm = mydf[is_classroom_mgmt]

In [None]:
# Distinct product names among the "CM " rows.
cm['productName'].unique()

### Answer 2. 
 - Technologies used for distance learning have higher adoption and consistent engagement. After a sharp dip in July, schools and corporates started moving towards online technologies, of which <b>Classroom Management (CM)</b> tools in Figs. 3 & 4 show the highest level, ranging from 300 in April to almost 500 in September, and staying there until December.
 - Technologies common to PreK-12, higher education and corporate show the highest level of engagement from August onwards; similarly, classroom management technologies like Google Meet, Zoom and others seen in the list above have shown higher adoption since the August session.
 - As for the <b>future</b>, after reading Figure 6 for the Covid period we can say that technologies specific to the corporate sector didn't bounce back to pre-Covid levels after the work-from-home restrictions, while technologies common to PreK-12, higher education and corporate will see an increase if they follow the same trend shown in November and December.

## Question3.

#### How does student engagement with different types of education technology change over the course of the pandemic?

Here we'll first split the data into 2 parts: the pre-covid period <b>(January - March)</b> and the lockdown period <b>(August - December)</b>, to compare and understand the change in engagement before and during the covid period

In [None]:
# Pre-covid window: months 1-3 (January - March), per the `month < 4` filter.
preLock = df1.loc[df1['month'] < 4, ['percentAccess','productName','engagementIndex']]
# Per-product means, ranked by percent access.
preLock1 = (
    preLock.groupby('productName').mean()
    .sort_values('percentAccess', ascending=False)
    .reset_index()
)
plt.figure(figsize=(12, 4))
plt.xticks(rotation=70)
sns.barplot(x="productName", y="percentAccess", data=preLock1[:15],palette="Blues_d")
# The title now states the months the filter actually selects (it previously said "January - April").
plt.title("Figure 7. - Top 15 Products Percent Access in Before covid period January - March " ,font = {'family': 'Tahoma','weight': 'normal','size': 20,})
plt.show()

In [None]:
# One essential-function label per product (its first occurrence in mydf).
# De-duplicating BEFORE the join keeps the merge at one row per product instead
# of materialising one row per raw observation and discarding them afterwards.
product_function = mydf[['productName','primaryEssentialFunction']].drop_duplicates(subset=['productName'], keep='first')
# Per-product means ranked by engagement, with the function label attached.
preLock2 = (
    preLock.groupby('productName', as_index=False).mean()
    .sort_values('engagementIndex', ascending=False)
    .merge(product_function, on='productName', how='left')
)
plt.figure(figsize=(12, 4))
plt.xticks(rotation=70)
sns.barplot(x="productName", y="engagementIndex", data=preLock2[:15],palette="Blues_d")
# preLock is built with `month < 4`, i.e. January - March (the title previously said "January - April").
plt.title("Figure 7. - Top 15 Products Engagement in Before covid period January - March " ,font = {'family': 'Tahoma','weight': 'normal','size': 20,})
plt.show()

In [None]:
# Show the first 60 rows of the pre-covid product ranking.
display(preLock2[:60])

### After Lockdown

In [None]:
# Lockdown window: rows with month >= 8, restricted to the product-level metrics.
Lockdown = df1.loc[df1['month'] >= 8, ['percentAccess','productName','engagementIndex']]
# Per-product means, ranked by percent access.
Lockdown1 = (
    Lockdown.groupby('productName').mean()
    .sort_values('percentAccess', ascending=False)
    .reset_index()
)
plt.figure(figsize=(12, 4))
plt.xticks(rotation=70)
top_by_access = Lockdown1[:15]
sns.barplot(data=top_by_access, x="productName", y="percentAccess", palette="Blues_d")
plt.title("Figure 8. - Top 15 Products Percent Access in lockdown period August - December " ,font = {'family': 'Tahoma','weight': 'normal','size': 20,})

In [None]:
# One essential-function label per product (its first occurrence in mydf).
# De-duplicating BEFORE the join keeps the merge at one row per product instead
# of materialising one row per raw observation and discarding them afterwards.
product_function = mydf[['productName','primaryEssentialFunction']].drop_duplicates(subset=['productName'], keep='first')
# Per-product means ranked by engagement, with the function label attached.
Lockdown2 = (
    Lockdown.groupby('productName', as_index=False).mean()
    .sort_values('engagementIndex', ascending=False)
    .merge(product_function, on='productName', how='left')
)
plt.figure(figsize=(12, 4))
plt.xticks(rotation=70)
sns.barplot(x="productName", y="engagementIndex", data=Lockdown2[:15],palette="Blues_d")
plt.title("Figure 7. - Top 15 Products Engagement in Lockdown period August - December" ,font = {'family': 'Tahoma','weight': 'normal','size': 20,})
plt.show()

In [None]:
# Show the first 60 rows of the lockdown-period product ranking.
display(Lockdown2[:60])

In [None]:
def overtime(df1,tech):
    """Draw a line chart of one product's mean percentAccess per month.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with 'productName', 'month' and 'percentAccess' columns.
    tech : str
        Exact product name to filter on; it is also used in the chart title.

    Returns
    -------
    None
        The chart is drawn on a new pyplot figure.
    """
    monthly_access = (
        df1.loc[df1['productName'] == tech, ['month', 'percentAccess']]
        .groupby('month', as_index=False)
        .mean()
    )
    plt.figure(figsize=(12, 4))
    sns.lineplot(data=monthly_access, x="month", y="percentAccess", dashes=False)
    plt.title(f"{tech} access timeline", font="Serif", size=20)
    plt.grid()

### Answer3. 
- The new post-covid session (August onwards) shows an increase in engagement with <b>Learning & Curriculum technologies</b> like Google Docs, YouTube, Canvas etc. YouTube saw a massive jump in the same period, as it provides learning content; together with Google Classroom and Google Suite, people are able to switch to a fully online learning experience. To properly answer our question, let's visualise the engagement over time of education technologies that came in the top 15 in the covid period, i.e. YouTube and Zoom.

In [None]:
# One access timeline per product, drawn in this order.
for product in ("YouTube", "Zoom"):
    overtime(df1, product)

- So the above figures evidently show that technologies like Zoom, which offers online class and meeting facilities, and YouTube, which offers online courses for learning, saw an upward trend throughout the year, displaying the movement of people to online learning technologies.

## Question4.
#### How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?

We'll be studying the impact on engagement of the <b>Black/Hispanic</b> population, looking at different aspects according to its population density in an area

In [None]:
# Columns needed for the demographic questions.
ans4 = df1[['month','pctBlackHispanic','engagementIndex','percentAccess','locale','pctFreeReduced','pupilExpenditure']]
# Select the metric BEFORE aggregating: ans4 contains the string column
# 'locale', so a frame-wide .mean() raises TypeError on pandas >= 2.0.
ans41 = ans4.groupby(['month','pctBlackHispanic'])['engagementIndex'].mean()
# Months as rows, one column (= one line) per pctBlackHispanic bracket.
ans41= ans41.unstack()
plt.figure(figsize=(18, 6))
sns.lineplot(data=ans41,palette="rocket_r",dashes=False)
plt.title("Figure 8. - Monthly percent BlackHispanic Function engagement Index" ,font = {'family': 'Tahoma','weight': 'normal','size': 20})
# The y-axis shows the mean engagement index; pctBlackHispanic is the line grouping, not the y value.
plt.ylabel('Engagement Index')
plt.grid()
plt.show()

In [None]:
# Mean of every remaining column per (locale, pctBlackHispanic) pair, flattened to columns.
ans42 = ans4.groupby(['locale','pctBlackHispanic']).mean().reset_index()
# Rows = pctBlackHispanic bracket, columns = locale, cells = mean percentAccess.
access_by_locale = ans42.pivot(index='pctBlackHispanic', columns='locale', values='percentAccess')
access_by_locale.plot(
    kind='bar',
    figsize=(18,6),
    color=["#ACDDDE","#CAF1DE","#E1F8DC","#F7D8BA"],
    title="Figure 9. - Distribution of Black population concentration According to different locale",
)

In [None]:
# Per-bracket means for rows with month < 4.
# numeric_only=True skips the string column 'locale', which a plain .mean()
# rejects with TypeError on pandas >= 2.0.
befMarch = (
    ans4[ans4['month'] < 4]
    .groupby('pctBlackHispanic')
    .mean(numeric_only=True)
    .drop(columns='month')  # the averaged month number is meaningless here
)

In [None]:
# Per-bracket means for rows with month > 7.
# numeric_only=True skips the string column 'locale', which a plain .mean()
# rejects with TypeError on pandas >= 2.0.
afterAug = (
    ans4[ans4['month'] > 7]
    .groupby('pctBlackHispanic')
    .mean(numeric_only=True)
    .drop(columns='month')  # the averaged month number is meaningless here
)

In [None]:
# Side-by-side before/after means per pctBlackHispanic bracket.
ans43 = pd.merge(befMarch,afterAug, on=['pctBlackHispanic'], how ='left')
# Positional rename: relies on both inputs holding engagement, percent access,
# free/reduced and pupil expenditure in that order.
ans43.columns= ['befEngagement','befPercentAccess','befFreeReduced','befPupilExpenditure','aftEngagement','aftPercentAccess','aftFreeReduced','aftPupilExpenditure']

# Relative change (after - before) / before, computed in one loop so each
# metric is always compared with its OWN baseline. The hand-written version
# subtracted befFreeReduced when computing the pupil-expenditure change.
for change_label, metric in (
    ('Engagement Index Change', 'Engagement'),
    ('Percent Access Change', 'PercentAccess'),
    ('Free Reduced Change', 'FreeReduced'),
    ('Pupil Expenditure Change', 'PupilExpenditure'),
):
    baseline = ans43['bef' + metric]
    ans43[change_label] = (ans43['aft' + metric] - baseline) / baseline

ans43 = ans43[['Engagement Index Change', 'Percent Access Change','Free Reduced Change','Pupil Expenditure Change']]
ans43.head()

In [None]:
# Save the relative-change table to ans4.csv.
# NOTE(review): index=False omits the frame's index from the file; if the index holds the
# pctBlackHispanic bracket (it appears to, from the merge key), the CSV rows carry no group label -- confirm.
ans43.to_csv("ans4.csv", index=False)

### Answer4.
We can conclude from the above table and Figures 8 and 9 that, for ethnic Black and Hispanic people, geographically living in a predominantly Black locale is beneficial, as they have more access and engagement, and they also enjoy more expenditure on online learning and free lunch:

#### Hypothesis
  - Figure 8 shows that locales where the Black/Hispanic ratio is 0.8-1.0 have the highest level of engagement amongst all; Figure 9 shows us that populations where the Black/Hispanic ratio is 0.8-1.0 are mostly concentrated in cities, thus they benefit from better opportunities and infrastructure than non-city-based communities.
  - The above dataframe shows the change of different aspects in relation to Black/Hispanic population concentration by comparing data from before and after lockdowns:
  <b>[0,0.2]</b> This population density shows much less engagement after lockdowns; the percent access of this population even declined, and free/reduced-price lunch also shows a decline, <b>indicating</b> a decline of state expenditure on populations with a Black/Hispanic concentration of 0-0.2.
  <b>[0.8,1]</b> This predominantly city-living population enjoys high levels of engagement index change, percent access, free/reduced lunch and pupil expenditure after lockdowns, supporting our hypothesis that <b>areas with the highest concentration of Black/Hispanic people seem to get greater govt. expenditure and schooling support, thus they're able to better overcome pandemic-related challenges in learning.</b>

## Question 5.
#### Do certain state interventions, practices or policies (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?

In [None]:
# State-level comparison windows: rows with month < 3 versus rows with month > 7.
preLock = df1.loc[df1['month'].lt(3)]
afterLock = df1.loc[df1['month'].gt(7)]

In [None]:
def myPrelock(pre,post,col2,label):
    """Plot the per-state mean of one column before vs. after, as grouped bars.

    Parameters
    ----------
    pre : pandas.DataFrame
        Rows of the "before" period; needs a 'state' column and ``col2``.
    post : pandas.DataFrame
        Rows of the "after" period; needs a 'state' column and ``col2``.
    col2 : str
        Name of the numeric column to average per state.
    label : str
        Prefix of the chart title.

    Returns
    -------
    None
        The chart is drawn on a new figure via pandas plotting.
    """
    # Select the column BEFORE aggregating: a frame-wide .mean() would try to
    # average every string column too, which raises TypeError on pandas >= 2.0.
    before = pre.groupby('state')[col2].mean().rename('before')
    after = post.groupby('state')[col2].mean().rename('after')

    # Align the two periods on state. A state that is absent from one period,
    # or has no data for the column there, is plotted as 0.
    by_state = (
        pd.concat([before, after], axis=1)
        .fillna(0)
        .rename_axis('state')
        .sort_index()
        .reset_index()
    )
    by_state.plot(x="state", y=["before", "after"], kind="bar",figsize = (16,5), title=f"{label} Before and After Lockdowns",color=['#FFE7C7','#F491A9'])
    plt.grid()
    plt.xticks(rotation=80)

In [None]:
# One before/after chart per state-level metric, in figure order.
for column, figure_label in (
    ('percentAccess', "Figure 11.- State Wise Percent Access"),
    ('engagementIndex', "Figure 12.- State Wise Engagement Index"),
    ('pupilExpenditure', "Figure 13.- State Wise Pupil Expenditure"),
    ('pctFreeReduced', "Figure 14.- State Wise Percent Free Reduced Lunch"),
):
    myPrelock(preLock, afterLock, column, figure_label)

## Answer 5.

To answer the question, we can draw some findings from the above charts and analyse whether state interventions make a difference in percent access and engagement.<br><br>
<b>Findings:</b>
- In Figures 13 & 14, states like New York show the highest increase in pupil expenditure and free/reduced lunch during lockdown times, and thus saw an increase in percent access and engagement of students.
- While states like Wisconsin decreased their free/reduced lunch, thus seeing a direct impact on percent access, which decreased.
- Although there are a few states like Arizona, Connecticut and the District of Columbia which show great growth in percent access and engagement, due to missing data for one of pupil expenditure and free/reduced lunch we can't properly form a hypothesis about the situation.
