In [None]:

import numpy as np
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Introduction to our Data

## The Covid-19 pandemic has affected millions of people worldwide, and it hit education systems especially hard: during the first wave, many countries decided to shut down schools and switch from face-to-face learning to e-learning.
> In this notebook we analyze how students in the US engaged with the various e-learning products offered by companies such as Google.
The data covers more than 300 districts in different states, the products and the companies (providers) behind them, an engagement index, ethnic groups, and more.
We began with data wrangling: handling null and duplicate values and setting the appropriate data type for each column.
After that we did EDA (Exploratory Data Analysis) to look for insights and get the bigger picture before diving into details; then we focused on the products and how they relate to student engagement, districts, etc., and did the same for the districts.
Finally, we built a geo map to see which areas engaged most with e-learning.

# Packages to import

In [None]:
import seaborn as sb
import os
import matplotlib.dates as mdates
import seaborn as snb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import glob
from wordcloud import WordCloud
from distutils.version import LooseVersion
import folium
from geopy.geocoders import Nominatim
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster, FastMarkerCluster
%matplotlib inline

# Wrangling data

## combining the data

In [None]:
path =  "/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data"
last_csv = "new_combined_csv.csv"

def get_csv_file(path, last_csv):
    """Load every CSV file found directly inside `path` into one DataFrame.

    The part of each file name before the first "." is stored, as a string,
    in a new ``district_id`` column so every row stays traceable to the file
    it came from after concatenation.

    Parameters
    ----------
    path : str
        Directory that contains the ``*.csv`` files.
    last_csv : str
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Rows of all files, in sorted file-name order, with a fresh
        0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If `path` is not an existing directory.
    ValueError
        If the directory contains no ``*.csv`` file.
    """
    if not os.path.isdir(path):
        raise FileNotFoundError(f"No such directory: {path!r}")

    # Glob with the full pattern instead of os.chdir(path): changing the
    # working directory is a process-wide side effect that would redirect
    # every later relative read/write in the notebook.
    # Sorting makes the row order reproducible across runs and file systems.
    all_filenames = sorted(glob.glob(os.path.join(path, '*.csv')))
    if not all_filenames:
        raise ValueError(f"No CSV files found in {path!r}")

    frames = []
    for f in all_filenames:
        df = pd.read_csv(f)
        # e.g. ".../1000.csv" -> "1000"
        df["district_id"] = os.path.basename(f).split(".")[0]
        frames.append(df)

    # Combine all files and renumber the rows in one step.
    return pd.concat(frames, ignore_index=True)

df_tmp = get_csv_file(path, last_csv)

In [None]:
df_tmp

In [None]:
# checking for duplicates
df_tmp.duplicated().sum()

> Zero duplicates

In [None]:
# checking for null values
df_tmp.isnull().sum()

> We will drop the records with a null (lp_id), but the other two features look interesting, so let's check them first.

In [None]:
df_tmp.describe()

> Many of the rows with a NaN (engagement_index) have a zero (pct_access), and the two depend on each other, so I'll fill the NaN values in both columns with zero, which means that no student out of the 1000 students used this product on that exact day.

In [None]:
# Treat a missing engagement/access value as "not used": replace NaN with 0
# in both usage columns, then re-check how many nulls remain per column.
usage_cols = ['engagement_index', 'pct_access']
df_tmp[usage_cols] = df_tmp[usage_cols].fillna(0)
df_tmp.isnull().sum()

> Now i'll drop the lp_id null values

In [None]:
df_tmp.dropna(inplace=True)
df_tmp.isnull().sum()

> Now all done and cleaned

# Exploring some features

In [None]:
df_tmp.info()

> We will convert (lp_id) to object so it can be used as a categorical feature.

In [None]:
df_tmp['lp_id'] = df_tmp['lp_id'].astype(int).astype(str)

In [None]:
df_tmp.info()

> now let's check that ids

In [None]:
df_tmp['lp_id'].value_counts()

> There are 8646 product ids.

> Let's check how often these products are used according to time

In [None]:
df_tmp['time'].value_counts()

> We have 8646 lp_ids that were used many times across 366 dates. What about counting how often each id was used on each date?

In [None]:
# Number of rows per (product, date) pair, most frequent pairs first.
lp_id_count_time = (
    df_tmp
    .groupby(['lp_id', 'time'])
    .size()
    .reset_index(name='lp_id_count')
    .sort_values(["lp_id_count"], ascending=False)
)
lp_id_count_time

> Interesting gotta check that too

> We will make a copy of the dataframe.

In [None]:
df2 = df_tmp.copy()

### Now will extract month,day and year from time

In [None]:
def get_day_month_year(df):
    """Add ``year``, ``day`` and ``month`` columns derived from ``df['time']``.

    The frame is modified in place (the notebook calls this function without
    using the return value) and is also returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object, with ``time`` converted to datetime and three new
        columns: ``year`` (integer), ``day`` (day of month) and ``month``
        (English month name, e.g. "January").
    """
    df['time'] = pd.to_datetime(df['time'])
    when = df['time'].dt
    # Read the year directly from the datetime accessor instead of formatting
    # every row to a string and parsing it back; the cast keeps the integer
    # dtype the previous implementation produced.
    df['year'] = when.year.astype('int')
    df['day'] = when.day
    # month_name() returns English names by default, independent of the
    # process locale, so downstream text/plots always match.
    df['month'] = when.month_name()
    return df
get_day_month_year(df2)

# Exploring Data

In [None]:
df2['month'].value_counts()

> It seems that the products were used at a high rate throughout the year because schools were closed, which made students afraid of not getting enough education. It is also obvious that October, September, November, December and April have the most traffic.

> now let's check each month for each id

In [None]:
# Number of rows per (product, month) pair, most frequent pairs first.
lp_id_count_mnth = (
    df2
    .groupby(['lp_id', 'month'])
    .size()
    .reset_index(name='count')
    .sort_values(["count"], ascending=False)
)
lp_id_count_mnth

In [None]:
# df2['district_id'] = df2['district_id'].apply(lambda x: str(x))

> Making a list of the months in dataframe

In [None]:
# Distinct month names in calendar order (January ... December).
# A plain set() has arbitrary iteration order, which made the per-month
# plots that loop over this list appear shuffled.
month_lst = sorted(
    df2['month'].unique(),
    key=lambda name: pd.to_datetime(name, format='%B').month,
)

> Looping over the list to check it gives the right output

In [None]:
for i in month_lst:
    print(i)

> Highest engagement index per month

In [None]:
# For every month: density of the 20 highest engagement_index records,
# coloured by the district they belong to.
for i in month_lst:
    tmp = df2[df2['month'] == i].nlargest(20, 'engagement_index')
    sns.displot(data=tmp, x='engagement_index', hue='district_id', kind="kde")
    # The x axis carries the metric itself (districts are the hue), and a KDE
    # plots a density, not a count.
    plt.xlabel("engagement_index")
    plt.ylabel("density")
    plt.title(f"Highest Engagement index in {i}")
    # Render each month's figure as soon as it is complete.
    plt.show()

In [None]:
# For every month: density of the 20 highest pct_access records,
# coloured by the district they belong to.
for i in month_lst:
    tmp = df2[df2['month'] == i].nlargest(20, 'pct_access')
    sns.displot(data=tmp, x='pct_access', hue='district_id', kind="kde")
    # The x axis carries the metric itself (districts are the hue), and a KDE
    # plots a density, not a count.
    plt.xlabel("pct_access")
    plt.ylabel("density")
    plt.title(f"Highest pct_access in {i}")
    # Render each month's figure as soon as it is complete.
    plt.show()

> We can get the district_ids with highest access, and engagement index from those graphs

In [None]:
# Bar chart of the number of records per month.
# Series.plot ignores x=/y= (a Series has a single value column), so the
# axis labels are set explicitly on the axes instead.
fig, ax = plt.subplots(figsize=(20, 15))
df2['month'].value_counts().plot(kind="bar", rot=50, fontsize=35, ax=ax)
ax.set_xlabel("month", fontsize=35)
ax.set_ylabel("count", fontsize=35);

## Aggregation of the data by date, grouped with day and month

In [None]:
# Daily averages of both usage metrics. `month` and `day` are derived from
# `time`, so grouping by all three yields one row per date while keeping the
# two helper columns available for plotting.
# The string "mean" is used instead of np.mean: passing the NumPy callable to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
df2_aggregated = (
    df2.groupby(['time', 'month', 'day'])
       .agg({'engagement_index': 'mean', 'pct_access': 'mean'})
       .reset_index()
)
df2_aggregated

In [None]:
# Mean daily engagement_index per month (seaborn aggregates the daily rows
# that share the same month). No `palette` is passed: without a `hue`
# variable seaborn ignores it and emits a warning.
plt.figure(figsize=(15,11))
sns.lineplot(data=df2_aggregated, x="month", y="engagement_index")
plt.title("Average engagement_index each month");

In [None]:
# Mean daily pct_access per month (seaborn aggregates the daily rows that
# share the same month). No `palette` is passed: without a `hue` variable
# seaborn ignores it and emits a warning.
plt.figure(figsize=(15,11))
sns.lineplot(data=df2_aggregated, x="month", y="pct_access")
plt.title("Average pct_access each month");

> As expected, the percentage of access closely tracks the engagement_index; both decrease in July.

In [None]:
plt.figure(figsize=(15,10));
sns.lineplot(y=df2_aggregated['engagement_index'],x=df2_aggregated["day"]);
plt.title("Average engagement_index each day");

> It seems that within each stretch of about 5 days (almost a week) access to the products increases gradually until the middle of the period and then decreases again, most likely because of the weekend.

# Reading products_info data

# Wrangling & Cleaning

In [None]:
df_product = pd.read_csv('/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv')
df_product

## Checking null values

In [None]:
df_product.isnull().sum()

## Cleaning null values

In [None]:
df_product.dropna(inplace=True)

In [None]:
df_product.isnull().sum()

### Renaming (LP ID)

In [None]:
df_product.rename(columns={'LP ID':'lp_id'}, inplace=True)

## Changing type of lp_id into object

In [None]:
df_product['lp_id'] = df_product['lp_id'].astype(int).astype(str)

## Checking providers count

In [None]:
df_product['Provider/Company Name'].value_counts()

## Checking sectors count

In [None]:
df_product['Sector(s)'].value_counts()

## Checking Primary Essential Function count

In [None]:
df_product['Primary Essential Function'].value_counts()

In [None]:
df_product.info()

## Splitting Primary Essential Function into two more clear features

In [None]:
# Split "ABBV - description" at the FIRST "-" into two cleaner columns:
#   primary_essential_abbv      -> text before the dash (e.g. "LC")
#   Primary_Essential_Function  -> everything after it (may contain more dashes)
# str.partition always returns three columns (before, separator, after), so a
# value without any "-" yields an empty description instead of raising.
parts = df_product['Primary Essential Function'].str.partition('-')
df_product['primary_essential_abbv'] = parts[0].str.strip()
df_product['Primary_Essential_Function'] = parts[2].str.strip()
df_product

In [None]:
df_product['Primary_Essential_Function'].value_counts()

## Cleaning the Primary Essential Function; maybe I can use it in text analysis

In [None]:
# Strip the punctuation characters , - & ( ) in a single pass.
# `regex=True` is explicit because the default of Series.str.replace differs
# between pandas versions, and a bare "(" is not a valid regular expression.
df_product['Primary_Essential_Function'] = df_product['Primary_Essential_Function'].str.replace(r"[,\-&()]", "", regex=True)

In [None]:
df_product['Primary_Essential_Function'].value_counts()

## Checking Sectors(s) count after making it more clean

In [None]:
df_product['Sector(s)'] = df_product['Sector(s)'].str.replace("; ","_").str.replace(" ","_")

In [None]:
df_product['Sector(s)'].value_counts()

## Dropping unnecessary columns

In [None]:
df_product.drop(['URL','Primary Essential Function'], axis=1, inplace=True)
df_product

# Exploring this data

## estimation of no. of top Companies

In [None]:
print("estimation of no. of Provider/Company Name")
o = df_product.groupby('Provider/Company Name').size().reset_index(name='count')
o = o[o['count'] > 2]
o = o.sort_values(["count"], ascending=False)
o

In [None]:
# Bar chart of providers that have more than two products.
# DataFrame.plot creates its own figure, so the size must be passed via
# `figsize`; a preceding plt.figure(...) would only leave an empty figure.
ax = o.plot(x="Provider/Company Name", y="count", kind="bar", rot=90, fontsize=10, figsize=(8, 10))
ax.set_xlabel('estimation of no. of Provider/Company Name', fontsize=20)
plt.show();

> Google LLC company has the highest share

## How Primary_Essential_Function and primary_essential_abbv affects the data

In [None]:
plt.figure(figsize=[15, 10]);
sb.countplot(data = df_product, x='Primary_Essential_Function', hue='primary_essential_abbv')
plt.xticks(rotation=90);
plt.xlabel('Distribution of Primary_Essential_Function and primary_essential_abbv', fontsize=20);

## Estimating Primary_Essential_Function for each abbreviation

In [None]:
df_product['primary_essential_abbv'].value_counts()

> Clean the LC/CM/SDO category 

In [None]:
df_product['primary_essential_abbv'] = df_product['primary_essential_abbv'].str.replace("/","_")

### Filtering dataframe with "LC"

In [None]:
df_product_LC = df_product[df_product['primary_essential_abbv'] == 'LC']

In [None]:
plt.figure(figsize=[15, 10]);
sb.countplot(data = df_product_LC, x='Primary_Essential_Function', hue='primary_essential_abbv')
plt.xticks(rotation=90);
plt.xlabel('Distribution of Primary_Essential_Function in LC', fontsize=20);

### Estimation for "CM"

In [None]:
df_product_CM = df_product[df_product['primary_essential_abbv'] == 'CM']
plt.figure(figsize=[15, 10]);
sb.countplot(data = df_product_CM, x='Primary_Essential_Function', hue='primary_essential_abbv')
plt.xticks(rotation=90);
plt.xlabel('Distribution of Primary_Essential_Function in CM', fontsize=20);

### Estimation for "SDO"

In [None]:
df_product_SDO = df_product[df_product['primary_essential_abbv'] == 'SDO']
plt.figure(figsize=[15, 10]);
sb.countplot(data = df_product_SDO, x='Primary_Essential_Function', hue='primary_essential_abbv')
plt.xticks(rotation=90);
plt.xlabel('Distribution of Primary_Essential_Function in SDO', fontsize=20);

In [None]:
df_product['primary_essential_abbv'].value_counts()

### Estimation for "LC/CM/SDO"

In [None]:
df_product_LC_CM_SDO = df_product[df_product['primary_essential_abbv'] == 'LC_CM_SDO']
plt.figure(figsize=[15, 10]);
sb.countplot(data = df_product_LC_CM_SDO, x='Primary_Essential_Function', hue='primary_essential_abbv')
plt.xticks(rotation=90);
plt.xlabel('Distribution of Primary_Essential_Function in LC_CM_SDO', fontsize=20);

> Interesting...!!!

In [None]:
df_product[df_product['primary_essential_abbv'] == 'LC_CM_SDO']

> All "LC_CM_SDO" are all "other" function

> Let's estimate percentage of each abbv.

In [None]:
# Donut chart: share of products per primary_essential_abbv category.
fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('primary_essential_abbv distribution', size = 20)
abbv_counts = df_product['primary_essential_abbv'].value_counts()
labels = list(abbv_counts.index)
sizes = abbv_counts.values
# One offset per wedge: ax.pie raises if `explode` and `sizes` differ in
# length, so derive it from the data instead of hard-coding four entries.
explode = (0.05,) * len(sizes)
ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.0f%%', pctdistance=0.7, colors=["#FFFF33","#ff9100","#eaaa00","#6d6875"])
# White disc in the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show();

> Let's analyze the most used Primary_Essential_Function through word cloud

In [None]:
cloud_Primary_Essential_Function = WordCloud(width=1440, height=1080).generate(" ".join(df_product['Primary_Essential_Function'].astype(str)))
plt.figure(figsize=(15, 10))
plt.imshow(cloud_Primary_Essential_Function)
plt.axis('off')

> The same here, check for most used product name used

In [None]:
cloud = WordCloud(width=1440, height=1080).generate(" ".join(df_product['Product Name'].astype(str)))
plt.figure(figsize=(15, 10))
plt.imshow(cloud)
plt.axis('off')

> Let's check the percentage distribution of the different Sector(s)

In [None]:
# Donut chart: share of products per Sector(s) value.
fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('Sector(s) distribution', size = 20)
sector_counts = df_product['Sector(s)'].value_counts()
labels = list(sector_counts.index)
sizes = sector_counts.values
# One offset per wedge: ax.pie raises if `explode` and `sizes` differ in
# length, so derive it from the data instead of hard-coding five entries.
explode = (0.05,) * len(sizes)
ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.0f%%', pctdistance=1.8, colors=["#F3CFC6","#FF00FF","#FFB6C1","#FF69B4", "#FF0000"])
# White disc in the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show();

# Reading district data

## Wrangling and cleaning

In [None]:
df_district = pd.read_csv('/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv')
df_district

## Checking null values

In [None]:
# Keep only the districts whose `state` is known: rows with a missing state
# are dropped and the index is renumbered from 0.
df_district = df_district[df_district['state'].notna()].reset_index(drop=True)

In [None]:
df_district.shape

In [None]:
df_district.isnull().sum()

## Merging data

In [None]:
# Attach product metadata to every engagement row.
# The join key is spelled out: without `on=`, pandas joins on ALL columns the
# two frames happen to share, which silently changes if a column is added.
# how='inner' (the default) drops engagement rows whose product is not in
# df_product.
df3 = pd.merge(df2, df_product, on='lp_id', how='inner')
df3.head()

## Changing type of "district_id"

In [None]:
df_district['district_id'] = df_district['district_id'].astype(int).astype(str)

## Merging district data

In [None]:
# Attach district metadata to every engagement/product row.
# The join key is spelled out: without `on=`, pandas joins on ALL columns the
# two frames happen to share. how='inner' (the default) drops rows whose
# district is not present in df_district.
df4 = pd.merge(df3, df_district, on='district_id', how='inner')
df4.head()

In [None]:
df4.info()

In [None]:
df4.describe()

## Generating geo map for count of states using online products this year

## The states that used online products most frequently

In [None]:
locations=pd.DataFrame({"Name":df_district['state'].unique()})

In [None]:
# Look up latitude/longitude for every state name in `locations`.
from geopy.extra.rate_limiter import RateLimiter  # only needed by this cell

# NOTE(review): Nominatim's usage policy asks for a user_agent that
# identifies the application; "app" is kept from the original — replace it.
geolocator = Nominatim(user_agent="app")
# Space the requests at least one second apart, as the public Nominatim
# service requires. With its default settings the limiter retries on errors
# and finally returns None, which is recorded as NaN below.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

lat = []
lon = []
for state_name in locations['Name']:
    # A bare name such as "Georgia" or "Washington" is ambiguous, so first ask
    # for a US *state* with a structured query, then fall back to a free-text
    # search restricted to the US.
    match = geocode({"state": state_name, "country": "United States"})
    if match is None:
        match = geocode(state_name, country_codes="us")
    if match is None:
        lat.append(np.nan)
        lon.append(np.nan)
    else:
        lat.append(match.latitude)
        lon.append(match.longitude)
locations['lat'] = lat
locations['lon'] = lon

In [None]:
st_locations = pd.DataFrame(df_district['state'].value_counts().reset_index())
st_locations

In [None]:
st_locations.columns=['Name','count']
final_loc = st_locations.merge(locations,on='Name',how="left").dropna()
final_loc.head(15).style.set_caption("Locations Dataframe").set_properties(**{'background-color': 'cyan',
                           'color': 'darkblue','border': '1.5px solid black'})

In [None]:
def generateBaseMap(default_location=[37.0902, -95.7129], default_zoom_start=4):
    """Create and return a new folium map.

    Parameters
    ----------
    default_location : list of float
        ``[latitude, longitude]`` the map is centred on.
        # NOTE(review): the default looks like the centre of the US — confirm.
    default_zoom_start : int
        Initial zoom level passed to ``folium.Map``.

    Returns
    -------
    folium.Map
    """
    return folium.Map(location=default_location, zoom_start=default_zoom_start)

st_map = generateBaseMap()
HeatMap(final_loc[['lat','lon','count']],zoom=20,radius=20).add_to(st_map)
st_map

> Here we can see the states that used online products most frequently, e.g. Connecticut, Utah and Illinois.

## Geo map for the states with the highest pct_access to online products

In [None]:
# Total pct_access per (state, provider) pair, largest first.
# The state column is renamed to 'Name' so the frame can later be merged with
# the `locations` lookup table.
pct_company_state = df4.groupby(['state','Provider/Company Name'],as_index=False)['pct_access'].sum()
pct_company_state.columns=['Name','Provider/Company Name','pct_access']
pct_company_state = pct_company_state.sort_values('pct_access',ascending=False)

# Each bar is one (state, provider) pair, so both parts go into the tick
# label; labelling by state alone made repeated states indistinguishable.
top_pairs = pct_company_state.iloc[0:10]
ax = top_pairs.assign(
    pair=top_pairs['Name'] + ' / ' + top_pairs['Provider/Company Name']
).plot(x='pair', y='pct_access', kind='bar')
ax.set_xlabel('state / provider')
ax.set_ylabel('sum of pct_access');

> This gives quite similar results to the most frequent use by state.

In [None]:
pcc_comp_loc = pct_company_state.merge(locations,on='Name',how="left").dropna()
pcc_comp_loc.head(15).style.set_caption("Locations Dataframe").set_properties(**{'background-color': 'cyan',
                           'color': 'darkblue','border': '1.5px solid black'})

In [None]:
st_map = generateBaseMap()
HeatMap(pcc_comp_loc[['lat','lon','pct_access']],zoom=20,radius=20).add_to(st_map)
st_map

> As we can see, Connecticut is again at the top, followed by Illinois and Massachusetts!

## Investigating the most used products from companies according to the sum of pct_access not the frequent use.

In [None]:
best_companies=df4.groupby(['Provider/Company Name'],as_index=False)['pct_access'].sum()
best_companies.sort_values('pct_access',ascending=False).iloc[0:10].plot(x='Provider/Company Name',y='pct_access',kind='bar')

> Investigating the most-used companies that contributed e-learning products according to pct_access, we found only a slight variation compared with the same investigation based on the count of those products in the data: "Google LLC" is still at the top, but after that there are small variations, as shown in the figure above.

# Summary

> The engagement index starts at about 60 in January and increases to reach a peak (160) in mid-April, then decreases to hit its bottom (20) in July, then hits the peak again in October.
The three lowest months are the holiday months (June, July, August), while the highest are in the middle of the first semester and the beginning of the 2nd semester.
The peak (6.5) of the percentage of students using the products is in February and October, and it hits its bottom (0.1) in July.
This tells us that the largest number of students use the products at the beginning of each semester (February and October) and the fewest, logically, in the holiday months (June, July, August).
From this we know that most students are excited to use these products at the beginning of each semester; however, their need to use them is greater in the middle of the 2nd semester and the beginning of the 1st semester, which indicates that many students who were excited to use these products don't actually know how to benefit from them in their time of need.
> We also investigated the states that used online products most frequently, and those with the highest pct_access.