In [None]:
import numpy as np 
import pandas as pd 
import math
import glob
import os
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import wandb
import folium
from geopy.geocoders import Nominatim
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
import plotly.graph_objs as go
from pandas_profiling import ProfileReport
pal = sns.color_palette()
from wordcloud import WordCloud
import plotly.express as px
from sklearn import preprocessing
import plotly.offline as py
import plotly.tools as tls
from keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout
from matplotlib import style
from datetime import date
import matplotlib.dates as mdates
import datetime as dt
plt.rcParams.update({'font.size': 14})
import re
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings("ignore")

In [None]:
products_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")
products_df.head()

In [None]:
products_df.info()

In [None]:
msno.bar(products_df,color='#7209b7', sort="ascending", figsize=(10,5), fontsize=12)
plt.show()

In [None]:
# profile = ProfileReport( products_df, title='Pandas profiling report ' , html={'style':{'full_width':True}})
# profile.to_notebook_iframe()

In [None]:
districts_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
districts_df.head()

In [None]:
districts_df.info()

In [None]:
# profile = ProfileReport( districts_df, title='Pandas profiling report ' , html={'style':{'full_width':True}})
# profile.to_notebook_iframe()

In [None]:
msno.bar(districts_df,color='#f72585', sort="ascending", figsize=(10,5), fontsize=12)
plt.show()

In [None]:
# Folder holding the engagement CSVs; each file name (minus extension) is
# used as the district id for the rows it contains.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# Sorted so the row order of the combined frame is the same on every run.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Take the id from the file's base name instead of a fixed position in
    # the path, so the cell keeps working if `path` changes depth.
    district_id = os.path.basename(filename).split(".")[0]
    df["district_id"] = district_id
    li.append(df)

# ignore_index gives the combined frame a fresh 0..N-1 index in one step.
engagement_df = pd.concat(li, ignore_index=True)
engagement_df.head()

In [None]:
engagement_df.info()

In [None]:
# profile = ProfileReport( engagement_df, title='Pandas profiling report ' , html={'style':{'full_width':True}})
# profile.to_notebook_iframe()

In [None]:
msno.bar(engagement_df,color='#4895ef', sort="ascending", figsize=(10,5), fontsize=12)
plt.show()

In [None]:
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
#====== Function to create a dataframe of value counts ======
def count_values(df,col,top=False):
    """Tabulate how often each distinct value of ``df[col]`` occurs.

    Parameters
    ----------
    df : DataFrame holding the column to tabulate.
    col : name of the column to count.
    top : when it compares equal to ``True``, keep only the first ten rows.

    Returns
    -------
    DataFrame with two columns, ``col`` and ``"counts"``, in the order
    produced by ``value_counts()``.
    """
    tally = df[col].value_counts().reset_index()
    table = pd.DataFrame(tally.values, columns=[col, "counts"])
    return table[:10] if top == True else table

In [None]:
plt.figure(figsize=(16, 10))
sns.countplot(y="state",data=districts_df,order=districts_df.state.value_counts().index,palette="Blues",linewidth=3)
plt.title("State Distribution",font="Serif", size=20)
plt.show()

In [None]:
districts_df["state"].value_counts().head(10).plot(kind = 'pie', autopct='%1.1f%%', figsize=(10, 10), startangle=0)

In [None]:
# Donut chart of how many districts fall in each locale category.
fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('Locale Distribution', size = 20, font="Serif")
# Count once and reuse for both labels and wedge sizes.
locale_counts = districts_df.locale.value_counts()
labels = list(locale_counts.index)
sizes = locale_counts.values
# One offset per wedge: a fixed-length tuple makes ax.pie raise as soon as
# the number of locale categories is not exactly four.
explode = (0.05,) * len(sizes)
ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.0f%%', pctdistance=0.7, colors=["#d45d00","#ff9100","#eaaa00","#6d6875"])
# White disc in the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()

In [None]:
plt.figure(figsize=(16, 10))
sns.countplot(y='Provider/Company Name', data=products_df, order=products_df["Provider/Company Name"].value_counts().index[:10])
plt.title("Top 10 Provider/Company Names",font="Serif", size=20)
plt.show()

In [None]:
# Interactive donut of the raw "Sector(s)" strings (combined sectors such as
# "PreK-12; Higher Ed" are kept as their own category here).
values = products_df['Sector(s)'].value_counts()
labels = list(values.index)
fig = go.Figure(data=[go.Pie(labels=labels,
                             values=values,hole=.3)])
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=20,
                  marker=dict( line=dict(color='#000000', width=3)))
# `titlefont` is a deprecated layout attribute; the nested title.font form
# is the supported spelling.
fig.update_layout(title=dict(text="Sector Distribution ",
                             font=dict(size=30)))
fig.show()

In [None]:
# Tally how many products list each individual sector. A product whose
# "Sector(s)" value names several sectors (separated by ";") contributes
# to each of them.
sector_totals = {'PreK-12': 0, 'Higher Ed': 0, 'Corporate': 0}
for raw_sectors in products_df["Sector(s)"]:
    if pd.isnull(raw_sectors):
        continue
    for piece in raw_sectors.split(";"):
        sector = piece.strip()
        if sector in sector_totals:
            sector_totals[sector] += 1
c1 = sector_totals['PreK-12']
c2 = sector_totals['Higher Ed']
c3 = sector_totals['Corporate']

# Donut chart of the three tallies.
labels = ['PreK-12','Higher Ed','Corporate']
sizes = [c1,c2, c3]
explode = (0.05, 0.05, 0.05)
fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('Sector Distribution', size = 20, font="Serif")
ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.2f%%', pctdistance=0.7, colors=["#ff228a","#20b1fd","#ffb703"])
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()

In [None]:
# Split "Primary Essential Function" at its first hyphen into a main
# category (text before the hyphen) and a sub category (text after it).
primary_essential_main = []
primary_essential_sub = []
for s in products_df["Primary Essential Function"]:
    if pd.isnull(s):
        primary_essential_main.append(np.nan)
        primary_essential_sub.append(np.nan)
        continue
    # partition never raises: a value without any hyphen keeps its full text
    # as the main category and gets a missing sub category, where indexing
    # the result of split("-", 1) with [1] would raise IndexError.
    main_part, hyphen, sub_part = s.partition("-")
    primary_essential_main.append(main_part.strip())
    primary_essential_sub.append(sub_part.strip() if hyphen else np.nan)

products_df["primary_essential_main"] = primary_essential_main
products_df["primary_essential_sub"] = primary_essential_sub


In [None]:
# Count occurrences of each code inside the non-null main-category strings.
# str.count matches substrings, so a value containing several codes adds to
# every code it contains.
main_codes = [code for code in products_df["primary_essential_main"] if not pd.isnull(code)]
c1 = sum(code.count("CM") for code in main_codes)
c2 = sum(code.count("LC") for code in main_codes)
c3 = sum(code.count("SDO") for code in main_codes)

# Donut chart of the three totals.
labels = ['CM','LC','SDO']
sizes = [c1, c2, c3]
explode = (0.05, 0.05, 0.05)
fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('Primary Essential Function', size = 20, font="Serif")
ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.2f%%', pctdistance=0.7, colors=["#18ff9f","#2cfbff","#ffb703"])
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()

In [None]:
products_df["primary_essential_main"].value_counts().plot(kind = 'pie', autopct='%1.3f%%', figsize=(15, 15), startangle=0).legend()

In [None]:
# plt.figure(figsize=(16, 20))
# sns.countplot(y='primary_essential_sub', data=products_df, order=products_df["primary_essential_sub"].value_counts().index,color = pal2[6])
# plt.title("Primary Essential Function(Sub)",font="Serif", size=20)
# plt.show()

In [None]:
engagement_df['time'] = pd.to_datetime(engagement_df['time'])

In [None]:
print(products_df["LP ID"].nunique())
print(engagement_df["lp_id"].nunique())

In [None]:
products_engagement_data = pd.merge(products_df, engagement_df, left_on='LP ID', right_on='lp_id')
products_engagement_data.head()

In [None]:
print(districts_df["district_id"].nunique())
print(engagement_df["district_id"].nunique())

In [None]:
# The ids taken from the file names are text; cast them to integers so they
# can be matched against districts_df before joining the two tables.
engagement_df["district_id"] = engagement_df["district_id"].astype(str).astype(int)
districts_engagement_data = districts_df.merge(engagement_df, on='district_id')
districts_engagement_data.head()

In [None]:
# Share of districts per locale; the denominator is the total number of
# rows in districts_df, including rows whose locale is missing.
locale_tally = districts_df['locale'].value_counts()
ds = pd.DataFrame({
    'locale': locale_tally.index,
    'percent': locale_tally.values / len(districts_df),
})

pie_options = dict(
    names='locale',
    values='percent',
    color_discrete_sequence=px.colors.sequential.Mint,
    title='Occurrence of Locale in the District Information Data:',
    width=700,
    height=500,
)
fig = px.pie(ds, **pie_options)
fig.show()

In [None]:
# sns.set_style('darkgrid')
# fig,ax = plt.subplots()
# ax.axis('equal')

# plt.rcParams.update({'text.color' : "black",
#                      'axes.labelcolor' : "black"})
# plt.rcParams.update({'font.size': 35})

# cm, lc, other, sdo=[plt.cm.Blues, plt.cm.Reds, plt.cm.Greens, plt.cm.pink_r]

# ax.pie(gpef1['lp_id'],labels=gpef1.index,colors=[cm(0.6) , lc(0.6) , other(0.8) , sdo(0.6)], radius=10,explode=[0,0,0,0])
# ax.pie(gpef2['lp_id'],labels=gpef2.index,colors=[cm(0.2),cm(0.4),lc(0.1),lc(0.2),lc(0.3),lc(0.4),other(0.5),sdo(0.3),sdo(0.4)], autopct='%1.1f%%', radius=8)
                  
# plt.title("Google",fontdict={'fontsize':50})
# ax.legend(loc='lower right',bbox_to_anchor=(-2,1.5))
# plt.subplots_adjust(left=0.0, bottom=0.1, right=0.85)

In [None]:
def plot_time_series(df,col1,col2,col3):
    """Plot ``col2`` over ``col3`` for the five ``col1`` groups with the highest mean.

    Parameters
    ----------
    df : DataFrame containing the three columns.
    col1 : grouping column; each selected group gets its own facet row.
    col2 : numeric column that is ranked and plotted.
    col3 : column used as the x axis (the pivot index).

    The figure is shown with ``fig.show()``; nothing is returned.
    """
    # Rank the groups by their mean of col2 and keep the first five.
    group_means = df[[col1, col2]].groupby([col1])[col2].mean()
    top_groups = group_means.sort_values(ascending=False).index[:5].tolist()

    # Restrict to those groups, then reshape to one column per group;
    # pivot_table aggregates repeated (col3, col1) pairs with its default.
    in_top = df[col1].isin(top_groups)
    subset = df[in_top].reset_index(drop=True)[[col3, col1, col2]]
    wide = subset.pivot_table(index=col3, columns=col1, values=col2)

    layout_options = dict(
        title=(col1 + " , " + col2 + " , " + col3).title(),
        title_x=0.39,
        template="plotly_white",
        paper_bgcolor='#f5f7f8',
        font={'family': 'Serif', 'size': 20},
    )
    fig = px.line(wide, facet_col=col1, facet_col_wrap=1, width=800, height=800)
    fig.update_layout(**layout_options)
    fig.show()

In [None]:
plot_time_series(districts_engagement_data,"state","pct_access","time")
plot_time_series(districts_engagement_data,"state","engagement_index","time")
plot_time_series(districts_engagement_data,"locale","pct_access","time")
plot_time_series(districts_engagement_data,"locale","engagement_index","time")

In [None]:
def append_engagement_data(path="../input/learnplatform-covid19-impact-on-digital-learning/engagement_data"):
    '''Read every engagement file under `path` into one dataframe.

    Parameters
    ----------
    path : directory containing one file per district. Defaults to the
        Kaggle input folder this notebook was written against.

    Returns
    -------
    full_df : all files stacked under a fresh 0..N-1 index, with three added
        columns:
        - district_id: file name up to its first ".", kept as text
        - holiday: 1 when `time` falls in one of the hard-coded holiday
          windows below, else 0
        - outbreak: 0 when `time` is on or before 2020-03-24, else 1

    Raises
    ------
    ValueError : when `path` contains no files.

    `time` is compared as text, which orders correctly only for
    YYYY-MM-DD formatted dates.
    '''
    # Sorted so the row order is reproducible between runs.
    all_paths = sorted(glob.glob(os.path.join(path, "*")))
    if not all_paths:
        raise ValueError(f"No engagement files found in {path!r}")

    all_dfs = []
    for file_path in all_paths:
        district_df = pd.read_csv(file_path)
        district_df["district_id"] = os.path.basename(file_path).split(".")[0]
        all_dfs.append(district_df)

    # ignore_index avoids the repeated 0..k labels each file would bring.
    full_df = pd.concat(all_dfs, axis=0, ignore_index=True)

    # Add time features
    day = full_df["time"]

    # Holidays: two single days plus four inclusive date windows.
    holiday_windows = [
        ("2020-04-05", "2020-04-09"),
        ("2020-06-12", "2020-09-10"),
        ("2020-11-25", "2020-11-29"),
        ("2020-12-21", "2020-12-31"),
    ]
    is_holiday = day.isin(["2020-01-30", "2020-01-31"])
    for start, end in holiday_windows:
        is_holiday |= day.between(start, end)
    full_df["holiday"] = is_holiday.astype(int)

    # Before/After Outbreak
    full_df["outbreak"] = np.where(day <= "2020-03-24", 0, 1)

    return full_df

In [None]:
# Build a wide daily table of median engagement for the most-engaged
# products and save it as timeprod.csv.

# Import data
# NOTE: this rebinds `df` to the full engagement table; later cells that
# use `df` depend on this cell having been run.
df = append_engagement_data()
#districts = clean_districts()
prods = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")

# Keep the n product ids with the highest mean engagement_index over all rows.
n = 17
top_n = df.groupby(["lp_id"])["engagement_index"].mean().reset_index().\
            sort_values("engagement_index", ascending=False).\
            head(n)["lp_id"].values

# Transform the data
# Median engagement per (day, product), then attach the product name. The
# merge uses the pandas default join, so ids without a row in `prods` drop
# out and fewer than n products may remain.
timeprod = df.groupby(["time", "lp_id"])["engagement_index"].median().\
                reset_index()
timeprod = pd.merge(timeprod, prods[["LP ID", "Product Name"]], 
                     left_on="lp_id", right_on="LP ID")
timeprod = timeprod[timeprod["lp_id"].isin(top_n)]

# Reshape to one row per day and one column per product name; day/product
# combinations with no data become 0.
timeprod = pd.pivot(timeprod.drop(columns=["lp_id", "LP ID"]), index="time",
                     columns="Product Name", values="engagement_index").\
                    reset_index().fillna(0)

# Log scale for better visual
# columns[1:] skips "time" (column 0 after reset_index). log10(0) is -inf,
# which is mapped back to 0; the result is then scaled by 100.
for c in timeprod.columns[1:]:
    timeprod[c] = np.log10(timeprod[c]).\
                    replace([np.inf, -np.inf], 0)
    timeprod[c] = timeprod[c]*100

# Convert from string to number
# Keep the original date text in a new first column "time2", then replace
# "time" with the integer `.value` of each parsed timestamp.
string_date = timeprod["time"]
timeprod.insert(0, 'time2', string_date)
timeprod["time"] = pd.to_datetime(timeprod["time"]).apply(lambda x: x.value)
# columns[1:] now skips "time2" and covers "time" plus every product column;
# the int64 cast drops the fractional part of the scaled log values.
for col in timeprod.columns[1:]: 
    timeprod[col] = timeprod[col].astype("int64")

# Save to .csv
timeprod.to_csv("timeprod.csv", index=False)

In [None]:
d222=pd.read_csv("./timeprod.csv")
d222.head()

In [None]:
profile = ProfileReport( d222, title='Pandas profiling report ' , html={'style':{'full_width':True}})
profile.to_notebook_iframe()

In [None]:
d222.columns

In [None]:
# #import matplotlib as mpl
# cols=['Canvas', 'ClassLink', 'Google Classroom',
#        'Google Docs', 'Google Drive', 'Google Forms', 'Kahoot!', 'Meet',
#        'Schoology', 'YouTube']
# for i in cols:
#     plt.plot(d222['time'], d222[i], color = 'blue')
#     plt.title('Top 10 Educational Products Median Daily Student Engagement (log10 scale)')
#     plt.xlabel("time")
#     plt.ylabel("page loaded by a student")
#     plt.rcParams['figure.figsize'] = [200, 200]
#     plt.rcParams['xtick.labelsize'] = 30
#     plt.rcParams['ytick.labelsize'] = 30
#     plt.show()

In [None]:
from wordcloud import WordCloud
cloud = WordCloud(width=1440, height=1080).generate(" ".join(districts_df['state'].astype(str)))
plt.figure(figsize=(15, 10))
plt.imshow(cloud)
plt.axis('off')

In [None]:
# Interactive donut of the number of districts per state.
values = districts_df['state'].value_counts()
labels = list(values.index)
fig = go.Figure(data=[go.Pie(labels=labels,
                             values=values,hole=.3)])
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=20,
                  marker=dict( line=dict(color='#000000', width=3)))
# `titlefont` is a deprecated layout attribute; the nested title.font form
# is the supported spelling.
fig.update_layout(title=dict(text="State Distribution ",
                             font=dict(size=30)))
fig.show()

In [None]:
geolocator = Nominatim(user_agent="Ruch")

def feature_generation(df):
    """Add ``Latitude`` and ``Longitude`` columns by geocoding ``df['state']``.

    Each value of the ``state`` column is looked up with the module-level
    ``geolocator``. The frame is modified in place and also returned.

    A state the geocoder cannot resolve gets ``np.nan`` for both
    coordinates, so the columns stay numeric and NaN checks on them work
    (a placeholder string would make ``math.isnan`` raise TypeError).
    Errors raised by the lookup itself are not caught here.
    """
    lat=[]
    long=[]
    for state in df['state']:
        location = geolocator.geocode(state)
        if location is None:
            # No match for this name.
            lat.append(np.nan)
            long.append(np.nan)
        else:
            lat.append(location.latitude)
            long.append(location.longitude)
    df['Latitude'] = lat
    df['Longitude'] = long

    return df

def map_df(df,col1,col2):
    """Return the 20 ``col1`` groups with the highest mean ``col2``, with coordinates.

    The result has one row per group (columns ``col1`` and ``col2``, sorted
    by descending mean) and is passed through ``feature_generation`` before
    being returned.
    """
    ranked_means = df[[col1, col2]].groupby([col1])[col2].mean().sort_values(ascending=False)
    top_twenty = ranked_means.head(20).to_frame().reset_index()
    return feature_generation(top_twenty)

state_access = map_df(districts_engagement_data, "state", "pct_access")
state_engagement_index = map_df(districts_engagement_data, "state", "engagement_index")

In [None]:
north_america_map = folium.Map(location=[38.9, -77.05], tiles='Stamen Watercolor', zoom_start=3)
north_america_map

In [None]:
# One clustered marker per state, with its mean pct_access in the tooltip.
mc = MarkerCluster()
for idx, row in state_access.iterrows():
    # Coerce to numbers first: a non-numeric placeholder becomes NaN instead
    # of making the NaN check raise TypeError.
    latitude = pd.to_numeric(row['Latitude'], errors='coerce')
    longitude = pd.to_numeric(row['Longitude'], errors='coerce')
    if pd.isna(latitude) or pd.isna(longitude):
        continue
    popup = """
        State : <b>%s</b><br>
        Percentage Access : <b>%s</b><br>
        """ % (row['state'], row['pct_access'])
    mc.add_child(Marker([latitude, longitude],tooltip=popup))
# Attach the finished cluster once, after all markers have been added.
north_america_map.add_child(mc)
north_america_map

In [None]:
# One clustered marker per state, with its mean engagement_index in the
# tooltip. The cluster is added to the existing north_america_map, so
# anything already attached to that map stays visible.
mc = MarkerCluster()
for idx, row in state_engagement_index.iterrows():
    # Coerce to numbers first: a non-numeric placeholder becomes NaN instead
    # of making the NaN check raise TypeError.
    latitude = pd.to_numeric(row['Latitude'], errors='coerce')
    longitude = pd.to_numeric(row['Longitude'], errors='coerce')
    if pd.isna(latitude) or pd.isna(longitude):
        continue
    popup = """
        State : <b>%s</b><br>
        Engagement Index : <b>%s</b><br>
        """ % (row['state'], row['engagement_index'])
    mc.add_child(Marker([latitude, longitude],tooltip=popup))
# Attach the finished cluster once, after all markers have been added.
north_america_map.add_child(mc)
north_america_map

In [None]:
# Fill missing values in a copy of `df`, column by column, with each
# column's most frequent value, then show the remaining null counts.
from sklearn.impute import SimpleImputer
df_most_frequent = df.copy()
# NOTE: despite the variable name, this imputer uses strategy='most_frequent'
# (the mode), not the mean.
mean_imputer = SimpleImputer(strategy='most_frequent')# 'mean' / 'median' are other strategies
# NOTE(review): the imputed array is written back through iloc; confirm the
# column dtypes afterwards are what later cells expect.
df_most_frequent.iloc[:,:] = mean_imputer.fit_transform(df_most_frequent)
df_most_frequent.isnull().sum()

In [None]:
# Fit a Prophet model to the imputed engagement data.
# NOTE(review): newer releases publish this package as `prophet`; confirm
# which name the target environment provides.
from fbprophet import Prophet
# Rename to the `ds` (date) / `y` (value) column names used for the fit.
df1=df_most_frequent.rename(columns={"time": "ds", "engagement_index": "y"})
# A bare expression in the middle of a cell is not displayed.
df1
m = Prophet()
# NOTE(review): df1 is not aggregated here, so it presumably holds many
# rows per date; confirm that fitting on the raw rows is intended.
m.fit(df1)

In [None]:
future = m.make_future_dataframe(periods=365)
future.tail()

In [None]:
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
fig1 = m.plot(forecast)
plt.xticks(rotation=45);

In [None]:
fig2 = m.plot_components(forecast, figsize=(20, 12))

In [None]:
from fbprophet.plot import plot_plotly
import plotly.offline as py
py.init_notebook_mode()

fig = plot_plotly(m, forecast)  # This returns a plotly Figure
py.iplot(fig)

In [None]:
_ = pd.pivot_table(df, values='engagement_index', index='time').plot(style='-o', title="Learning Engagement in Pandemics")
plt.xticks(rotation=45);