In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from glob import glob 
import os 
import time 
from IPython.display import display 
import gc 
from wordcloud import WordCloud 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
from tqdm import tqdm 
import scipy as sp 

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans 
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics.pairwise import cosine_similarity

import torch 
import torch.nn as nn 
from torch.utils.data import DataLoader, Dataset 

# Select the columns to use 

In [None]:
%time 

districs = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv", usecols=["district_id", "state", "locale"])
product = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv", usecols=["LP ID", "Primary Essential Function"])
districs = districs.rename(columns={"district_id": "id"})
product = product.rename(columns={"LP ID": "lp id"})
engagement = pd.DataFrame()

count = 0 
for i, f in enumerate(glob("../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/*.csv")):
    df = pd.read_csv(f)
    df["id"] = int(f.split("/")[-1].split(".")[0])
    engagement = pd.concat([engagement, df])
    count += 1 
    if count == 100:
        break

In [None]:
# Show the number of missing values per column for each of the three loaded tables.
for table in (districs, product, engagement):
    display(table.isnull().sum().to_frame())

## Merge the dataframes 

In [None]:
%%time

df = pd.merge(districs, engagement, how="right", left_on="id", right_on="id")
df = pd.merge(df, product, how="left", left_on="lp_id", right_on="lp id")
df.drop(["lp id", "id", "lp_id"], axis=1, inplace=True)

del districs, product, engagement
gc.collect()
df.head()

In [None]:
# Report the (rows, columns) size of the merged frame.
print(df.shape)

In [None]:
%%time 

'''
Convert the time column to a datetime type.
Add flags for holidays and for the start date of the COVID-19 outbreak.

Create the main/sub columns.

'''

def split_essential_main(x):
    """Return the main category, i.e. the first element of a split label.

    Parameters
    ----------
    x : list
        Parts of a "Primary Essential Function" label after splitting.

    Returns
    -------
    The first element of ``x``, or the string "missing" when ``x`` is not a
    list or is an empty list (an empty list would otherwise raise IndexError).
    """
    if not isinstance(x, list) or len(x) == 0:
        return "missing"
    return x[0]
    
def split_essential_sub(x):
    """Return the sub category, i.e. the second element of a split label.

    Parameters
    ----------
    x : list
        Parts of a "Primary Essential Function" label after splitting.

    Returns
    -------
    The second element of ``x``, or the string "missing" when ``x`` is not a
    list or has fewer than two elements (an empty list would otherwise raise
    IndexError).
    """
    if not isinstance(x, list) or len(x) < 2:
        return "missing"
    return x[1]
        
# datetime
df["time"] = pd.to_datetime(df.time)
# "holiday" marks Saturday/Sunday (dayofweek 5 and 6). Computed vectorised
# rather than with a per-row Python lambda, and without a temporary "week" column.
df["holiday"] = df.time.dt.dayofweek.isin([5, 6]).astype(int)
# Dates from 2020-01-01 through 2020-01-19 are flagged 0, every other date 1.
d = pd.date_range(start="2020-01-01", end="2020-01-19")
df["is_pandemic"] = (~df.time.isin(d)).astype(int)

# primary essential functions
df["Primary Essential Function"] = df["Primary Essential Function"].fillna("missing")
# Split on "-" and strip the surrounding blanks from every part, so the main/sub
# labels do not carry leading or trailing whitespace into plots and group keys.
df["split"] = df["Primary Essential Function"].apply(lambda x: [part.strip() for part in x.split("-")])
df["main"] = df["split"].apply(split_essential_main)
df["sub"] = df["split"].apply(split_essential_sub)
df.drop("split", axis=1, inplace=True)

gc.collect()
df.head()

# Value counts EDA.

### main and sub value counts.

In [None]:
# Left panel: share of each main category. Right panel: row count per sub category.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
ax = axes.ravel()

main = df["main"].value_counts()
ax[0].pie(x=main.to_numpy(), labels=main.index)
ax[0].set_title("main counts")

sub = df["sub"].value_counts().to_frame()
sub.plot.bar(ax=ax[1])
ax[1].set_title("sub counts")

# The count tables are only needed for drawing; free them before rendering.
del sub, main
gc.collect()

plt.show()

### Sub count in a specific main 

In [None]:
def main_plot(df, n=10):
    """Draw, for every main category, a bar chart of its ``n`` most frequent subs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "main" and "sub" columns; rows whose main is "missing" are skipped.
    n : int
        Number of subs shown per main category.

    The grid has two columns and as many rows as needed, so any number of main
    categories fits (a fixed 2x2 grid fails with more than four).
    """
    mains = df.loc[df.main != "missing", "main"].unique()
    ncols = 2
    nrows = max(1, (len(mains) + ncols - 1) // ncols)
    fig, axes = plt.subplots(nrows, ncols, figsize=(12, 6 * nrows), squeeze=False)
    ax = axes.ravel()

    for i, m in enumerate(mains):
        # value_counts() is already sorted by count (descending), so head(n) is
        # the top n. This avoids sort_values("sub"), whose meaning depends on how
        # the pandas version names the counts column and index.
        top = df.loc[df.main == m, "sub"].value_counts().head(n).to_frame()
        top.plot(kind="bar", ax=ax[i])
        ax[i].set_title(f"main={m}")

    # Hide grid cells that received no chart.
    for unused in ax[len(mains):]:
        unused.set_visible(False)

    plt.suptitle("main vs sub counts.", fontsize=16)
    plt.tight_layout()

main_plot(df)

### Locale and state value counts.

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(22, 6))
ax = axes.ravel()

local = df.locale.value_counts()
ax[0].pie(x=local.values, labels=local.index, )
ax[0].set_title("locale value counts.")

# value_counts() returns the counts sorted in descending order, so head/tail give
# the most and least frequent states directly. This avoids sort_values("state"),
# whose meaning depends on how the pandas version names the counts column and index.
state = df.state.value_counts().to_frame()
state.head(10).plot(kind="bar", ax=ax[1])
ax[1].set_title("Top 10 state counts.")

state.tail(10).plot(kind="bar", ax=ax[2])
ax[2].set_title("Bottom 10 state counts.")

del state, local 
gc.collect()
plt.tight_layout()

### Which states are most common within each locale category 

In [None]:
%%time 

def show_cloud(df):
    local = df.loc[df.locale != "missing", "locale"].unique()
    
    fig, axes = plt.subplots(2, 2, figsize=(12, 12))
    ax = axes.ravel()
    for i, l in enumerate(local):
        x = df.loc[df.locale == l, ["state"]]
        if len(x) == 0: continue
        word = WordCloud(width=1500, height=1100, background_color="white", max_words=10).generate(" ".join(x["state"]))
        ax[i].imshow(word)
        ax[i].set_title(l)
        ax[i].set_xticks([])
        ax[i].set_yticks([])
    plt.tight_layout()
    
    
df["locale"] = df.locale.fillna("missing")
df["state"] = df.state.fillna("missing")
show_cloud(df)
gc.collect()

# Transition EDA 

In [None]:
def transition_all(df):
    """Plot the daily mean of pct_access and of engagement_index over all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "time", "pct_access" and "engagement_index" columns.

    Only the two plotted columns are aggregated. Averaging every column first
    wastes work and fails on non-numeric columns in recent pandas versions.
    """
    daily = df.groupby("time")[["pct_access", "engagement_index"]].mean()

    fig, axes = plt.subplots(1, 2, figsize=(22, 6))
    ax = axes.ravel()
    daily[["pct_access"]].plot(ax=ax[0])
    ax[0].set_title("pct_access")
    daily[["engagement_index"]].plot(ax=ax[1])
    ax[1].set_title("engagement_index")
    plt.show()
    gc.collect()
    
    
def transition_locale(df, is_access=True):
    """Overlay, for every locale, the daily mean access and engagement curves.

    The left axis receives one pct_access line per locale and the right axis one
    engagement_index line per locale; both legends list the locales in the order
    they were drawn. ``is_access`` is accepted but not used.
    """
    locales = df.locale.unique()
    fig, axes = plt.subplots(1, 2, figsize=(22, 6))
    ax = axes.ravel()
    access_ax, engage_ax = ax[0], ax[1]

    for name in locales:
        # Select this locale's rows once, then average each metric per day.
        subset = df.loc[df.locale == name, ["time", "pct_access", "engagement_index"]]
        daily = subset.groupby("time").mean()
        daily[["pct_access"]].plot(ax=access_ax)
        daily[["engagement_index"]].plot(ax=engage_ax)

    for axis, title in ((access_ax, "access"), (engage_ax, "engagement")):
        axis.legend(locales)
        axis.set_title(title)
    plt.show()
    gc.collect()
        
    
def _plot_locale_trend(df, column, title, window=7):
    """Plot, one panel per locale, the daily mean of ``column`` and its rolling mean.

    The rolling mean is taken over the daily series (one row per date), so the
    window spans ``window`` days and stays aligned with the dates it is plotted
    against. The grid grows with the number of locales instead of assuming four.
    """
    local = df.locale.unique()
    ncols = 2
    nrows = max(1, (len(local) + ncols - 1) // ncols)
    fig, axes = plt.subplots(nrows, ncols, figsize=(20, 6 * nrows), squeeze=False)
    ax = axes.ravel()
    for i, l in enumerate(local):
        daily = df.loc[df.locale == l, ["time", column]].groupby("time").mean()
        if daily.empty:
            # No rows for this locale value; leave the panel out.
            ax[i].set_visible(False)
            continue
        daily[f"rolling_{window}"] = daily[column].rolling(window=window).mean()
        daily.plot(ax=ax[i])
        ax[i].set_title(f"locale {l}")
    # Hide grid cells beyond the number of locales.
    for unused in ax[len(local):]:
        unused.set_visible(False)
    plt.suptitle(title, fontsize=18)
    plt.tight_layout()
    gc.collect()


def transition_locale_access_trand(df, window=7):
    """Per-locale panels of daily mean pct_access with a ``window``-day rolling mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "time", "locale" and "pct_access" columns.
    window : int
        Length in days of the rolling mean; the default matches the "rolling_7"
        legend label.
    """
    _plot_locale_trend(df, "pct_access", "locale classies pct_access trainsitin trends.", window)


def transition_locale_engage_trand(df, window=7):
    """Per-locale panels of daily mean engagement_index with a ``window``-day rolling mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "time", "locale" and "engagement_index" columns.
    window : int
        Length in days of the rolling mean; the default matches the "rolling_7"
        legend label.
    """
    _plot_locale_trend(df, "engagement_index", "locale classies engagement_index trainsitin trends.", window)
    

### Engagement and access across all locales

In [None]:
# Daily mean pct_access and engagement_index over the whole frame.
transition_all(df)

### Locales overlaid 

In [None]:
# One line per locale, overlaid on a shared access axis and a shared engagement axis.
transition_locale(df)

### month vs engagement by holiday.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(22, 12))
ax = axes.ravel()

# Temporary helper column holding the calendar month of each row; removed again below.
df["month"] = df.time.dt.month
violin_panels = [("engagement_index", "engagement"), ("pct_access", "pct_access")]
for panel, (metric, panel_title) in zip(ax, violin_panels):
    # Distribution of the metric per month, split by the holiday flag.
    sns.violinplot(data=df, x="month", y=metric, hue="holiday", ax=panel)
    panel.set_title(panel_title)

del df["month"]
gc.collect()
plt.show()

### Locale vs engagement and pct_access transition.

In [None]:
# One panel per locale for pct_access over time.
transition_locale_access_trand(df)

In [None]:
# One panel per locale for engagement_index over time.
transition_locale_engage_trand(df)

# 

### Holidays vs. weekdays, and before vs. after the start of the COVID-19 pandemic 

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(22, 10))
ax = axes.ravel()

# (flag on the x axis, metric on the y axis, panel title) for each of the four panels.
bar_panels = [
    ("holiday", "pct_access", "is holiday use access rate"),
    ("holiday", "engagement_index", "is holiday use engagement_index rate"),
    ("is_pandemic", "pct_access", "before aftere covid19 access rate"),
    ("is_pandemic", "engagement_index", "before aftere covid19 engagement rate"),
]

# Each title is set on the axis that was just drawn, so the fourth panel gets its
# own title instead of overwriting the third panel's.
for panel, (flag, metric, panel_title) in zip(ax, bar_panels):
    sns.barplot(data=df, x=flag, y=metric, ax=panel)
    panel.set_title(panel_title)
    gc.collect()

plt.tight_layout()

Comparing holidays (weekends) with weekdays, usage is clearly higher on weekdays. Surprisingly, comparing the distributions before and after the start of the pandemic shows that there was less access before the pandemic. This can also be seen in the time-series graphs above, and is due to the fact that usage was originally large in the city category. 

# Search for similar states by sub 
The similarity is calculated from a count table of the states and the subs used in them: each state is a row of sub counts, and states are compared by the cosine similarity of their rows. That is, states that belong to similar classification categories can be grouped together. We also look at aggregates over the states that belong to similar categories. 

In [None]:
%%time

'''
columns: state 
index: state 

used cosine calculate.
'''

state_sub_count_df = pd.crosstab(df.state, df["sub"])
s = MinMaxScaler(feature_range=(0.0, 1.0))
s_df = s.fit_transform(state_sub_count_df)

df_sparse = sp.sparse.csr_matrix(s_df)
df_sparse = cosine_similarity(df_sparse)
df_sparse = pd.DataFrame(df_sparse, columns=state_sub_count_df.index, index=state_sub_count_df.index)
del state_sub_count_df, s_df
gc.collect()
df_sparse.head()

In [None]:
# Maps full US state / territory names to two-letter codes. show_area() uses it
# to turn the state names into the `locations` passed to the choropleth.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District Of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}



def find_similar_state(state_name: str, n: int=10, similarity=None):
    """Return the ``n`` states most similar to ``state_name``.

    Parameters
    ----------
    state_name : str
        Label of the query state; must be a row and column label of the table.
    n : int
        Number of neighbours to return.
    similarity : pandas.DataFrame, optional
        Square state-by-state similarity table. Defaults to the module-level
        ``df_sparse``.

    Returns
    -------
    pandas.DataFrame
        One column, "similar", indexed by state and sorted from most to least
        similar. The query state is removed by label, so it cannot appear in the
        result even when another state ties with it.

    Raises
    ------
    KeyError
        If ``state_name`` is not in the similarity table.
    """
    if similarity is None:
        similarity = df_sparse
    scores = similarity[state_name].drop(labels=state_name)
    return scores.sort_values(ascending=False).head(n).to_frame(name="similar")


def show_area(similar_state: pd.DataFrame, state_name: str):
    """Show a US choropleth of the states similar to ``state_name``.

    Parameters
    ----------
    similar_state : pandas.DataFrame
        Frame indexed by state name with a "similar" score column. It is not
        modified; the extra columns are added to a copy.
    state_name : str
        Query state, used only in the figure title.
    """
    # Work on a copy so no "state" column is added to the caller's frame.
    plot_df = similar_state.copy()
    plot_df["state"] = plot_df.index
    plot_df = plot_df.reset_index(drop=True)
    # Names missing from the mapping are left unchanged by replace().
    plot_df["state_abbver"] = plot_df.state.replace(us_state_abbrev)

    fig = go.Figure()
    layout = dict(
        title_text = f"Search for similar ({state_name}) state top 10k",
        geo_scope='usa',
    )

    fig.add_trace(
        go.Choropleth(
            locations=plot_df.state_abbver,
            zmax=1,
            z = plot_df.similar,
            locationmode = 'USA-states', 
            marker_line_color='white',
            geo='geo',
            colorscale=px.colors.sequential.Teal, 
        )
    )

    fig.update_layout(layout)   
    fig.show()
    
    
def show_count_bar(similar_state: pd.DataFrame):
    """Plot main-category shares and the five most frequent subs for the given states.

    Parameters
    ----------
    similar_state : pandas.DataFrame
        Frame whose index holds the state names to aggregate over.

    Counts are taken with ``Series.value_counts``, which sorts by count, so the
    bar chart shows the five most frequent subs. Sorting the counted frame by
    "sub" would instead order by sub label, and counting a one-column frame
    would give tuple labels in the pie legend.
    """
    state = similar_state.index.to_list()
    # Select the matching rows once for both panels.
    rows = df.loc[df.state.isin(state), ["main", "sub"]]

    fig, axes = plt.subplots(1, 2, figsize=(22, 6))
    ax = axes.ravel()
    main = rows["main"].value_counts()
    ax[0].pie(x=main.values)
    ax[0].legend(main.index)
    # Top five by count, then ascending so barh lists the largest at the top.
    sub = rows["sub"].value_counts().head(5).sort_values(ascending=True).to_frame()
    sub.plot(kind="barh", ax=ax[1])

    ax[0].set_title("similar for main rate.")
    ax[1].set_title("similar for sub counts.")
    plt.tight_layout()
    del main, sub, rows
    gc.collect()
    
    
def show_transition(similar_state: pd.DataFrame, state_name):
    """Plot the daily mean engagement_index of ``state_name`` and of up to five similar states.

    The first panel of the 2x3 grid shows the query state; the following panels
    show the first five states of ``similar_state``'s index, in order.
    """
    neighbours = similar_state.index.to_list()[:5]

    fig, axes = plt.subplots(2, 3, figsize=(22, 12))
    ax = axes.ravel()

    # (state to plot, panel title) pairs: the query state first, then its neighbours.
    panels = [(state_name, "current state transition.")]
    panels += [(s, f"similaer state is {s}.") for s in neighbours]

    for panel, (s, panel_title) in zip(ax, panels):
        x = df.loc[df.state == s, ["time", "engagement_index"]].groupby("time").mean()
        x.plot(ax=panel)
        panel.set_title(panel_title)

    plt.tight_layout()
    del x
    gc.collect()
    
    
def search_for_similar_plot(state_name: str, n: int=10):
    """Find the ``n`` states most similar to ``state_name`` and show every report for them.

    In order: the choropleth map, the main/sub count charts, the engagement
    time series, and finally the similarity table itself.
    """
    neighbours = find_similar_state(state_name, n)
    show_area(neighbours, state_name)
    show_count_bar(neighbours)
    show_transition(neighbours, state_name)
    display(neighbours)
    gc.collect()
    
    

In [None]:
%%time 

# Similar-state report for Wisconsin, using the default of 10 neighbours.
search_for_similar_plot("Wisconsin")

In [None]:
# Similar-state report for North Carolina, using the default of 10 neighbours.
search_for_similar_plot('North Carolina')

The engagement transitions of states with similar sub distributions cannot be said to be very similar to each other.
Rather, the transition seems to depend on the locale. 

# Predict tomorrow's engagement_index with an LSTM model.

In [None]:
%%time 

x = df.groupby("time")["engagement_index", "pct_access"].sum()
x = x.rename(columns={"engagement_index": "engagement_index_lag_1", "pct_access": "pct_access_lag_1"})
x["engagement_index"] = x.engagement_index_lag_1.shift(-1)

for col in ["engagement_index_lag_1", "pct_access_lag_1"]:
    x[col.split("_")[0]+"_lag_2"] = x[col].shift(1).fillna(0)
    x[col.split("_")[0]+"_lag_3"] = x[col].shift(2).fillna(0)
    x[col.split("_")[0]+"_lag_30"] = x[col].shift(30).fillna(0)
    x[col.split("_")[0]+"_rolling7"] = x[col].rolling(window=7).mean().fillna(0).reset_index(drop=True)
    x[col.split("_")[0]+"_rolling30"] = x[col].rolling(window=30).mean().fillna(0).reset_index(drop=True)
x = x.fillna(0)
x.head()

In [None]:
class DigitalDataset(Dataset):
    """Sliding-window train / validation / test samples built from a feature table.

    Row split (by position): the final ``n_span`` rows form the single test
    window, the ``n_span`` rows before the final row are the validation rows,
    and all earlier rows are training rows. Features are every column except
    "engagement_index", scaled by a scaler fitted on the training rows only.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per time step, containing the "engagement_index" target column.
    n_span : int
        Number of consecutive rows per input window (default 30).

    Attributes
    ----------
    train, val : list of dict
        Each item is {"inputs": float tensor (n_span, n_features),
        "target": float tensor (1,)}.
    test : list of dict
        A single item {"inputs": float tensor (n_span, n_features)}.

    Raises
    ------
    ValueError
        If ``n_span`` is not positive or ``df`` has fewer than ``2 * n_span + 1``
        rows (validation windows could not be filled to full length).
    """
    def __init__(self, df, n_span=30):
        if n_span < 1:
            raise ValueError("n_span must be a positive integer")
        if df.shape[0] < 2 * n_span + 1:
            raise ValueError(
                f"need at least {2 * n_span + 1} rows for n_span={n_span}, got {df.shape[0]}"
            )
        self.n_span = n_span

        train = df.iloc[:-(n_span + 1), :]
        val = df.iloc[-(n_span + 1):-1, :]
        test = df.iloc[-n_span:, :]
        self.train = []
        self.val = []
        self.test = []

        x_train, x_val, x_test = (part.drop("engagement_index", axis=1) for part in (train, val, test))
        y_train = train[["engagement_index"]].to_numpy()
        y_val = val[["engagement_index"]].to_numpy()

        x_train, x_val, x_test = self._scaler(x_train, x_val, x_test)

        # A training window covers rows i .. i+n_span-1 and is paired with the
        # target stored on row i+n_span.
        # NOTE(review): that is one row after the window's last row; confirm the
        # gap is intended.
        for i in range(x_train.shape[0] - n_span):
            self.train.append(self._sample(x_train[i:i + n_span], y_train[i + n_span]))

        # Validation window i ends just before validation row i: it is the tail
        # of the training rows followed by the first i validation rows.
        for i in range(n_span):
            window = np.concatenate([x_train[i - n_span:, :], x_val[:i, :]])
            self.val.append(self._sample(window, y_val[i]))

        self.test.append({"inputs": torch.FloatTensor(x_test)})

    @staticmethod
    def _sample(window, target):
        """Pack one feature window and its target into the dict layout used by the loaders."""
        return {
            "inputs": torch.FloatTensor(window),
            "target": torch.tensor(target, dtype=torch.float),
        }

    def _scaler(self, tr, va, te):
        """Fit a RobustScaler on ``tr`` and return the scaled (tr, va, te) arrays."""
        rs = RobustScaler()
        return rs.fit_transform(tr), rs.transform(va), rs.transform(te)
    

# Model hyper-parameters; DigitalModel reads both as its constructor defaults.
params = {
    # width of the LSTM hidden state
    "hidden_dim": 128, 
    # number of features per time step fed to the LSTM
    "input_size": 12, 
}


# Training configuration read by the training and prediction helpers.
config = {
    # batches are moved to this device: the first CUDA GPU when available, else the CPU
    "device": "cuda:0" if torch.cuda.is_available() else "cpu", 
    "batch_size": 12, 
    # number of epochs run by fit() when debug is False
    "epoch": 1000, 
    # learning rate passed to the Adam optimizer
    "lr": 0.001
    
}
    
class DigitalModel(nn.Module):
    """LSTM encoder followed by a small MLP head that outputs one value per sequence.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_dim : int
        Size of the LSTM hidden state; the head narrows it to half, then to 1.
    """

    def __init__(self, input_size=params["input_size"], hidden_dim=params["hidden_dim"]):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_size, hidden_dim, batch_first=True)
        half_dim = hidden_dim // 2
        head_layers = [
            nn.Linear(hidden_dim, half_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(half_dim, 1),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map a (batch, seq, features) tensor to a (batch, 1) prediction."""
        sequence_out, _ = self.lstm(x)
        # Only the output at the final time step feeds the head.
        last_step = sequence_out[:, -1, :].reshape(-1, self.hidden_dim)
        return self.fc(last_step)
    

In [None]:
# Smoke test: build the dataset once and report the shapes of one sample, then
# push a random batch through an untrained model to check the output shape.
a = DigitalDataset(x)
first_train, first_val = a.train[0], a.val[0]
print(f"input shape: {first_train['inputs'].size()}")
print(f"target shape: {first_val['target'].size()}")

net = DigitalModel()
a = torch.rand(2, 30, 12)
y = net(a)
print(f"output shape: {y.size()}")

In [None]:
def train_fn(dl, model, criterion, optimizer, is_train=True):
    """Run one pass over ``dl`` and return the mean of the per-batch losses.

    With ``is_train`` true the model is put in training mode and the optimizer
    takes one step per batch. Otherwise the model is put in eval mode, the loss
    is computed without gradients and ``optimizer`` is not used.
    """
    def batch_loss(inputs, targets):
        # Flatten both sides so the criterion compares one value per sample.
        return criterion(model(inputs).view(-1), targets.view(-1))

    if is_train:
        model.train()
    else:
        model.eval()

    losses = []
    for batch in tqdm(dl):
        inputs = batch["inputs"].to(config["device"])
        targets = batch["target"].to(config["device"])

        if not is_train:
            with torch.no_grad():
                loss = batch_loss(inputs, targets)
        else:
            loss = batch_loss(inputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        losses.append(loss.item())
        del inputs, targets
    return np.mean(np.array(losses))

def val_fn(dl, model):
    """Return a flat list holding one prediction per sample of ``dl``.

    The model is switched to eval mode first so dropout is inactive during
    prediction. Each batch output is flattened with ``reshape(-1)``, which also
    handles a batch of one sample (``squeeze()`` gives a 0-d array there, which
    cannot be iterated).
    """
    model.eval()
    pred = []
    with torch.no_grad():
        for d in tqdm(dl):
            x = d["inputs"].to(config["device"])

            y = model(x).detach().cpu().numpy().reshape(-1)
            pred.extend(y)
            del x
    return pred

def test_fn(dl, model):
    """Return a list with the squeezed model output of every batch of ``dl``.

    The model is switched to eval mode first so dropout is inactive during
    prediction. Batches only need an "inputs" entry.
    """
    model.eval()
    pred = []
    with torch.no_grad():
        for d in tqdm(dl):
            x = d["inputs"].to(config["device"])

            y = model(x).detach().cpu().numpy()
            # One squeeze is enough to drop all size-1 dimensions.
            pred.append(y.squeeze())
    return pred


def mae(pred, corr):
    """Mean absolute error between predictions and ground truth.

    Both arguments may be lists or arrays. They are converted to flat float
    arrays first, so two plain lists work and an (N, 1) array compared with an
    (N,) array is matched element by element rather than broadcast to (N, N).
    """
    pred = np.asarray(pred, dtype=float).ravel()
    corr = np.asarray(corr, dtype=float).ravel()
    return np.mean(np.abs(pred - corr))


In [None]:
def fit(train_dl, val_dl, debug=True):
    """Train a fresh DigitalModel and return it with the best validation weights loaded.

    Parameters
    ----------
    train_dl, val_dl : iterable of batches
        Batches with "inputs" and "target" entries, as consumed by train_fn.
    debug : bool
        When true only a single epoch is run; otherwise ``config["epoch"]`` epochs.

    Returns
    -------
    DigitalModel
        The trained model, on ``config["device"]``. If some epoch improved the
        validation loss, the weights of the best such epoch are restored;
        otherwise the weights from the last epoch are kept.
    """
    import copy  # local import: only needed to snapshot the best weights

    # The batches are moved to config["device"], so the model must live there too.
    model = DigitalModel().to(config["device"])
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])

    best_state, best_loss = None, np.inf

    for e in range(1 if debug else config["epoch"]):
        ts = time.time()
        loss_tr = train_fn(train_dl, model, criterion, optimizer)
        val_tr = train_fn(val_dl, model, criterion, None, False)

        if best_loss > val_tr:
            # Copy the weights: keeping a reference to `model` would just follow
            # later epochs and always end up as the final model.
            best_state = copy.deepcopy(model.state_dict())
            best_loss = val_tr
        now = time.time()
        print(f"epoch: {e+1} | tr loss: {loss_tr:.3f} | va loss: {val_tr:.3f} | dilation {now-ts}s | ")
    print(f"best val loss: {best_loss:.3f}")
    if best_state is not None:
        model.load_state_dict(best_state)
    gc.collect()
    return model

def predict(dl, model, is_test=False):
    """Run ``model`` over ``dl`` with test_fn when ``is_test`` is true, else with val_fn."""
    runner = test_fn if is_test else val_fn
    return runner(dl, model)


In [None]:
def main(df, debug):
    """Build the loaders from ``df``, train the model, and print the validation MAE and the forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with an "engagement_index" target column.
    debug : bool
        Forwarded to fit(); a true value limits training to one epoch.
    """
    dataset = DigitalDataset(df)

    def make_loader(samples, batch_size):
        # Order is preserved and no partial batch is dropped.
        return DataLoader(samples, batch_size=batch_size, shuffle=False, drop_last=False)

    train_dl = make_loader(dataset.train, config["batch_size"])
    val_dl = make_loader(dataset.val, config["batch_size"])
    test_dl = make_loader(dataset.test, 1)

    model = fit(train_dl, val_dl, debug)
    predv = predict(val_dl, model, False)
    predt = predict(test_dl, model, True)

    rule = "==================================================================================="
    print(rule)
    # Ground truth for validation: the 30 rows that precede the final row.
    truth = df.iloc[-31: -1, :]['engagement_index'].values.ravel()
    print(f"validation mae: {mae(predv, truth)}")
    print(rule)

    print(f"Expected to be {predt[0]} tomorrow ")
    print(rule)


if __name__ == "__main__":
    main(x, False)