Credits to [Alexander Ryzhkov](https://www.kaggle.com/alexryzhkov) for his [great LightAutoML notebook](https://www.kaggle.com/alexryzhkov/tps-lightautoml-baseline-with-pseudolabels)!

# **L**ong **S**hort **T**erm **M**emory

In this notebook, following an exploratory data analysis, I show how to fit an LSTM.

<img height="700" width="700" src="https://www.researchgate.net/profile/Juan_Victores/publication/334360853/figure/fig1/AS:778955447599106@1562728859405/The-LSTM-cell-internals.png">

<p style="font-size : 12px"><em>Image from: <a href="https://www.researchgate.net/profile/Juan_Victores/publication/334360853/figure/fig1/AS:778955447599106@1562728859405/The-LSTM-cell-internals.png">here</a></em></p>

In [None]:
import numpy as np
import pandas as pd 
!pip install openpyxl 
from pandas_profiling import ProfileReport as PR
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

import seaborn as sns

from statsmodels.tsa.seasonal import seasonal_decompose

import tensorflow.keras as ks
from tensorflow.keras import optimizers
import tensorflow as tf

from functools import partial
from scipy.special import erfinv

import datetime

import warnings
warnings.filterwarnings("ignore")

Reading the data:

In [None]:
# Competition tables: training data, test data and the submission template.
train = pd.read_csv("../input/tabular-playground-series-jul-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-jul-2021/test.csv")
samSub = pd.read_csv("../input/tabular-playground-series-jul-2021/sample_submission.csv")

# Every training column whose name contains "target".
targetCols = [name for name in train.columns if "target" in name]

# Predictions of an earlier submission, used further down as pseudo-labels.
pslb = pd.read_csv("../input/bestsubtps202106/lightautoml_with_pseudolabelling_kernel_version_15.csv")

# External air-quality sheet: keep the rows from position 7110 onwards and
# rename its measurement columns to the competition's target names.
pslbLEAK = pd.read_excel('../input/air-quality-time-series-data-uci/AirQualityUCI.xlsx', sheet_name=0)
pslbLEAK = pslbLEAK.iloc[7110:].reset_index(drop=True)
pslbLEAK.rename(
    columns={
        'CO(GT)': 'target_carbon_monoxide',
        'C6H6(GT)': 'target_benzene',
        'NOx(GT)': 'target_nitrogen_oxides',
    },
    inplace=True,
)

In [None]:
# For every target, take the value from the external sheet where it is
# non-negative and keep the pseudo-label prediction otherwise.
# np.where works element by element on the two columns, so pslbLEAK and pslb
# must have the same number of rows in the same order.
# NOTE(review): negative values are treated as "not available" here --
# presumably the external data set's missing-value marker; confirm.
for t in targetCols:
    pslb[t] = np.where(pslbLEAK[t] >= 0, pslbLEAK[t], pslb[t])

In [None]:
# Parse the timestamp column of each table into datetime64 values.
# pd.to_datetime is used instead of astype("datetime64") because the unit-less
# "datetime64" dtype string is rejected by pandas >= 2.0, while to_datetime
# behaves the same on old and new pandas versions.
train["date_time"] = pd.to_datetime(train["date_time"])
test["date_time"] = pd.to_datetime(test["date_time"])
pslb["date_time"] = pd.to_datetime(pslb["date_time"])

# Profiling

We start by looking at the data set:

In [None]:
# Build a pandas-profiling ProfileReport for the training table
# (dark theme, progress bar disabled).
profile = PR(train, title="TPS Train Dataset Report", dark_mode=True, progress_bar=False)

In [None]:
# Show the report inside the notebook through the report's widget interface.
profile.to_widgets()

# Visual Inspection

The axes of the parallel-coordinates plots can be dragged to the left or right by their labels. To select a range of values, hold the left mouse button and drag along one of the vertical axes.

In [None]:
# Draw the 20% sample once. With a fixed random_state every repeated
# train.sample(...) call returns the same rows, so sampling per axis only
# repeated the same work.
parSample = train.sample(frac=0.2, random_state=1)

# (axis label, column name) pairs in display order.
parAxes = [
    ("deg C", "deg_C"),
    ("rel. humidity", "relative_humidity"),
    ("abs. humidity", "absolute_humidity"),
    ("sensor 1", "sensor_1"),
    ("sensor 2", "sensor_2"),
    ("sensor 3", "sensor_3"),
    ("sensor 4.", "sensor_4"),
]

# Parallel-coordinates plot of the sampled rows, coloured by the CO target.
parCo = go.Figure(
    data=go.Parcoords(
        line=dict(
            color=parSample["target_carbon_monoxide"],
            colorscale="Tealrose",
            showscale=True,
            reversescale=True,
        ),
        dimensions=[
            dict(label=axisLabel, values=parSample[column])
            for axisLabel, column in parAxes
        ],
    )
)

parCo.update_layout(
    title="Colored by target carbon monoxide"
)

parCo.show()

In [None]:
# Draw the 20% sample once. With a fixed random_state every repeated
# train.sample(...) call returns the same rows, so sampling per axis only
# repeated the same work.
parSample = train.sample(frac=0.2, random_state=1)

# (axis label, column name) pairs in display order.
parAxes = [
    ("deg C", "deg_C"),
    ("rel. humidity", "relative_humidity"),
    ("abs. humidity", "absolute_humidity"),
    ("sensor 1", "sensor_1"),
    ("sensor 2", "sensor_2"),
    ("sensor 3", "sensor_3"),
    ("sensor 4.", "sensor_4"),
]

# Parallel-coordinates plot of the sampled rows, coloured by the benzene target.
parCo = go.Figure(
    data=go.Parcoords(
        line=dict(
            color=parSample["target_benzene"],
            colorscale="Tealrose",
            showscale=True,
            reversescale=True,
        ),
        dimensions=[
            dict(label=axisLabel, values=parSample[column])
            for axisLabel, column in parAxes
        ],
    )
)

parCo.update_layout(
    title="Colored by target benzene"
)

parCo.show()

In [None]:
# Draw the 20% sample once. With a fixed random_state every repeated
# train.sample(...) call returns the same rows, so sampling per axis only
# repeated the same work.
parSample = train.sample(frac=0.2, random_state=1)

# (axis label, column name) pairs in display order.
parAxes = [
    ("deg C", "deg_C"),
    ("rel. humidity", "relative_humidity"),
    ("abs. humidity", "absolute_humidity"),
    ("sensor 1", "sensor_1"),
    ("sensor 2", "sensor_2"),
    ("sensor 3", "sensor_3"),
    ("sensor 4.", "sensor_4"),
]

# Parallel-coordinates plot of the sampled rows, coloured by the NOx target.
parCo = go.Figure(
    data=go.Parcoords(
        line=dict(
            color=parSample["target_nitrogen_oxides"],
            colorscale="Tealrose",
            showscale=True,
            reversescale=True,
        ),
        dimensions=[
            dict(label=axisLabel, values=parSample[column])
            for axisLabel, column in parAxes
        ],
    )
)

parCo.update_layout(
    title="Colored by target nitrogen oxides"
)

parCo.show()

### Pairplot

In [None]:
# Use seaborn's "ticks" theme for the plot below.
sns.set_theme(style="ticks")

# Pair plot on a random 20% sample of the training rows. No random_state is
# given, so the sample (and the picture) changes between runs. iloc[:, 1:]
# leaves out the first column. corner=True draws only the lower triangle;
# off-diagonal panels are 2-D histograms, diagonal panels are KDE curves.
sns.pairplot(
    train.sample(frac=0.2).iloc[:,1:], 
    corner=True, 
    kind="hist", 
    diag_kind="kde",
    aspect=1,
    height=2.5
    )

In [None]:
def timeViz(DF, DFName : str, target : str, perc=0.2):
    """Scatter the columns at positions 1-8 of ``DF`` against ``date_time``.

    Each row is kept with probability ``perc`` (unseeded uniform draw), the
    kept rows are drawn on a 4x2 grid of subplots (one per column), and the
    markers are coloured by ``target``; the target value also appears in the
    hover text. The figure is shown, nothing is returned.

    Parameters
    ----------
    DF : DataFrame with a ``date_time`` column and the ``target`` column.
    DFName : label used at the start of the figure title.
    target : name of the column used for colouring.
    perc : probability of keeping a row.
    """
    keep = np.random.uniform(size=DF.shape[0]) < perc
    vizDf = DF.loc[keep]
    features = DF.columns[1:9]

    fig = make_subplots(rows=4, cols=2, subplot_titles=(features))

    # Identical for every subplot, so prepared once.
    target_values = vizDf[f"{target}"]
    hover_text = [f"{value}" for value in target_values]
    hover_layout = '<br>%{x}<br>%{y}<br><b>target</b>:%{text}'

    for position, feature in enumerate(features):
        # Fill the grid row by row: 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), ...
        grid_row, grid_col = divmod(position, 2)
        trace = go.Scatter(
            x=vizDf.date_time,
            y=vizDf[f"{feature}"],
            mode='markers',
            showlegend=False,
            hovertemplate=hover_layout,
            text=hover_text,
            name='Time Seties',
            marker=dict(
                size=6,
                color=target_values,
                colorscale='Tealrose',
                showscale=True,
            ),
        )
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(
        height=800,
        width=800,
        title_text=DFName + "-Features Colored by " + target,
        template="simple_white",
    )
    fig.show()

### Time Series Visualization

In [None]:
# Training features over time, coloured by the carbon-monoxide target.
timeViz(train, "train", "target_carbon_monoxide")

In [None]:
# Training features over time, coloured by the benzene target.
timeViz(train, "train", "target_benzene")

In [None]:
# Training features over time, coloured by the nitrogen-oxides target.
timeViz(train, "train", "target_nitrogen_oxides")

In [None]:
def SaisonalComponents(DF1, DF2, periods):
    """Stack two frames and attach an additive seasonal component per column.

    The first nine columns of ``DF1`` and ``DF2`` are concatenated row-wise
    (original index labels are kept). For every column except the first, an
    additive ``seasonal_decompose`` with period ``periods`` is run on the
    stacked series and its seasonal part is stored as ``Season_<column>``;
    missing values in that new column are replaced by the column mean.

    Returns
    -------
    (stacked frame without seasonal columns, stacked frame with them)
    """
    stacked = pd.concat([DF1.iloc[:, :9], DF2.iloc[:, :9]])
    plain = stacked.copy()

    # The column list is fixed before the loop, so the Season_* columns
    # added below are not themselves decomposed.
    for name in stacked.columns[1:]:
        season_name = f"Season_{name}"
        decomposition = seasonal_decompose(stacked[f"{name}"], model='additive', period=periods)
        stacked[season_name] = decomposition.seasonal
        stacked[season_name] = stacked[season_name].fillna(stacked[season_name].mean())

    return plain, stacked

In [None]:
# gap is used here as the period of the seasonal decomposition and again
# further down as the number of hours skipped before the validation window.
gap=24
# Orig: stacked train+test features; SeasAdj: the same plus Season_* columns.
Orig, SeasAdj = SaisonalComponents(train, test, gap)

In [None]:
def timeVizSeason(DF, DFName : str, addText : str, perc=0.2):
    """Draw the columns at positions 1-8 of ``DF`` as black lines over time.

    Each row is kept with probability ``perc`` (unseeded uniform draw) and
    the kept rows are drawn on a 4x2 grid of subplots, one per column. A red
    dashed vertical line at 2011-01-01 00:00:00 is added to every populated
    subplot. The figure is shown, nothing is returned.

    NOTE(review): the marked date is presumably the train/test boundary --
    confirm.

    Parameters
    ----------
    DF : DataFrame with a ``date_time`` column.
    DFName : label used at the start of the figure title.
    addText : text appended to the figure title.
    perc : probability of keeping a row.
    """
    keep = np.random.uniform(size=DF.shape[0]) < perc
    vizDf = DF.loc[keep]
    features = DF.columns[1:9]

    fig = make_subplots(rows=4, cols=2, subplot_titles=(features))

    for position, feature in enumerate(features):
        # Fill the grid row by row: 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), ...
        grid_row, grid_col = divmod(position, 2)
        fig.add_trace(
            go.Scatter(
                x=vizDf.date_time,
                y=vizDf[f"{feature}"],
                showlegend=False,
                mode="lines",
                hovertemplate='<br>%{x}<br>%{y}',
                name='Time Series',
                line_color="black",
            ),
            row=grid_row + 1,
            col=grid_col + 1,
        )

    # add_vline without row/col targets every subplot that already holds a
    # trace. Calling it once after all traces exist gives each subplot
    # exactly one marker instead of stacking a new copy on every iteration.
    fig.add_vline(x="2011-01-01 00:00:00", line_width=3, line_dash="dash", line_color="red")

    fig.update_layout(
        height=800,
        width=800,
        title_text=DFName + "-Features " + addText,
        template="simple_white",
    )
    fig.show()

# Seasonal Components

In [None]:
# Plot the undecomposed, stacked train+test series.
timeVizSeason(Orig, "train+test", "Season + Trend + Residual")

In [None]:
# Column 0 followed by columns 9-16, i.e. the first column of SeasAdj plus
# the eight columns that SaisonalComponents appended after the original nine.
cols = [0, *range(9, 17)]

timeVizSeason(SeasAdj.iloc[:, cols], "train+test", "Seasonal Component")

In [None]:
def FeatEng(DF, Vars, lags=(6,)):
    """Return a copy of ``DF`` extended with calendar and rolling features.

    Added columns:

    * ``weekday``, ``hour``, ``Elapsed`` -- sine transforms of the weekday,
      the hour of day and the day of year of ``date_time``.
    * ``working_hours`` -- 1 when the hour is between 7 and 21 inclusive,
      else 0.
    * ``SMC`` -- ``log1p(absolute_humidity * 100) - log1p(relative_humidity)``.
    * ``mean<v>L<l>``, ``max<v>L<l>``, ``min<v>L<l>`` -- centred rolling
      mean / max / min of column ``v`` over ``l`` rows, for every ``v`` in
      ``Vars`` and ``l`` in ``lags``. Positions where the window is
      incomplete are filled with the column mean, mean + std and
      mean - std respectively.

    Rows that still contain a missing value afterwards are dropped. The
    input frame is not modified.

    Parameters
    ----------
    DF : DataFrame with ``date_time`` (datetime), ``absolute_humidity`` and
        ``relative_humidity`` columns plus every column named in ``Vars``.
    Vars : iterable of column names to build rolling features for.
    lags : iterable of window sizes. The default is a tuple rather than a
        list so that the default object cannot be mutated between calls.

    Returns
    -------
    DataFrame with the additional columns.
    """
    DF = DF.copy()

    stamp = DF.date_time.dt
    DF["weekday"] = np.sin(stamp.weekday / 7 * np.pi/2)
    DF["hour"] = np.sin(stamp.hour / 24 * np.pi)
    DF["working_hours"] = stamp.hour.isin(list(range(7, 22))).astype("int")
    DF["SMC"] = np.log1p(DF["absolute_humidity"] * 100) - np.log1p(DF["relative_humidity"])
    DF["Elapsed"] = np.sin(stamp.day_of_year / 365 * np.pi)

    for l in lags:
        for v in Vars:
            series = DF[f"{v}"]
            # Column statistics are computed once and reused as fill values
            # for the window positions that have no complete window.
            m = series.mean()
            s = series.std()
            window = series.rolling(window=l, center=True)

            DF[f"mean{v}L{l}"] = window.mean().fillna(m)
            DF[f"max{v}L{l}"] = window.max().fillna(m + s)
            DF[f"min{v}L{l}"] = window.min().fillna(m - s)

    DF.dropna(inplace=True)

    return DF

In [None]:
%%time
# Build the engineered features on the stacked train+test frame; rolling
# features are created for the test columns from position 4 onwards.
# NOTE(review): presumably these are the sensor columns -- confirm against
# the column order of test.csv.
trainTest = FeatEng(Orig, test.columns[4:])

In [None]:
# Keep the targets aside; they are re-attached to the rebuilt training frame.
ys=train[targetCols]

# Number of training rows = position at which the stacked frames are split.
length=train.shape[0]

# Training part: engineered features plus the SeasAdj columns from position
# 12 onwards, joined column-wise (aligned on the index), then the targets.
train = pd.concat([trainTest.iloc[:length,:], SeasAdj.iloc[:length,12:]], axis=1)
train = pd.concat([train, ys], axis=1)
# Test part: same feature layout; the pseudo-label table is joined on the
# timestamp, which gives the test rows target columns as well.
test = pd.concat([trainTest.iloc[length:,:], SeasAdj.iloc[length:,12:]], axis=1)
test = test.merge(right=pslb, on = "date_time")

### Correlations

In [None]:
# Threshold used for the optional contour overlay and its button label.
p=0.8

# Correlate numeric/boolean columns only. The frame also holds the datetime
# column, and pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr(), so the selection is made explicitly.
correlations = train.select_dtypes(include=["number", "bool"]).corr().abs()

# Keep the strictly lower triangle; the diagonal and the mirrored upper half
# become NaN and are therefore not drawn.
lowerTriangle = np.tril(np.ones(correlations.shape, dtype=bool), k=-1)
correlations = correlations.where(lowerTriangle)

cor = px.imshow(
    correlations,
    color_continuous_scale='Tealrose'
)

# Second trace: contour overlay starting at the threshold p, hidden until
# the matching button is pressed.
cor.add_trace(
    go.Contour(
        z=correlations,
        showscale=False,
        contours=dict(
            start=p,
            end=1,
            size=100,
            coloring='lines',
            operation="="
        ),
        line_width=2,
        visible=False
    )
)

# Two buttons toggle the visibility of [heatmap, contour].
cor.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=[
                dict(
                    args=[{"visible": [True, False]}],
                    label="Clean",
                    method="update"
                ),
                dict(
                    args=[{"visible": [True, True]}],
                    # Derived from p so the label cannot drift from the threshold.
                    label=f"Mark >= {p}",
                    method="update"
                )
            ]
        )
    ],
    height=900, width=900,
    title="Absolute Correlations",
    template="simple_white"
)


cor.show()

### Normalize

In [None]:
def Normalization(DF1, DF2, cols):
    """Standardise ``cols`` of ``DF1`` with statistics pooled over both frames.

    For every column in ``cols`` the mean and standard deviation are taken
    over the rows of ``DF1`` and ``DF2`` together, and the ``DF1`` values are
    replaced by ``(value - mean) / std``. Neither input frame is modified.

    Returns
    -------
    A copy of ``DF1`` with the listed columns standardised.
    """
    scaled = DF1.copy()
    pooled = pd.concat([scaled[cols], DF2[cols]])

    for name in cols:
        key = f"{name}"
        center = pooled[key].mean()
        spread = pooled[key].std()
        scaled[key] = (scaled[key] - center) / spread

    return scaled

In [None]:
def rg(DF1, DF2, e, Vars):
    """Rank-based Gaussian transform of ``Vars``, fitted on both frames jointly.

    For each column the values of ``DF1`` and ``DF2`` are ranked together,
    the ranks are rescaled to ``(rank / max_rank - 0.5) * 2``, clipped to
    ``[-1 + e, 1 - e]`` and mapped through ``erfinv(.) * sqrt(2)``. The first
    ``len(DF1)`` results go back into a copy of ``DF1``, the rest into a copy
    of ``DF2``. The input frames are not modified.

    Parameters
    ----------
    DF1, DF2 : DataFrames that both contain the columns in ``Vars``.
    e : margin kept away from -1 and 1 so that ``erfinv`` stays finite.
    Vars : list of column names to transform.

    Returns
    -------
    (transformed copy of DF1, transformed copy of DF2)
    """
    first = DF1.copy()
    second = DF2.copy()
    split = first.shape[0]

    pooled = pd.concat([first[Vars], second[Vars]])

    for name in Vars:
        ranks = pooled[name].rank()
        scaled = (ranks / ranks.max() - 0.5) * 2
        scaled = np.clip(scaled, a_min=-1 + e, a_max=1 - e)
        # Plain array, so the split below is by row position.
        gauss = np.asarray(erfinv(scaled) * 2**0.5)
        first[name] = gauss[:split]
        second[name] = gauss[split:]

    return first, second

In [None]:
# Rank-Gauss transform every non-target column except the first one,
# fitted on train and test jointly.
train, test = rg(train, test, 0.000001, [i for i in train.columns if "target" not in i][1:])

In [None]:
# Append the test rows (whose target columns come from the merged
# pseudo-label table) to the training rows and renumber the index.
train = pd.concat([train, test]).reset_index(drop=True)

In [None]:
#train = Normalization(train, test, [i for i in train.columns if "target" not in i][1:])
#test = Normalization(test, train, [i for i in test.columns if "target" not in i][1:])

In [None]:
# Rows before 2010-12-31 12:00 form the training part; the validation part
# starts `gap` hours after that instant, leaving the rows in between unused.
tr = train[train.date_time < "2010-12-31 12:00:00"]
val = train[train.date_time > datetime.datetime(2010, 12, 31, 12, 0, 0) + datetime.timedelta(hours=gap)]

print(f"tr shape: {tr.shape}")
print(f"val shape: {val.shape}")

# Training and Testing

In [None]:
# Drop the first 8 rows.
# NOTE(review): presumably so that the row count is a multiple of 10, which
# the reshapes below require -- confirm.
train = train.iloc[8:,:]
targetCols=[i for i in train.columns if "target" in i]

# Targets as blocks of 10 consecutive rows.
y = train[targetCols].values
y = y.reshape((-1, 10, y.shape[1]))# samples, timesteps, features

# From here on targetCols also contains "date_time", so it can be used as
# the list of columns to drop; later cells use targetCols[:-1] to get the
# pure target names again. Re-running this cell appends "date_time" again.
targetCols.append("date_time")
train = train.drop(columns=targetCols).to_numpy()
train = train.reshape((-1, 10, train.shape[1]))# samples, timesteps, features
train.shape

In [None]:
# Three padding rows (hourly timestamps, every other column zero) appended
# to the test frame before it is cut into blocks of 10 rows; the matching
# three predictions are removed again after model.predict.
apDat = {"date_time": [pd.Timestamp('2011-04-04 15:00:00'), pd.Timestamp('2011-04-04 16:00:00'), pd.Timestamp('2011-04-04 17:00:00')]}
apDat.update({c: [0,0,0] for c in test.columns[1:]})

append = pd.DataFrame(apDat)
test = pd.concat([test, append])
# targetCols must already include "date_time" here (it is appended in the
# cell that builds the training tensors), so targets and timestamp are dropped.
test = test.drop(columns=targetCols).reset_index(drop=True).to_numpy()
test = test.reshape((-1, 10, test.shape[1]))# samples, timesteps, features
test.shape

# Training and Validation

In [None]:
# Training split: drop the first 8 rows, then cut targets and features into
# blocks of 10 consecutive rows.
# NOTE(review): the 8 is presumably chosen so the row count is divisible by
# 10 -- confirm.
tr=tr.iloc[8:,:]
# targetCols[:-1] = target names without the trailing "date_time" entry.
y_tr = tr[targetCols[:-1]].values
y_tr = y_tr.reshape((-1, 10, y_tr.shape[1]))# samples, timesteps, features

tr = tr.drop(columns=targetCols).to_numpy()
tr = tr.reshape((-1, 10, tr.shape[1]))# samples, timesteps, features
tr.shape

In [None]:
# Validation split: drop the first 4 rows, then cut targets and features
# into blocks of 10 consecutive rows.
# NOTE(review): the 4 is presumably chosen so the row count is divisible by
# 10 -- confirm.
val=val.iloc[4:,:]
# targetCols[:-1] = target names without the trailing "date_time" entry.
y_val = val[targetCols[:-1]].values
y_val = y_val.reshape((-1, 10, y_val.shape[1]))# samples, timesteps, features

val = val.drop(columns=targetCols).reset_index(drop=True).to_numpy()
val = val.reshape((-1, 10, val.shape[1]))# samples, timesteps, features
val.shape

In [None]:
def LSTM_Model(seeds=123):
    """Build the stateful two-layer LSTM network and print its summary.

    NumPy and TensorFlow are seeded with ``seeds`` first. The input has a
    fixed batch size of 1 and takes its timestep and feature dimensions from
    the module-level ``train`` array, which therefore must already be the
    3-D array (samples, timesteps, features). Two stateful LSTM layers
    (128 units with tanh, 32 units with sigmoid) that return full sequences
    are followed by a Dense layer with 3 outputs and a PReLU activation.

    Returns
    -------
    The uncompiled ``ks.Model``.
    """
    np.random.seed(seeds)
    tf.random.set_seed(seeds)

    # Settings shared by both recurrent layers.
    recurrent = dict(
        kernel_initializer='LecunUniform',
        return_sequences=True,
        stateful=True,
    )

    inputs = ks.layers.Input(
        batch_input_shape=(1, train.shape[1], train.shape[2]),
        name="input",
    )

    hidden = ks.layers.LSTM(128, activation="tanh", name="L1", **recurrent)(inputs)
    hidden = ks.layers.LSTM(32, activation="sigmoid", name="L2", **recurrent)(hidden)

    outputs = ks.layers.Dense(
        3,
        activation=ks.layers.PReLU(),
        kernel_initializer='LecunUniform',
        name="L_out",
    )(hidden)

    network = ks.Model(inputs=inputs, outputs=outputs)
    network.summary()
    return network

In [None]:
# Callback that halves the learning rate (factor=0.5) when val_loss has not
# improved by at least min_delta for 3 consecutive epochs, never going
# below min_lr.
lrReducer = ks.callbacks.ReduceLROnPlateau(    
    monitor="val_loss",
    factor=0.5,
    patience=3,
    verbose=1,
    mode="auto",
    min_delta=0.0001,
    cooldown=0,
    min_lr=0.000001,
    )

In [None]:
def rmsle(y_pred, y_true):
    """Root mean squared logarithmic error between two tensors.

    Both arguments are cast to float32 and the result is
    ``sqrt(mean((log(y_pred + 1) - log(y_true + 1)) ** 2))`` over all
    elements.

    NOTE(review): Keras invokes a loss as ``fn(y_true, y_pred)``, so when
    this function is passed to ``model.compile`` the two parameter names are
    presumably swapped relative to what they receive. The value is the same
    either way because the squared difference is symmetric -- confirm before
    making the loss asymmetric.
    """
    log_first = tf.math.log(tf.cast(y_pred, dtype="float32") + 1)
    log_second = tf.math.log(tf.cast(y_true, dtype="float32") + 1)
    squared_gap = tf.square(log_first - log_second)
    return tf.sqrt(tf.keras.backend.mean(squared_gap))

In [None]:
def lr_schaker(epoch, lr):
    """Learning-rate schedule that bumps the rate by 10% at fixed epochs.

    Returns ``lr * 1.1`` when ``epoch`` is 10, 40, 80, 160 or 250 and the
    unchanged ``lr`` for every other epoch.
    """
    bump_epochs = (10, 40, 80, 160, 250)
    return lr * 1.1 if epoch in bump_epochs else lr

In [None]:
# Build the network with the default seed (also prints the model summary).
model = LSTM_Model()

In [None]:
# Training hyper-parameters.
leRa=0.25   # initial learning rate
dec=0.000#0 # learning-rate decay handed to the optimizer
eps=444     # number of epochs
# Sanity check that a linear decay over all epochs would not go negative.
print("Learningrate ok: " + str(leRa - dec * eps >= 0))
bs=1        # must stay 1: the model input is built with a fixed batch size of 1
clip=100    # per-element gradient clipping value

# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
# optimizers no longer honour.
# NOTE(review): `decay` is likewise a legacy optimizer argument; it is kept
# because removing it would change behaviour for a non-zero `dec`.
optimizer = ks.optimizers.SGD(
    learning_rate=leRa,
    decay=dec,
    clipvalue=clip,
    momentum=0.85,
    nesterov=True
    )

model.compile(
    optimizer = optimizer,
    loss = rmsle,
    metrics=["mae"]
    )

# Fit on the full stacked array in its original order (shuffle=False) with
# the held-out window as validation data. fit() returns the History object,
# so no separate History callback is created beforehand.
history = model.fit(
    x=train,
    y=y,
    validation_data=(val, y_val),
    epochs = eps,
    batch_size = bs,
    shuffle = False,
    callbacks=[lrReducer, ks.callbacks.LearningRateScheduler(lr_schaker, verbose=0)],
    verbose=1
    )

ValPredictions = model.predict(
   x=test,
   batch_size = bs
)

# Flatten (samples, timesteps, targets) to one row per timestep. The row
# count is inferred instead of hard-coded, so the cell does not depend on a
# fixed test-set size.
ValPredictions = pd.DataFrame(np.reshape(ValPredictions, (-1, ValPredictions.shape[-1])))
# Remove the predictions for the three padding rows appended to the test set.
ValPredictions = ValPredictions.iloc[:-3,:]

Now, we use the predictions for interpolation.

In [None]:
# Write the network predictions into the submission template
# (targetCols[:-1] = target names without the trailing "date_time" entry).
samSub[targetCols[:-1]] = ValPredictions.values
# Where the external sheet has a non-negative value, use it instead of the
# prediction; the comparison is element by element, so both tables must have
# the same row order.
for t in targetCols[:-1]:
    samSub[t] = np.where(pslbLEAK[t] < 0, samSub[t], pslbLEAK[t])
    
# Blend 50/50 with the pseudo-label table (which was itself overlaid with
# the external values right after loading).
samSub[targetCols[:-1]] = samSub[targetCols[:-1]] * 0.5 + pslb[targetCols[:-1]] * 0.5

In [None]:
# Write the submission file without the index column and display summary
# statistics of the submitted values.
samSub.to_csv("Submission.csv",index=False)
samSub.describe()