# Train 1DCNN by using lag features

This notebook trains a 1D-CNN on lag features.

For more information on the lag feature, please see [this discussion](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/303147).

To save time and memory, I converted train.csv to a numpy array beforehand.
- [original features](https://www.kaggle.com/takamichitoda/ump-npy-dataset)
- [lag features](https://www.kaggle.com/takamichitoda/ump-norm-lag-1-features)
- [agg past avg features](https://www.kaggle.com/takamichitoda/ump-agg-average-value-features)

Update history:
- Version 9: TimeSeriesSplit, use past average value features
- Version 11: hold out & train all, normalize lag features
- Version 12: hold out & train all


In [None]:
import gc
import os
import random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K

# TPU bootstrap: resolve the cluster, connect to it, initialise the TPU
# system, then build the distribution strategy. The calls must stay in
# this order because each one consumes the `tpu` resolver created first.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# `strategy` is used by later cells to build models under `strategy.scope()`.
# NOTE(review): this is the experimental alias; newer TensorFlow releases
# expose `tf.distribute.TPUStrategy` -- confirm against the installed
# version before switching.
strategy = tf.distribute.experimental.TPUStrategy(tpu)
print('Running on TPU ', tpu.master())
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
class GCF:
    """Global configuration: input locations, seed/split settings and training hyper-parameters."""

    # --- input data ---------------------------------------------------------
    # Directory holding the pre-converted .npy files (features, targets, time ids).
    INPUT_ROOT = "/kaggle/input/ump-npy-dataset/"
    # Additional feature matrix stacked next to the base features.
    # Alternative (normalised lag-1 features):
    #   "/kaggle/input/ump-norm-lag-1-features/lag_1_features_std_scaled.npy"
    ADD_FEATURES = "/kaggle/input/ump-agg-average-value-features/agg_avg_features_std_scaled.npy"

    # --- reproducibility / splitting ----------------------------------------
    SEED = 0  # default seed of seed_everything()
    N_FOLDS = 5  # not referenced by the cells visible in this notebook
    N_TRAIN = 700_000  # 705086; not referenced by the cells visible in this notebook

    # --- training -------------------------------------------------------------
    BATCH_SIZE = 4096
    N_EPOCHS = 1000  # upper bound; the hold-out run relies on early stopping
    EARLY_STOPPING_PATIENCE = 10
    EARLY_STOPPING_MIN_DELTA = 1e-3
    ALL_TRAIN_ADD_EPOCH = 3  # extra epochs added on top of the best hold-out epoch

In [None]:
def seed_everything(seed=GCF.SEED):
    """Seed every random source used by the notebook with ``seed``.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy and TensorFlow.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

In [None]:
%%time

# Targets and the time id of every row.
y = np.load(f"{GCF.INPUT_ROOT}/targets.npy")
time_id = np.load(f"{GCF.INPUT_ROOT}/time_id.npy")

# Base features with the additional feature matrix stacked next to them.
# Both arrays are loaded as temporaries so only the stacked result stays bound.
X = np.hstack((
    np.load(f"{GCF.INPUT_ROOT}/features_std_scaled.npy"),
    np.load(GCF.ADD_FEATURES),
))

# Force a collection pass now that the unstacked arrays are unreferenced.
gc.collect()

In [None]:
# https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302977

def correlationMetric(x, y, axis=-2):
    """Return ``1 - r``, where ``r`` is the Pearson correlation of ``x`` and ``y`` along ``axis``.

    Despite the name, the returned value is one minus the correlation, so
    lower is better (0 when the inputs are perfectly correlated).

    Args:
        x: Tensor-like input; converted with ``tf.convert_to_tensor``.
        y: Tensor-like input; cast to the dtype of ``x``.
        axis: Axis to reduce over (default -2).

    Returns:
        Tensor of ``1 - r`` with ``axis`` reduced away. There is no guard
        against zero variance: if either input is constant along ``axis``
        the division is 0/0 and the result is NaN.
    """
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)
    # keepdims=True keeps the means broadcastable against the inputs for any
    # `axis`, not only for the default axis on 2-D inputs.
    xmean = tf.reduce_sum(x, axis=axis, keepdims=True) / n
    ymean = tf.reduce_sum(y, axis=axis, keepdims=True) / n
    xvar = tf.reduce_sum(tf.math.squared_difference(x, xmean), axis=axis)
    yvar = tf.reduce_sum(tf.math.squared_difference(y, ymean), axis=axis)
    cov = tf.reduce_sum((x - xmean) * (y - ymean), axis=axis)
    corr = cov / tf.sqrt(xvar * yvar)
    return tf.constant(1.0, dtype=x.dtype) - corr

def correlationLoss(x, y, axis=-2):
    """Loss combining ``1 - r`` (Pearson correlation) with a small scaled squared-error term.

    The value is ``mean(1 - r + 0.01 * sqdif)``, where ``sqdif`` is the mean
    squared difference between ``x`` and ``y`` along ``axis`` divided by the
    standard deviation of ``y``. The squared-error term pulls the two inputs
    toward the same scale in addition to maximising their correlation.

    NOTE(review): Keras calls losses as ``fn(y_true, y_pred)``, which would
    make ``y`` the prediction and the normaliser the prediction's standard
    deviation -- confirm this is intended.

    Args:
        x: Tensor-like input; converted with ``tf.convert_to_tensor``.
        y: Tensor-like input; cast to the dtype of ``x``.
        axis: Axis to reduce over (default -2).

    Returns:
        Scalar float32 tensor. NaN if either input has zero variance along
        ``axis`` (no guard against 0/0).
    """
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)
    # keepdims=True keeps the means broadcastable against the inputs for any
    # `axis`, not only for the default axis on 2-D inputs.
    xmean = tf.reduce_sum(x, axis=axis, keepdims=True) / n
    ymean = tf.reduce_sum(y, axis=axis, keepdims=True) / n
    xsqsum = tf.reduce_sum(tf.math.squared_difference(x, xmean), axis=axis)
    ysqsum = tf.reduce_sum(tf.math.squared_difference(y, ymean), axis=axis)
    cov = tf.reduce_sum((x - xmean) * (y - ymean), axis=axis)
    corr = cov / tf.sqrt(xsqsum * ysqsum)
    sqdif = tf.reduce_sum(tf.math.squared_difference(x, y), axis=axis) / n / tf.sqrt(ysqsum / n)
    one = tf.constant(1.0, dtype=x.dtype)
    # Cast (rather than convert_to_tensor with a dtype) so non-float32 inputs
    # are converted instead of raising.
    return tf.cast(tf.reduce_mean(one - corr + (0.01 * sqdif)), tf.float32)


# https://www.kaggle.com/c/ubiquant-market-prediction/discussion/301987
def pearson_coef(data):
    """Return the correlation between the 'target' and 'preds' columns of ``data``.

    ``data`` is a DataFrame; the value is read from its ``corr()`` matrix.
    """
    corr_matrix = data.corr()
    return corr_matrix.loc['preds', 'target']

def comp_metric(time_id, y, pred):
    """Return the mean over ``time_id`` groups of the target/prediction correlation.

    The three inputs are stacked into a DataFrame with columns 'time_id',
    'target' and 'preds'; ``pearson_coef`` is applied to each 'time_id'
    group and the per-group values are averaged.
    """
    frame = pd.DataFrame(
        np.stack([time_id, y, pred]).T,
        columns=['time_id', 'target', 'preds'],
    )
    per_time_id = frame.groupby('time_id').apply(pearson_coef)
    return np.mean(per_time_id)

In [None]:
# https://www.kaggle.com/sishihara/1dcnn-for-tabular-from-moa-2nd-place
def create_model(n_features=300 + 300, learning_rate=1e-4):
    """Build and compile the 1D-CNN regression model.

    Architecture: a dense projection to 1024 units, reshaped into a
    length-64 sequence with 16 channels, followed by dropout, one Conv1D
    layer, max pooling and a small dense head with a single linear output.

    Args:
        n_features: Width of the input feature vector. The default of 600
            matches the original hard-coded ``300+300``.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``keras.Sequential`` model using ``correlationLoss`` as
        the loss and RMSE plus ``correlationMetric`` as metrics.
    """
    model = keras.Sequential([
        layers.Dense(4096 // 4, activation='relu', input_shape=(n_features,)),
        # 1024 dense units viewed as 64 steps x 16 channels for the convolution.
        layers.Reshape((256 // 4, 16)),
        layers.Dropout(0.75),
        layers.Conv1D(filters=16, kernel_size=5, strides=1, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='linear'),
    ])

    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate),
        loss=correlationLoss,
        metrics=[keras.metrics.RootMeanSquaredError(), correlationMetric],
    )

    return model

In [None]:
# Training window: rows with 1015 < time_id <= 1066.
# The comparison already yields a boolean mask, so no np.where is needed.
is_train = (time_id > 1015) & (time_id <= 1066)

# Hold-out window: every row after the training window.
is_test = time_id > 1066

# Row counts of each split; ndarray.sum() avoids iterating the mask in Python.
is_train.sum(), is_test.sum()

In [None]:
seed_everything()

# Build and compile the model inside the distribution strategy scope.
with strategy.scope():
    model = create_model()

# Stop when val_loss has not improved by at least min_delta for `patience`
# epochs, then roll the weights back to the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=GCF.EARLY_STOPPING_MIN_DELTA,
    patience=GCF.EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
)
# Halve the learning rate after 3 epochs without val_loss improvement,
# never going below 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5, verbose=1,
)

# Fit on the training window and validate on the hold-out window.
history = model.fit(
    X[is_train],
    y[is_train],
    validation_data=(X[is_test], y[is_test]),
    epochs=GCF.N_EPOCHS,
    batch_size=GCF.BATCH_SIZE,
    callbacks=[early_stopping, reduce_lr],
)

model.save("ump_1dcnn_holdout.h5")

In [None]:
# Hold-out predictions; the model ends in Dense(1), so flatten to 1-D.
valid_pred = model.predict(X[is_test, :]).reshape(-1)

# np.sqrt(MSE) instead of `squared=False`, which recent scikit-learn
# releases no longer accept.
rmse = np.sqrt(mean_squared_error(y[is_test], valid_pred))
score = comp_metric(time_id[is_test], y[is_test], valid_pred)
print(f'RMSE={rmse}, SCORE={score}')

# Train vs. validation curves of the hold-out run, one figure per metric.
history_df = pd.DataFrame(history.history)
for plot_title, metric_name in (
    ("loss", "loss"),
    ("rmse", "root_mean_squared_error"),
    ("correlation", "correlationMetric"),
):
    history_df[[metric_name, f"val_{metric_name}"]].plot()
    plt.title(plot_title)
    plt.show()

In [None]:
# Number of epochs the hold-out run actually executed.
run_epoch = len(history.history['loss'])
if early_stopping.stopped_epoch > 0:
    # Early stopping fired after `patience` epochs without improvement,
    # so the best epoch lies that many epochs before the end.
    best_epoch = run_epoch - GCF.EARLY_STOPPING_PATIENCE
else:
    # Training ran through every scheduled epoch without early stopping;
    # subtracting the patience here would under-count the epochs to replay.
    best_epoch = run_epoch
print(f"best epoch is {best_epoch}")

In [None]:
seed_everything()

# Fresh model, built inside the distribution strategy scope.
with strategy.scope():
    model = create_model()

# This run has no validation data, so `val_loss` is never logged and a
# callback monitoring it would do nothing but warn every epoch. Watch the
# training loss instead so the learning-rate schedule is actually active.
reduce_lr = ReduceLROnPlateau(
                    monitor='loss',
                    factor=0.5,
                    patience=3,
                    min_lr=1e-5,
                    verbose=1
)

# Union of the training and hold-out windows, computed once.
is_all = is_train | is_test

# Train for the best hold-out epoch count plus a few extra epochs.
history = model.fit(
    X[is_all, :], y[is_all],
    batch_size=GCF.BATCH_SIZE,
    epochs=best_epoch + GCF.ALL_TRAIN_ADD_EPOCH,
    callbacks=[reduce_lr],
)

model.save("ump_1dcnn_all_train.h5")

In [None]:
# Training curves of the full-data run, one figure per logged metric.
for _column, _title in (
    ('loss', "loss"),
    ('root_mean_squared_error', "rmse"),
    ('correlationMetric', "correlation"),
):
    pd.DataFrame(history.history)[[_column]].plot()
    plt.title(_title)
    plt.show()

In [None]:
!ls 