In [None]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from scipy import stats
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K
from ipywidgets import interactive_output
from ipywidgets import Text, HBox, VBox, Select, fixed
import math
import random
import lightgbm
from scipy.stats import probplot, pearsonr

import seaborn as sns

### Seed every random number generator this notebook draws from, so that a fresh
### "Restart & Run All" reproduces the same results: Python's `random`
### (used to pick plot colors), NumPy, and TensorFlow.
SEED = 3
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Load the low-memory training parquet and drop the `row_id` column.
train_data = pd.read_parquet('../input/ubiquant-parquet-low-mem/train_low_mem.parquet')
train_data = train_data.drop('row_id', axis=1)
# Cast the id columns with `astype` on the frame itself. Assigning the cast values
# back through `.loc[:, cols] = ...` writes into the existing columns and may keep
# their previous dtype, in which case the int16 conversion would not take effect.
train_data = train_data.astype({'time_id': np.int16, 'investment_id': np.int16})
train_data = train_data.sort_values(['time_id', 'investment_id'], ascending=True)

In [None]:
# Report the frame dimensions and the total number of missing cells
# (per-column null counts summed over all columns).
n_missing = train_data.isnull().sum().sum()
print('Rows and Columns in train dataset:', train_data.shape)
print('Missing values in train dataset:', n_missing)

In [None]:
# Three stacked panels sharing the `time_id` x-axis:
#   0) number of distinct investments, 1) mean target, 2) target standard deviation.
# `random` is already imported in the first cell, so it is not re-imported here.
colors = sns.color_palette('Wistia', 20)
mean_target = train_data.groupby(['investment_id'])['target'].mean()
target_by_time = train_data.groupby('time_id')['target']
# Computed once and reused for both the curve and its reference line.
target_std_by_time = target_by_time.std()

fig, axes = plt.subplots(3, figsize=(20, 10), sharex=True)

train_data.groupby('time_id')['investment_id'].nunique().plot(color=random.choice(colors), ax=axes[0])
axes[0].set_title('Unique investment_id per time_id')

target_by_time.mean().plot(color=random.choice(colors), ax=axes[1])
# Reference line: average of the per-investment mean targets.
axes[1].axhline(y=np.mean(mean_target), color='black', linestyle='--', label="mean")
axes[1].set_title('Mean target per time_id')
axes[1].legend()  # without this call the `label` given above is never shown

target_std_by_time.plot(color=random.choice(colors), ax=axes[2])
# Reference line: average of the per-time standard deviations.
axes[2].axhline(y=np.mean(target_std_by_time), color='black', linestyle='--', label="mean")
axes[2].set_title('Target standard deviation per time_id')
axes[2].legend()

plt.show()

In [None]:
def scatter_hist(plot_data, col_x, col_y, hue=None, title=None):
    """Build a scatter plot of `col_y` against `col_x` with marginal histograms.

    The figure is a 2x2 grid: the scatter plot is in the lower-left cell, the
    histogram of `col_x` is above it and the histogram of `col_y` is to its
    right. The upper-right cell holds no plot.

    Args:
        plot_data: Data passed as `data=` to the seaborn plotting calls.
        col_x: Name of the variable plotted on the x-axis.
        col_y: Name of the variable plotted on the y-axis.
        hue: Optional grouping variable forwarded to every seaborn call.
        title: Optional figure title; ignored unless it is a string.

    Returns:
        The matplotlib figure. It is closed in pyplot before being returned,
        so the caller has to display it explicitly.
    """
    fig, axes = plt.subplots(
        2, 2, figsize=(10, 10), facecolor='white',
        gridspec_kw={'width_ratios': [9, 1], 'height_ratios': [1, 9]},
    )
    sns.scatterplot(data=plot_data, x=col_x, y=col_y, ax=axes[1, 0], hue=hue)
    sns.histplot(data=plot_data, x=col_x, ax=axes[0, 0], hue=hue, kde=True)
    sns.histplot(data=plot_data, y=col_y, ax=axes[1, 1], hue=hue, kde=True)

    for i, ax in enumerate(axes.ravel()):
        ax.grid(True)
        if i == 2:
            # Flattened index 2 is axes[1, 0], the scatter panel: keep its ticks and frame.
            continue
        # Every other panel is stripped of ticks, axis labels and frame.
        ax.tick_params(length=0)
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        for loc in ['top', 'bottom', 'left', 'right']:
            ax.spines[loc].set_visible(False)

    if isinstance(title, str):
        fig.suptitle(title)
    fig.tight_layout(rect=[0, 0, 0.96, 1])
    # Close this specific figure rather than whichever one pyplot considers current.
    plt.close(fig)
    return fig


def scatter_groupby(dataframe, col_x, agg_x, col_y, agg_y, groupby):
    """Aggregate two columns per group and display them as a scatter/histogram figure.

    Args:
        dataframe: Frame containing `col_x`, `col_y` and the `groupby` column.
        col_x: Column aggregated for the x-axis.
        agg_x: Aggregation applied to `col_x` (passed to pandas `agg`, e.g. 'nunique').
        col_y: Column aggregated for the y-axis.
        agg_y: Aggregation applied to `col_y` (passed to pandas `agg`, e.g. 'mean').
        groupby: Column to group by.

    Returns:
        None. The figure built by `scatter_hist` is shown via `display`.
    """
    grouped = dataframe.groupby(groupby)
    # Aggregate only the two requested columns; aggregating the id columns as well
    # is wasted work and leaves duplicate column names in the combined frame.
    y_values = grouped[col_y].agg(agg_y)
    x_values = grouped[col_x].agg(agg_x)

    x_name, y_name = col_x, col_y
    if x_name == y_name:
        # The same source column on both axes would yield two identically named
        # columns, which the plotting calls cannot tell apart.
        x_name = f'{col_x}({agg_x}) [x]'
        y_name = f'{col_y}({agg_y}) [y]'

    plot_data = pd.concat([y_values.rename(y_name), x_values.rename(x_name)], axis=1)
    title = f'X: {col_x}({agg_x}) Y: {col_y}({agg_y}) / groupby: {groupby}'
    fig = scatter_hist(plot_data, x_name, y_name, title=title)
    display(fig)

In [None]:
# Interactive explorer: choose an X and a Y column, an aggregate for each, and the
# grouping key; the selections are wired to `scatter_groupby` through `interactive_output`.
column_options = train_data.columns[2:]
aggregate_options = ['mean', 'std', 'nunique', 'min', 'max']

data = fixed(train_data)
s_x = Select(description='X', options=column_options, value='f_62', rows=4)
s_y = Select(description='Y', options=column_options, value='target', rows=4)
s_aggx = Select(description='X_aggregate', options=aggregate_options, value='nunique', rows=4)
s_aggy = Select(description='Y_aggregate', options=aggregate_options, value='mean', rows=4)
s_gby = Select(description='groupby', options=['time_id', 'investment_id'], value='time_id', rows=2)

selector = HBox([s_x, s_aggx, s_y, s_aggy, s_gby])
plot_output = interactive_output(
    scatter_groupby,
    {
        'dataframe': data,
        'col_x': s_x,
        'agg_x': s_aggx,
        'col_y': s_y,
        'agg_y': s_aggy,
        'groupby': s_gby,
    },
)
display(selector, plot_output)

In [None]:
class Config:
    """Notebook-wide settings, stored as class attributes."""

    # Directory prefix of the TFRecord fold files and `investment_ids.csv`.
    tf_record_dataset_path = "../input/ump-combinatorialpurgedgroupkfold-tf-record/"
    # Directory prefix from which saved weights are loaded when `is_training` is False.
    output_dataset_path = "../input/ubiquant-market-prediction-with-dnn-output/"
    # True: fit the model and checkpoint it. False: load weights from `output_dataset_path`.
    is_training = True


config = Config()

In [None]:
%%time
# Read the investment id list stored next to the TFRecord folds. The location comes
# from `config` so it cannot drift from the path used for the fold files.
investment_ids = pd.read_csv(f"{config.tf_record_dataset_path}investment_ids.csv")
# One slot more than the number of listed ids, presumably reserved for the
# out-of-vocabulary token of `IntegerLookup` (TODO confirm).
investment_id_size = len(investment_ids) + 1
with tf.device("cpu"):
    investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
    investment_id_lookup_layer.adapt(investment_ids)

In [None]:
def decode_function(record_bytes):
    """Parse one serialized example into a dict of tensors.

    Args:
        record_bytes: A single serialized record as read from the TFRecord file.

    Returns:
        Dict with keys "features" (float32, length 300), "time_id" (int64 scalar),
        "investment_id" (int64 scalar) and "target" (float32 scalar).
    """
    schema = {
        "features": tf.io.FixedLenFeature([300], dtype=tf.float32),
        "time_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "investment_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "target": tf.io.FixedLenFeature([], dtype=tf.float32),
    }
    return tf.io.parse_single_example(record_bytes, schema)

def preprocess(item):
    """Turn a decoded record into the `(inputs, label)` pair: its features and its target."""
    features = item["features"]
    target = item["target"]
    return features, target
def make_dataset(file_paths, batch_size=4096, mode="train"):
    """Build a batched `tf.data` pipeline of `(features, target)` pairs from TFRecord files.

    Args:
        file_paths: Paths of the TFRecord files to read.
        batch_size: Number of records per batch; the shuffle buffer holds
            `batch_size * 4` records.
        mode: "train" enables shuffling; any other value keeps file order.

    Returns:
        A `tf.data.Dataset` yielding `(features, target)` batches.
    """
    ds = tf.data.TFRecordDataset(file_paths)
    ds = ds.map(decode_function, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    # Cache the decoded records BEFORE shuffling and batching. Caching afterwards
    # stores the first epoch's shuffled batches and replays them unchanged in every
    # later epoch, which defeats the per-epoch reshuffle.
    ds = ds.cache()
    if mode == "train":
        ds = ds.shuffle(batch_size * 4)
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
def correlation(x, y, axis=-2):
    """Pearson-correlation distance ``1 - r`` of two tensors along `axis`.

    Despite its name this does NOT return the correlation coefficient itself:
    it returns one minus it, so lower is better (perfectly correlated inputs give 0).

    Args:
        x: First tensor (anything `tf.convert_to_tensor` accepts).
        y: Second tensor; cast to the dtype of `x`.
        axis: Axis over which the correlation is computed, default -2.

    Returns:
        Tensor of dtype `x.dtype` with `axis` reduced away, holding ``1 - r``.
        Where either input has zero variance along `axis`, `r` is taken as 0
        and the result is 1 instead of NaN.
    """
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)

    # keepdims=True keeps the reduced axis so the means broadcast correctly
    # against the inputs for any rank and any choice of `axis`.
    x_centered = x - tf.reduce_mean(x, axis=axis, keepdims=True)
    y_centered = y - tf.reduce_mean(y, axis=axis, keepdims=True)

    xvar = tf.reduce_sum(tf.square(x_centered), axis=axis)
    yvar = tf.reduce_sum(tf.square(y_centered), axis=axis)
    cov = tf.reduce_sum(x_centered * y_centered, axis=axis)

    # divide_no_nan yields 0 where the denominator is 0 (constant input), so one
    # degenerate batch cannot turn the averaged metric into NaN.
    corr = tf.math.divide_no_nan(cov, tf.sqrt(xvar * yvar))
    return tf.constant(1.0, dtype=x.dtype) - corr

def get_model():
    """Build and compile the regression network over the 300 input features.

    Architecture, in order: a Dense(256) stem with dropout, a reshape to a
    single-channel sequence, six Conv1D -> BatchNormalization -> LeakyReLU stages,
    a flatten, three L2-regularized Dense + Dropout stages (512, 128, 32 units)
    and a single linear output unit.

    Returns:
        A compiled `tf.keras.Model` trained with MSE loss and Adam(0.001), reporting
        mse, mae, mape, rmse and the `correlation` metric.
    """
    features_inputs = tf.keras.Input((300, ), dtype=tf.float16)

    # Dense stem.
    feature_x = layers.Dense(256, activation='swish')(features_inputs)
    feature_x = layers.Dropout(0.1)(feature_x)

    # Convolutional stack: each entry is (filters, strides) for one
    # Conv1D -> BatchNormalization -> LeakyReLU stage.
    feature_x = layers.Reshape((-1, 1))(feature_x)
    conv_stages = [(16, 1), (16, 4), (64, 1), (64, 4), (64, 4), (64, 2)]
    for n_filters, stride in conv_stages:
        feature_x = layers.Conv1D(filters=n_filters, kernel_size=4, strides=stride, padding='same')(feature_x)
        feature_x = layers.BatchNormalization()(feature_x)
        feature_x = layers.LeakyReLU()(feature_x)
    feature_x = layers.Flatten()(feature_x)

    # Fully connected head.
    x = feature_x
    for units in (512, 128, 32):
        x = layers.Dense(units, activation='swish', kernel_regularizer="l2")(x)
        x = layers.Dropout(0.1)(x)
    output = layers.Dense(1)(x)

    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[features_inputs], outputs=[output])
    model.compile(
        optimizer=tf.optimizers.Adam(0.001),
        loss='mse',
        metrics=['mse', "mae", "mape", rmse, correlation],
    )
    return model

In [None]:
# Instantiate one model to inspect its layer table and architecture diagram.
model = get_model()
model.summary()
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
%%time
# Train (or load) one model per fold, score it on the fold's held-out split and
# collect the models for inference. Only fold 0 is used here.
models = []
for i in range(1):
    train_path = f"{config.tf_record_dataset_path}fold_{i}_train.tfrecords"
    valid_path = f"{config.tf_record_dataset_path}fold_{i}_test.tfrecords"
    valid_ds = make_dataset([valid_path], mode="valid")
    print(valid_ds)
    model = get_model()
    if config.is_training:
        train_ds = make_dataset([train_path])
        # `correlation` returns 1 - r, so the best checkpoint is the one with the minimum value.
        checkpoint = keras.callbacks.ModelCheckpoint(f"model_{i}.tf", monitor="val_correlation", mode="min", save_best_only=True, save_weights_only=True)
        # NOTE: no `monitor` is given, so early stopping uses the Keras default
        # (`val_loss`), while the checkpoint tracks `val_correlation`.
        early_stop = keras.callbacks.EarlyStopping(patience=10)
        history = model.fit(train_ds, epochs=40, validation_data=valid_ds, callbacks=[checkpoint, early_stop])
        # Restore the best checkpoint rather than keeping the last-epoch weights.
        model.load_weights(f"model_{i}.tf")
        for metric in ["loss", "mae", "mape", "rmse", "correlation"]:
            pd.DataFrame(history.history, columns=[metric, f"val_{metric}"]).plot()
            plt.title(metric.upper())
            plt.show()
    else:
        model.load_weights(f"{config.output_dataset_path}model_{i}.tf")
    # Collect the validation targets batch by batch as arrays; appending every
    # element to a Python list is needlessly slow for a large validation split.
    y_val = np.concatenate([y.numpy().reshape(-1) for _, y in valid_ds])
    pearson_score = stats.pearsonr(model.predict(valid_ds).reshape(-1), y_val)[0]
    models.append(model)
    print(f"Pearson Score: {pearson_score}")


In [None]:
def make_test_dataset(feature, batch_size=1024):
    """Wrap the feature rows in a batched, cached and prefetched `tf.data.Dataset`.

    Args:
        feature: Feature rows, sliced along their first dimension.
        batch_size: Number of rows per batch.

    Returns:
        A `tf.data.Dataset` yielding batches of feature rows in input order.
    """
    rows = tf.data.Dataset.from_tensor_slices(feature)
    batches = rows.batch(batch_size)
    return batches.cache().prefetch(tf.data.AUTOTUNE)

def inference(models, ds):
    """Average the predictions of every model in `models` on the dataset `ds`.

    Args:
        models: Iterable of objects exposing `predict(ds)`.
        ds: Dataset handed unchanged to each model's `predict`.

    Returns:
        Element-wise mean of the per-model prediction arrays.
    """
    per_model_predictions = [member.predict(ds) for member in models]
    return np.mean(per_model_predictions, axis=0)

In [None]:
# Submission loop: the competition API hands out test batches one at a time and
# expects the filled-in prediction frame back for each of them.
import ubiquant

env = ubiquant.make_env()
iter_test = env.iter_test()
features = [f"f_{i}" for i in range(300)]
for (test_df, sample_prediction_df) in iter_test:
    ds = make_test_dataset(test_df[features])
    # `inference` averages `model.predict` outputs, which have shape (n_rows, 1) for
    # the single-unit output layer; flatten to 1-D before assigning a single column.
    sample_prediction_df['target'] = inference(models, ds).reshape(-1)
    env.predict(sample_prediction_df)