In [None]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
import lightgbm
from scipy.stats import probplot, pearsonr
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import ubiquant
from scipy.stats import pearsonr
import tensorflow.keras.layers as L
from tensorflow.python.ops import math_ops
import tensorflow.keras.models as M
import tensorflow.keras.backend as K

In [None]:
# Read the reduced-memory training parquet and display per-column summary statistics.
train_low = pd.read_parquet(
    path='../input/ubiquant-parquet/train_low_mem.parquet',
    engine='pyarrow',
)
train_low.describe()

In [None]:
# Feature column names: every column except the identifiers and the label.
# Dropping on the column Index (instead of on the DataFrame) yields the same
# Index without first materialising a copy of the whole training frame.
f_col = train_low.columns.drop(['row_id', 'time_id', 'investment_id', 'target'])
f_col

In [None]:
def anomaly_detect(df_train, cols=None, hard_sigma=70, soft_sigma=35, soft_max_rows=6):
    """Find rows holding extreme values in any of the inspected columns.

    For each column the mean and standard deviation are computed once, then:

    * if any value lies more than ``hard_sigma`` standard deviations from the
      mean, all such rows are flagged;
    * otherwise, if between 1 and ``soft_max_rows - 1`` values lie more than
      ``soft_sigma`` standard deviations from the mean, those rows are flagged.

    The column name and the number of flagged rows are printed for every
    flagged column, followed by a final summary line.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame to scan.
    cols : iterable of column labels, optional
        Columns to inspect. Defaults to the notebook-level ``f_col``.
    hard_sigma, soft_sigma : float
        Distance thresholds, in standard deviations (defaults 70 and 35).
    soft_max_rows : int
        Exclusive upper bound on the number of rows the soft rule may flag
        (default 6).

    Returns
    -------
    (list, list)
        The flagged column names (in scan order) and the de-duplicated index
        labels of the flagged rows (unordered).
    """
    if cols is None:
        cols = f_col

    def _beyond(values, center, spread, n_sigma):
        # Boolean mask of values further than n_sigma * spread from center.
        return (values > center + spread * n_sigma) | (values < center - spread * n_sigma)

    flagged_rows = set()
    outlier_col = []

    for col in cols:
        values = df_train[col]
        # Column statistics are loop-invariant for both rules: compute them once.
        center = values.mean()
        spread = values.std()

        mask = _beyond(values, center, spread, hard_sigma)
        n_flagged = int(mask.sum())
        if n_flagged == 0:
            # The soft rule only applies when the hard rule found nothing.
            mask = _beyond(values, center, spread, soft_sigma)
            n_flagged = int(mask.sum())
            if not 0 < n_flagged < soft_max_rows:
                continue

        flagged_rows.update(values.index[mask].tolist())
        outlier_col.append(col)
        print(col, n_flagged)

    outlier_list = list(flagged_rows)
    print(len(outlier_col), len(outlier_list))
    return outlier_col, outlier_list

outlier_col, outlier_list = anomaly_detect(train_low)
# anomaly_detect returns index *labels*. Drop by label rather than feeding the
# labels through positional indexing (`train_low.index[...]`), which only picks
# the right rows while the index is still an untouched 0..n-1 range.
train_low.drop(index=outlier_list, inplace=True)
train_low

In [None]:
# Fit a standardiser on the investment_id column of the filtered training rows;
# make_dataset() applies it later.
scaler = StandardScaler()
scaler.fit(train_low['investment_id'].to_frame())


In [None]:
def make_dataset(df, feature_cols=None, id_scaler=None):
    """Build the model input frame with a standardised ``investment_id``.

    The result has ``investment_id`` (transformed by the scaler) as its first
    column followed by the feature columns. Unlike the previous version, the
    caller's ``df`` is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``investment_id`` and every feature column.
    feature_cols : iterable of column labels, optional
        Feature columns to keep. Defaults to the notebook-level ``f_col``.
    id_scaler : object with a ``transform`` method, optional
        Scaler applied to the one-column ``investment_id`` frame. Defaults to
        the notebook-level fitted ``scaler``.

    Returns
    -------
    pandas.DataFrame
        ``investment_id`` (scaled) + feature columns, indexed like ``df``.
    """
    if feature_cols is None:
        feature_cols = f_col
    if id_scaler is None:
        id_scaler = scaler

    # transform() yields an (n, 1) array; flatten it into a named column
    # aligned on the input index instead of writing it back into `df`.
    scaled = np.asarray(id_scaler.transform(df[['investment_id']])).reshape(-1)
    scaled_ids = pd.Series(scaled, index=df.index, name='investment_id')
    return pd.concat([scaled_ids, df[feature_cols]], axis=1)

def make_dataset_train(df):
    """Return ``investment_id`` (unscaled, as found in ``df``) followed by the
    ``f_col`` feature columns, as a new frame.

    Despite the name, the inference loop also calls this on test batches to
    feed the models that expect the raw id.
    """
    id_column = df['investment_id']
    feature_block = df[f_col]
    return pd.concat([id_column, feature_block], axis=1)

In [None]:
# Down-cast only the columns that df_x / df_y are built from. The remaining
# columns (row_id, time_id) are not used downstream, so they are left as-is
# rather than being coerced to float16 along with everything else.
# NOTE(review): float16 represents integers exactly only up to 2048, so larger
# investment_id values are rounded by this cast before scaling - confirm that
# this is intended.
df = train_low.astype(dict.fromkeys(['investment_id', 'target', *f_col], 'float16'))
df_x = make_dataset(df)
df_x

In [None]:
# Regression label as a single-column frame.
df_y = df['target'].to_frame()
df_y

In [None]:
# Drop the name of the float16 working frame; df_x and df_y were already built from it.
del df

In [None]:
def get_rnn_v2():
    """Build and compile the LSTM regressor.

    A 301-wide input goes through batch-norm, a swish Dense layer and dropout,
    is reshaped to a length-1 sequence, batch-normalised again, and passed
    through two LSTM layers before a single linear output unit.

    Compiled with Adam on an exponentially decaying learning rate, MSE loss
    and an RMSE metric.

    Returns
    -------
    tensorflow.keras Model
    """
    f300_in = L.Input(shape=[301], name='301 feature input')

    # Layers are instantiated in network order and then applied in sequence.
    pipeline = [
        L.BatchNormalization(name='batch_norm1'),
        L.Dense(256, activation='swish', name='dense1'),
        L.Dropout(0.1, name='dropout1'),
        # Adds a time axis of length 1 so the recurrent layers accept the tensor.
        L.Reshape((1, -1), name='reshape1'),
        L.BatchNormalization(name='batch_norm2'),
        L.LSTM(128, dropout=0.3, recurrent_dropout=0.3, return_sequences=True,
               activation='relu', name='lstm1'),
        L.LSTM(16, dropout=0.1, return_sequences=False, activation='relu', name='lstm2'),
        L.Dense(1, name='output'),
    ]
    tensor = f300_in
    for layer in pipeline:
        tensor = layer(tensor)

    model = M.Model([f300_in], [tensor])

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.003,
        decay_steps=9700,
        decay_rate=0.98,
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
        loss='mse',
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return model

In [None]:
def get_gru():
    """Build and compile the GRU regressor.

    The input width is taken from the notebook-level ``df_x``. Three swish
    Dense(256) layers and a batch-norm feed a length-1 sequence into three
    GRU(128) layers, followed by three swish Dense(16) layers and a single
    linear output unit.

    Compiled with Adam on an exponentially decaying learning rate, MSE loss
    and an RMSE metric.

    Returns
    -------
    tensorflow.keras Model
    """
    kl = tf.keras.layers
    inp = kl.Input(shape=[df_x.shape[1]], name="input_layer")

    # Layers are instantiated in network order and then applied in sequence.
    pipeline = [
        *[kl.Dense(256, activation="swish") for _ in range(3)],
        kl.BatchNormalization(),
        # Adds a time axis of length 1 so the recurrent layers accept the tensor.
        kl.Reshape((1, -1)),
        kl.GRU(128, recurrent_dropout=0.2, dropout=0.2, return_sequences=True),
        kl.GRU(128, recurrent_dropout=0.1, dropout=0.1, return_sequences=True),
        kl.GRU(128, recurrent_dropout=0.1, dropout=0.1, return_sequences=False),
        *[kl.Dense(16, activation="swish") for _ in range(3)],
        kl.Dense(1, name="output_layer"),
    ]
    tensor = inp
    for layer in pipeline:
        tensor = layer(tensor)

    model = tf.keras.Model(inp, tensor)

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.003,
        decay_steps=9700,
        decay_rate=0.98,
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
        loss='mse',
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return model

In [None]:
def pythonash_model():
    """Build and compile the plain feed-forward regressor.

    Seven stages of Dense(he_normal) -> BatchNormalization -> LeakyReLU(0.1)
    with widths 64, 128, 256, 512, 256, 128, 8; the fifth and seventh stages
    are followed by Dropout(0.4). A single linear unit produces the output.

    Compiled with Adam on an exponentially decaying learning rate, MSE loss
    and an RMSE metric.

    Returns
    -------
    tensorflow.keras Model
    """
    kl = tf.keras.layers
    inputs_ = tf.keras.Input(shape=[301])

    # (dense width, dropout rate applied after the stage or None)
    stage_plan = [
        (64, None),
        (128, None),
        (256, None),
        (512, None),
        (256, 0.4),
        (128, None),
        (8, 0.4),
    ]

    hidden = inputs_
    for width, drop_rate in stage_plan:
        hidden = kl.Dense(width, kernel_initializer='he_normal')(hidden)
        hidden = kl.BatchNormalization()(hidden)
        hidden = kl.LeakyReLU(0.1)(hidden)
        if drop_rate is not None:
            hidden = kl.Dropout(drop_rate)(hidden)

    outputs_ = kl.Dense(1)(hidden)
    model = tf.keras.Model(inputs=inputs_, outputs=outputs_)

    rmse = tf.keras.metrics.RootMeanSquaredError()
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.003,
        decay_steps=9700,
        decay_rate=0.98,
    )
    adam = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    model.compile(loss='mse', metrics=rmse, optimizer=adam)
    return model

pythonash_model().summary()

In [None]:
def get_dnn_inv(investment_id_size=None):
    """Build and compile the two-input DNN (investment-id embedding + features).

    The id input goes through the notebook-level ``investment_id_lookup_layer``,
    a 32-dim embedding and three swish Dense(64) layers; the 300 features go
    through three swish Dense(256) layers. Both branches are concatenated and
    passed through L2-regularised Dense layers (512, 128, 32) to a single
    linear output.

    The previous version referenced ``layers``, ``keras`` and
    ``investment_id_size``, none of which are imported or defined in this
    notebook; it now uses the existing ``L`` / ``tf.keras`` aliases.

    Parameters
    ----------
    investment_id_size : int, optional
        Input dimension of the embedding. Defaults to the number of distinct
        ``investment_id`` values in ``train_low``, matching the value used for
        the lookup layer and in ``get_model_corr``.

    Returns
    -------
    tensorflow.keras Model
        Compiled with Adam(0.001), MSE loss and mse/mae/mape/rmse metrics.
    """
    if investment_id_size is None:
        investment_id_size = train_low.investment_id.nunique()

    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((300, ), dtype=tf.float16)

    # investment_id branch: lookup -> embedding -> dense stack
    investment_id_x = investment_id_lookup_layer(investment_id_inputs)
    investment_id_x = L.Embedding(investment_id_size, 32, input_length=1)(investment_id_x)
    investment_id_x = L.Reshape((-1, ))(investment_id_x)
    investment_id_x = L.Dense(64, activation='swish')(investment_id_x)
    investment_id_x = L.Dense(64, activation='swish')(investment_id_x)
    investment_id_x = L.Dense(64, activation='swish')(investment_id_x)

    # feature branch
    feature_x = L.Dense(256, activation='swish')(features_inputs)
    feature_x = L.Dense(256, activation='swish')(feature_x)
    feature_x = L.Dense(256, activation='swish')(feature_x)

    # merged head
    x = L.Concatenate(axis=1)([investment_id_x, feature_x])
    x = L.Dense(512, activation='swish', kernel_regularizer="l2")(x)
    x = L.Dense(128, activation='swish', kernel_regularizer="l2")(x)
    x = L.Dense(32, activation='swish', kernel_regularizer="l2")(x)
    output = L.Dense(1)(x)

    rmse = tf.keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[investment_id_inputs, features_inputs], outputs=[output])
    model.compile(optimizer=tf.optimizers.Adam(0.001), loss='mse', metrics=['mse', "mae", "mape", rmse])
    return model

In [None]:
#tf.keras.utils.plot_model(pythonash_model(),show_shapes=True,expand_nested=True)

In [None]:
# Integer lookup over the training investment ids; the vocabulary is capped at
# the number of distinct ids and learned from the training column.
investment_id_lookup_layer = L.IntegerLookup(
    max_tokens=train_low['investment_id'].nunique()
)
investment_id_lookup_layer.adapt(
    train_low['investment_id'].to_frame(name="investment_ids")
)

In [None]:
def correlationMetric(x, y, axis=-2):
    """Return ``1 - r``, where ``r`` is the Pearson correlation of ``x`` and
    ``y`` computed along ``axis`` (default -2).

    ``y`` is cast to the dtype of ``x``. Lower values mean stronger positive
    correlation. No guard is applied when either input has zero variance
    along ``axis``.
    """
    x = tf.convert_to_tensor(x)
    y = math_ops.cast(y, x.dtype)
    count = tf.cast(tf.shape(x)[axis], x.dtype)

    x_mean = tf.reduce_sum(x, axis=axis) / count
    y_mean = tf.reduce_sum(y, axis=axis) / count

    # Sums of squared deviations and of cross deviations (unnormalised).
    x_dev_sq = tf.reduce_sum(tf.math.squared_difference(x, x_mean), axis=axis)
    y_dev_sq = tf.reduce_sum(tf.math.squared_difference(y, y_mean), axis=axis)
    cross_dev = tf.reduce_sum((x - x_mean) * (y - y_mean), axis=axis)

    pearson = cross_dev / tf.sqrt(x_dev_sq * y_dev_sq)
    return tf.constant(1.0, dtype=x.dtype) - pearson


def correlationLoss(x,y, axis=-2):
    """Loss equal to the mean of ``1 - r``, where ``r`` is the Pearson
    correlation of ``x`` and ``y`` along ``axis`` (default -2).

    Minimising it maximises the correlation between the two tensors. ``y`` is
    cast to the dtype of ``x``; the result is returned as a float32 tensor.
    No guard is applied when either input has zero variance along ``axis``.
    """
    x = tf.convert_to_tensor(x)
    y = math_ops.cast(y, x.dtype)
    count = tf.cast(tf.shape(x)[axis], x.dtype)

    x_mean = tf.reduce_sum(x, axis=axis) / count
    y_mean = tf.reduce_sum(y, axis=axis) / count

    # Sums of squared deviations and of cross deviations (unnormalised).
    x_dev_sq = tf.reduce_sum(tf.math.squared_difference(x, x_mean), axis=axis)
    y_dev_sq = tf.reduce_sum(tf.math.squared_difference(y, y_mean), axis=axis)
    cross_dev = tf.reduce_sum((x - x_mean) * (y - y_mean), axis=axis)

    pearson = cross_dev / tf.sqrt(x_dev_sq * y_dev_sq)
    mean_gap = K.mean(tf.constant(1.0, dtype=x.dtype) - pearson)
    return tf.convert_to_tensor(mean_gap, dtype=tf.float32)

def correlationMetric_01mse(x, y, axis=-2):
    """Return ``1 - r`` for the Pearson correlation of ``x`` and ``y`` along
    ``axis`` (default -2).

    The computation is the same as ``correlationMetric``, so this simply
    forwards to it.
    NOTE(review): the name suggests an MSE term was planned, but none is
    computed - confirm whether that is intended.
    """
    return correlationMetric(x, y, axis=axis)

# Run a garbage-collection pass before the remaining definitions in this cell.
gc.collect()

# list(GroupKFold(5).split(train , groups = train.index))[0]
def pearson_coef(data):
    """Pearson correlation between the ``target`` and ``preds`` columns of ``data``."""
    return data.corr()['target']['preds']

def evaluate_metric(valid_df):
    """Mean over ``time_id`` groups of the target/prediction Pearson correlation.

    ``valid_df`` must contain the columns ``time_id``, ``target`` and
    ``preds``. The previous version selected a ``'time_id_'`` column but
    grouped by ``'time_id'``, which fails with a KeyError; both now use
    ``'time_id'``.
    """
    # Select the two value columns after grouping so that each group handed to
    # pearson_coef contains only what the correlation needs.
    per_time_id = valid_df.groupby('time_id')[['target', 'preds']].apply(pearson_coef)
    return np.mean(per_time_id)


def get_model_corr(ft_units, x_units, x_dropout):
    """Build and compile the correlation-loss DNN.

    The single 301-wide input is split: column 0 goes through the
    notebook-level ``investment_id_lookup_layer``, a 32-dim embedding and
    three swish Dense(128) layers; columns 1.. go through batch-norm, Gaussian
    noise, a swish Dense(300) with dropout and then one swish Dense + dropout
    per entry of ``ft_units``. The branches are concatenated and passed
    through one L2-regularised Dense + swish + dropout per entry of
    ``x_units`` before a single linear output.

    Parameters
    ----------
    ft_units : sequence of int
        Widths of the feature-branch Dense layers.
    x_units : sequence of int
        Widths of the merged-head Dense layers.
    x_dropout : sequence of float
        Dropout rate for each merged-head layer, indexed like ``x_units``.

    Returns
    -------
    tensorflow.keras Model
        Compiled with Adam(0.0001), ``correlationLoss`` and the metrics
        mse, mae and ``correlationMetric``.
    """
    inputs = tf.keras.Input((301, ), dtype=tf.float16)

    # --- investment_id branch (first input column)
    # NOTE(review): the id reaches the lookup layer as float16 - confirm that
    # all ids are represented exactly at that precision.
    id_branch = investment_id_lookup_layer(inputs[:,0:1])
    id_branch = L.Embedding(train_low.investment_id.nunique(), 32, input_length=1)(id_branch)
    id_branch = L.Reshape((-1, ))(id_branch)
    for _ in range(3):
        id_branch = L.Dense(128, activation='swish')(id_branch)

    # --- feature branch (remaining input columns)
    feat_branch = tf.keras.layers.BatchNormalization()(inputs[:,1:])
    feat_branch = tf.keras.layers.GaussianNoise(0.035)(feat_branch)
    feat_branch = L.Dense(300, activation='swish')(feat_branch)
    feat_branch = tf.keras.layers.Dropout(0.5)(feat_branch)
    for width in ft_units:
        feat_branch = L.Dense(width, activation='swish')(feat_branch)
        feat_branch = tf.keras.layers.Dropout(0.35)(feat_branch)

    # --- merged head
    merged = L.Concatenate(axis=1)([id_branch, feat_branch])
    for idx, width in enumerate(x_units):
        merged = tf.keras.layers.Dense(width, kernel_regularizer="l2")(merged)
        merged = tf.keras.layers.Activation('swish')(merged)
        merged = tf.keras.layers.Dropout(x_dropout[idx])(merged)

    output = L.Dense(1)(merged)
    model = tf.keras.Model(inputs=[inputs], outputs=[output])
    model.compile(
        optimizer=tf.optimizers.Adam(0.0001),
        loss=correlationLoss,
        metrics=['mse', "mae", correlationMetric],
    )
    return model


# Keyword arguments for get_model_corr: layer widths of the feature branch,
# layer widths of the merged head, and the dropout rate paired with each
# merged-head layer.
params = dict(
    ft_units=[150, 75, 150, 200],
    x_units=[512, 256, 128, 32],
    x_dropout=[0.44, 0.4, 0.33, 0.2],
)

In [None]:
# from IPython.display import clear_output

# def fit_1fold(get_model_func, train_index, val_index, fold_i, model_name, params=None):
#     print(f"{get_model_func} fold = {fold_i}")
#     epochs = 20
#     train_x, train_y = df_x.iloc[train_index], df_y.iloc[train_index]
#     tf_train = tf.data.Dataset.from_tensor_slices((train_x, train_y)).shuffle(2022).batch(1024, drop_remainder=True).prefetch(1)
    
#     if params:
#         model = get_model_func(**params)
#     else:
#         model = get_model_func()

    
#     if val_index is not None:
#         val_x, val_y = df_x.iloc[val_index], df_y.iloc[val_index]
#         tf_val = tf.data.Dataset.from_tensor_slices((val_x, val_y)).shuffle(2022).batch(1024, drop_remainder=True).prefetch(1)
#         epochs = 50
        
#         callbacks = [
#             tf.keras.callbacks.ModelCheckpoint(f'{model_name}_{fold_i}.h5', save_best_only = True),
#             tf.keras.callbacks.EarlyStopping(min_delta=0.0001, patience=7, verbose=1)
#         ]
#         model.fit(tf_train, callbacks=callbacks, epochs=epochs, validation_data=(tf_val), shuffle=True)
#     else:
#         callbacks = [
#             tf.keras.callbacks.ModelCheckpoint(f'{model_name}_{fold_i}.h5', save_best_only=True, monitor='loss'),
#             tf.keras.callbacks.EarlyStopping(min_delta=0.0001, patience=7, verbose=1, monitor='loss')
#         ]
#         model.fit(tf_train, callbacks=callbacks, epochs=epochs, shuffle=True)

#     clear_output()
    
#     del tf_train
    
#     if val_index is not None:
#         del tf_val
#     del model
#     gc.collect()
    

# kfold_generator = TimeSeriesSplit(max_train_size=int(df_x.shape[0] * 0.6))

# i = 0
# train_val_ids = list(kfold_generator.split(df_x, df_y))
# train_val_ids.append((np.arange(int(df_x.shape[0] * 0.41), df_x.shape[0]), None))

# # for train_index, val_index in train_val_ids:
# #     fit_1fold(get_model_corr, train_index, val_index, i, 'dnn_corr', params)
# #     i += 1


# gc.collect()

# for train_index, val_index in train_val_ids:
#     fit_1fold(get_gru, train_index, val_index, i, 'gru')
#     i += 1


# gc.collect()

# for train_index, val_index in train_val_ids:
#     fit_1fold(pythonash_model, train_index, val_index, i, 'pythonash_model')
#     i += 1


# gc.collect()

# i = 0
# for train_index, val_index in train_val_ids:
#     fit_1fold(get_rnn_v2, train_index, val_index, i, 'rnn')
#     i += 1

In [None]:
# train_val_ids
# 

In [None]:
env = ubiquant.make_env()
iter_test = env.iter_test()

# Per-fold blend weights, indexed by fold number; only folds 3..5 are ensembled.
coefs = [0, 0, 0, 0.1, 0.15, 0.75]
ensemble_folds = range(3, 6)

# Load every checkpoint once, before iterating over the test batches.
# Previously all nine models were re-read from disk for each batch.
dnn_models = {
    # The pythonash checkpoints are numbered with an offset of 6 relative to the fold.
    i: tf.keras.models.load_model(f'../input/ubmodels/pythonash_model_{i+6}.h5')
    for i in ensemble_folds
}
rnn_models = {
    i: tf.keras.models.load_model(f'../input/ubmodels/rnn_{i}.h5')
    for i in ensemble_folds
}
dnn_corr_models = {
    i: tf.keras.models.load_model(
        f'../input/ubmodels/dnn_corr_{i}.tf',
        custom_objects={
            'correlationMetric': correlationMetric,
            'correlationLoss': correlationLoss,
        })
    for i in ensemble_folds
}
# The GRU checkpoints (gru_{i}.h5) are not part of the blend.


def _weighted_fold_sum(models, features):
    """Sum of coefs[fold] * model.predict(features) over the given fold models."""
    return sum(coefs[i] * model.predict(features) for i, model in models.items())


for (test_df, sample_prediction_df) in iter_test:
    # Build the raw-id frame first: make_dataset may overwrite investment_id
    # in test_df with its scaled value.
    test_df_corr = make_dataset_train(test_df)
    test_df_scaled = make_dataset(test_df)

    pred_dnn = _weighted_fold_sum(dnn_models, test_df_scaled)
    pred_rnn = _weighted_fold_sum(rnn_models, test_df_scaled)
    pred_dnn_corr = _weighted_fold_sum(dnn_corr_models, test_df_corr)

    # Final blend across the three model families.
    sample_prediction_df['target'] = pred_dnn_corr*0.5 + pred_dnn*0.3 + pred_rnn*0.2
    env.predict(sample_prediction_df)
    gc.collect()