In [None]:
# Import Libraries
import gc
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, GRU, BatchNormalization, Reshape
from tensorflow.keras.metrics import MeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers.schedules import ExponentialDecay

import warnings
warnings.simplefilter('ignore')

In [None]:
# Constants & Path
# Execute tf.functions eagerly (op by op) instead of as traced graphs.
# NOTE(review): eager mode is usually much slower for training -- confirm it is still required.
tf.config.run_functions_eagerly(True)
AUTOTUNE = tf.data.experimental.AUTOTUNE  # lets tf.data choose the prefetch buffer size

# Training hyper-parameters
epochs = 10
batch_size = 4096

# Location of the training parquet file
train_path = "../input/ubiquant-parquet/train_low_mem.parquet"

In [None]:
# Read Data
df = pd.read_parquet(train_path)
# Drop the identifier columns *before* the cast: they are not model inputs, and
# converting them to float16 is wasted work at best (float16 cannot hold large ids
# exactly, and a non-numeric id column may fail to convert at all).
# NOTE(review): assumes every remaining column is numeric -- confirm against the parquet schema.
df = df.drop(columns = ["row_id", "investment_id"])
# float16 halves the memory footprint of the frame compared with float32.
df = df.astype("float16")
df

In [None]:
# Names of the 300 model input columns: "f_0" ... "f_299".
features = [f"f_{idx}" for idx in range(300)]

In [None]:
# Create Tensorflow dataset
def train_dataset(data):
    """Build a batched, prefetched tf.data pipeline of (features, target) pairs.

    Args:
        data: frame containing every column listed in ``features`` plus a
            ``"target"`` column.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, labels)`` batches of
        ``batch_size`` rows, in the row order of ``data`` (no shuffling).
    """
    inputs = data[features]
    labels = data["target"]
    return (
        tf.data.Dataset
        .from_tensor_slices((inputs, labels))
        .batch(batch_size)
        .prefetch(AUTOTUNE)
    )

def test_dataset(data):
    """Build a batched, prefetched tf.data pipeline of feature rows only.

    Args:
        data: frame containing every column listed in ``features``.

    Returns:
        A ``tf.data.Dataset`` yielding input batches of ``batch_size`` rows
        (no labels), in the row order of ``data``.
    """
    inputs = data[features]
    return (
        tf.data.Dataset
        .from_tensor_slices(inputs)
        .batch(batch_size)
        .prefetch(AUTOTUNE)
    )

In [None]:
# Initialize model
def create_model():
    """Build and compile the Dense -> GRU -> Dense regression network.

    Architecture:
        * 300 input features
        * three Dense(256, gelu) layers followed by BatchNormalization
        * reshape to a length-1 sequence and pass through three stacked GRU(128)
          layers (with input and recurrent dropout)
        * three Dense(64, gelu) layers and a single linear output unit

    Returns:
        A compiled ``tf.keras.Model`` (Adam, lr=0.001, MSE loss and metric).

    Note:
        The reshape previously took ``inp`` instead of ``x``, which left the
        first Dense/BatchNormalization stack disconnected from the output.
        Weight files saved from that older graph do not match this one.
    """
    inp = Input(shape = (300, ), name = "input_layer")

    # Dense feature extractor
    x = Dense(256, activation = "gelu")(inp)
    x = Dense(256, activation = "gelu")(x)
    x = Dense(256, activation = "gelu")(x)
    x = BatchNormalization()(x)

    # GRU layers expect (batch, timesteps, features): treat each row as a
    # one-step sequence. Reshape `x` (not `inp`) so the layers above are used.
    x = Reshape((1, -1))(x)
    x = GRU(128, recurrent_dropout = 0.2, dropout = 0.2, return_sequences = True)(x)
    x = GRU(128, recurrent_dropout = 0.1, dropout = 0.1, return_sequences = True)(x)
    # Last GRU returns only its final state, giving a flat (batch, 128) tensor.
    x = GRU(128, recurrent_dropout = 0.1, dropout = 0.1, return_sequences = False)(x)

    # Regression head
    x = Dense(64, activation = "gelu")(x)
    x = Dense(64, activation = "gelu")(x)
    x = Dense(64, activation = "gelu")(x)
    out = Dense(1, name = "output_layer")(x)

    model = Model(inp, out)
    model.compile(
        optimizer = Adam(learning_rate = 0.001),
        loss = "mse",
        metrics = ["mse"]
    )

    return model

In [None]:
# Assign every row to one of 5 validation folds, stratified on time_id.
df["fold"] = -1
kf = StratifiedKFold(n_splits = 5)
fold_col = df.columns.get_loc("fold")

# Only the stratification labels are needed to generate the splits, so a zeros
# placeholder stands in for X instead of copying the whole feature matrix.
for f, (_, v_) in enumerate(kf.split(np.zeros(len(df)), df["time_id"])):
    # split() yields *positional* row indices, so write through iloc; loc would
    # only be correct while the index happens to be the default 0..n-1 range.
    df.iloc[v_, fold_col] = f

In [None]:
# Stop training once val_loss has not improved for 3 consecutive epochs and roll
# the model back to its best weights. This only has an effect when fit() reports
# val_loss, i.e. when it is given validation data.
early_stopping = EarlyStopping(
    monitor = "val_loss",
    mode = "min",
    patience = 3,
    restore_best_weights = True,
)

In [None]:
# 5-fold cross-validated training: for each fold, train on the other four folds,
# validate on the held-out fold, save the best weights and report the Pearson
# correlation between predictions and targets on the held-out fold.
score = []
print("Training Started.....")

for i in range(5):
    train = train_dataset(df[df["fold"] != i])

    temp = df[df["fold"] == i]
    # Held-out fold with labels: without validation data Keras never reports
    # val_loss, so the val_loss-based early stopping would silently do nothing.
    valid = train_dataset(temp)
    # Same rows, features only, for predict().
    test = test_dataset(temp)

    # Track val_loss (like early stopping does) so the weights written to disk
    # are the ones that generalise best, not the ones that fit the training data best.
    checkpoint = ModelCheckpoint(
        filepath = "RNN_model_{}.hdf5".format(i),
        save_weights_only = True,
        monitor = "val_loss",
        mode = "min",
        save_best_only = True
    )

    model = create_model()
    model.fit(train,
              validation_data = valid,
              epochs = epochs,
              callbacks = [early_stopping, checkpoint]
             )

    pred = model.predict(test)
    # Compute the correlation in float64: float16 sums over a whole fold can
    # lose precision or overflow.
    pred = pred.astype("float64").flatten()
    target = temp["target"].to_numpy(dtype = "float64")
    corr, _ = pearsonr(pred, target)
    score.append(corr)
    print("\n***** Pearson Correlation is {} *****\n".format(corr))

    # Release the per-fold objects before building the next model.
    del model, train, valid, test, temp, target
    gc.collect()

print("***** Average Pearson Correlation is {} *****".format(np.mean(score)))

In [None]:
# For Submission
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()

# Build the five fold models and load their weights ONCE, up front. Recreating
# them for every test batch repeats identical work on each iteration.
fold_models = []
for i in range(5):
    model = create_model()
    model.load_weights("RNN_model_{}.hdf5".format(i))
    fold_models.append(model)

for (test_df, sample_prediction_df) in iter_test:
    test_df = test_df[features]
    # One (n_rows, 1) prediction array per fold model.
    final_pred = [fold_model.predict(test_df) for fold_model in fold_models]
    # Average across folds, then flatten to 1-D so it fits a single column.
    sample_prediction_df['target'] = np.mean(np.stack(final_pred), axis=0).flatten()
    env.predict(sample_prediction_df)