In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the data and output
# paths used by this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!mkdir /content/drive/MyDrive/sarwar/model3

In [None]:
import os
import numpy as np
import pandas as pd 
from tqdm import tqdm_notebook
from sklearn.model_selection import GroupKFold,KFold,StratifiedKFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
import gc
import pickle
from IPython.display import FileLink
import random
import joblib
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Activation,BatchNormalization
from tensorflow.keras import regularizers
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import backend as K
# Notebook display limits: show up to 500 columns and 100 rows of a DataFrame.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 100

In [None]:
# Use one seed for Python's `random`, NumPy and TensorFlow so runs are repeatable.
seed = 2022
# NOTE(review): exporting PYTHONHASHSEED from inside a running interpreter
# presumably only affects child processes - confirm whether it is needed here.
os.environ['PYTHONHASHSEED'] = str(seed)
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [None]:
%%time
# Load the pickled training frame from the project folder on the mounted Drive.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that were produced by a trusted source.
data_path="/content/drive/MyDrive/sarwar/"
train=pd.read_pickle(os.path.join(data_path,"train.pickle"))

In [None]:
# Preview the first 10 rows of the training frame.
train.head(10)

In [None]:
# Report (rows, columns) of the training frame as a quick sanity check.
print("Train shape:",train.shape)

In [None]:
# Names of the model's input columns: 'f_0', 'f_1', ..., 'f_<n_features - 1>'.
n_features = 300
features = ['f_' + str(idx) for idx in range(n_features)]

In [None]:
# metric
def pearson_correlation(df):
    """Return the mean per-``time_id`` Pearson correlation of target vs. prediction.

    Args:
        df: DataFrame with the columns ``time_id``, ``target`` and ``prediction``.

    Returns:
        The average, over all ``time_id`` groups, of the correlation between
        ``target`` and ``prediction`` within the group (NaN groups are skipped
        by ``mean``).
    """
    # One 2x2 correlation matrix per time_id; the index is (time_id, column name).
    corr_matrices = df.groupby('time_id')[['target', 'prediction']].corr()
    # Pick the target-vs-prediction entry by label instead of by position, so
    # the result cannot silently become the target self-correlation (always
    # 1.0) if the column order after reshaping ever differs.
    corr_by_time = corr_matrices.xs('target', level=1)['prediction']
    return corr_by_time.mean()

In [None]:
# Keep the investment_id column as its own Series; it is used to adapt the
# lookup layer and is sliced per fold when the datasets are built.
investment_id=train['investment_id']

In [None]:
# Training settings shared by the dataset builder, the model and the CV loop.
EPOCHS, BATCH_SIZE = 20, 1024  # epochs per fold / rows per batch
LR = 0.01                      # learning rate passed to the Adam optimizer
VERBOSE = 2                    # verbosity passed to Keras fit and ModelCheckpoint

In [None]:
%%time
# Build the integer lookup that maps raw investment ids to contiguous indices
# for the embedding layer in the model.
investment_ids = list(investment_id.unique())
# One extra slot on top of the distinct ids - presumably for the layer's
# out-of-vocabulary index; confirm against the IntegerLookup documentation.
investment_id_size = len(investment_ids) + 1
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
# Learn the vocabulary from the full id column; the adapt step is pinned to the CPU.
with tf.device("cpu"):
    investment_id_lookup_layer.adapt(investment_id)

In [None]:
def preprocess(X, y):
    """Identity hook mapped over every dataset element; returns ``(X, y)`` unchanged."""
    pair = (X, y)
    return pair
def make_dataset(feature, investment_id, y, batch_size=BATCH_SIZE, mode="train", shuffle_buffer=256):
    """Build a batched, prefetched ``tf.data.Dataset`` of ``((investment_id, feature), y)``.

    Args:
        feature: Per-row feature values (first axis indexes rows).
        investment_id: Per-row investment ids, aligned with ``feature``.
        y: Per-row targets, aligned with ``feature``.
        batch_size: Number of rows per batch.
        mode: ``"train"`` enables shuffling; any other value keeps the input
            order, which callers rely on to align predictions with rows.
        shuffle_buffer: Size of the shuffle buffer used in training mode.

    Returns:
        A dataset yielding ``((investment_id_batch, feature_batch), y_batch)``.
    """
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    ds = ds.map(preprocess)
    if mode == "train":
        # Cache BEFORE shuffling: a cache placed after shuffle stores the first
        # epoch's order and replays it, so later epochs are never reshuffled.
        ds = ds.cache().shuffle(shuffle_buffer).batch(batch_size)
    else:
        # No shuffling here, so caching whole batches is safe and cheaper.
        ds = ds.batch(batch_size).cache()
    return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
def model3(n_features=300):
    """Build and compile the two-input regression network.

    One branch embeds the investment id (via the notebook-level
    ``investment_id_lookup_layer``), the other processes the ``n_features``
    numeric inputs; both are concatenated and passed through a regularised
    dense head ending in a single output unit.

    Args:
        n_features: Width of the numeric feature input.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[investment_id, features]`` and
        trained with MSE loss (metrics: mse, mae, mape, rmse).
    """
    drop_rate = 0.5

    id_in = tf.keras.Input((1, ), dtype=tf.uint16)
    feat_in = tf.keras.Input((n_features, ), dtype=tf.float32)

    # Investment-id branch: lookup -> 32-d embedding -> flatten -> Dense/Dropout x2.
    id_branch = investment_id_lookup_layer(id_in)
    id_branch = layers.Embedding(investment_id_size, 32, input_length=1)(id_branch)
    id_branch = layers.Reshape((-1, ))(id_branch)
    for units in (64, 32):
        id_branch = layers.Dense(units, activation='swish')(id_branch)
        id_branch = layers.Dropout(drop_rate)(id_branch)

    # Feature branch: Dense/Dropout x2 followed by a final Dense without dropout.
    feat_branch = feat_in
    for units in (256, 128):
        feat_branch = layers.Dense(units, activation='swish')(feat_branch)
        feat_branch = layers.Dropout(drop_rate)(feat_branch)
    feat_branch = layers.Dense(64, activation='swish')(feat_branch)

    # Head: merge both branches, then L2-regularised Dense/Dropout blocks.
    merged = layers.Concatenate(axis=1)([id_branch, feat_branch])
    merged = layers.Dropout(drop_rate)(merged)
    for units in (64, 32, 16):
        merged = layers.Dense(units, activation='swish', kernel_regularizer="l2")(merged)
        merged = layers.Dropout(drop_rate)(merged)
    prediction = layers.Dense(1)(merged)
    # NOTE(review): batch normalisation is applied to the single regression
    # output itself - unusual for a regression head, confirm this is intended.
    prediction = tf.keras.layers.BatchNormalization(axis=1)(prediction)

    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    network = tf.keras.Model(inputs=[id_in, feat_in], outputs=[prediction])
    network.compile(optimizer=tf.optimizers.Adam(LR), loss='mse', metrics=['mse', "mae", "mape", rmse])
    return network

In [None]:
def train_and_evaluate(nfolds=5, output_dir="/content/drive/MyDrive/sarwar/model3"):
    """Train ``model3`` with K-fold cross-validation and score the out-of-fold predictions.

    Works on the notebook-level ``train``, ``features`` and ``investment_id``
    objects and the ``EPOCHS`` / ``BATCH_SIZE`` / ``VERBOSE`` settings. For each
    fold a fresh model is fitted, the weights with the lowest validation loss
    are checkpointed and reloaded, and the predictions for the held-out rows
    are stored in the out-of-fold array.

    Args:
        nfolds: Number of (unshuffled) ``KFold`` splits.
        output_dir: Folder that receives the per-fold checkpoints
            (``fold_<k>/cp.ckpt``) and ``model_3_pred.csv``. The default is the
            location that was previously hard-coded.

    Returns:
        DataFrame with the columns ``time_id``, ``target`` and ``prediction``
        (out-of-fold predictions for every training row).
    """
    # Create the output folder here so the function does not depend on a
    # separate shell cell having been run first.
    os.makedirs(output_dir, exist_ok=True)
    oof_predictions = np.zeros(len(train))
    kfold = KFold(n_splits=nfolds, shuffle=False)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train['target'].iloc[trn_ind], train['target'].iloc[val_ind]
        investment_id_train = investment_id.iloc[trn_ind]
        investment_id_val = investment_id.iloc[val_ind]
        ds_train = make_dataset(x_train, investment_id_train.values, y_train.values, batch_size=BATCH_SIZE, mode="train")
        # mode="test" keeps row order, so predictions line up with val_ind.
        ds_val = make_dataset(x_val, investment_id_val.values, y_val.values, batch_size=BATCH_SIZE, mode="test")
        # Reset keras session
        K.clear_session()
        n_training_rows = x_train.shape[0]
        n_validation_rows = x_val.shape[0]
        print('Building model...')
        model = model3(len(features))
        print(f'Training with {n_training_rows} rows')
        print(f'Validating with {n_validation_rows} rows')
        print(f'Training model with {len(features)+1} features...')
        # Keep only the weights of the epoch with the lowest validation loss.
        checkpoint_path = os.path.join(output_dir, f"fold_{fold+1}", "cp.ckpt")
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path,
            monitor='val_loss',
            verbose=VERBOSE,
            save_best_only=True,
            save_weights_only=True,
            mode='min',
            save_freq='epoch'
        )
        # Train and evaluate
        model.fit(
            ds_train,
            epochs=EPOCHS,
            verbose=VERBOSE,
            callbacks=[checkpoint],
            validation_data=ds_val,
        )
        # Restore the best checkpoint before predicting.
        model.load_weights(checkpoint_path)
        # Predict validation set
        val_pred = model.predict(ds_val).reshape(-1)
        # Add validation prediction to out of folds array
        oof_predictions[val_ind] = val_pred
        # Free the per-fold copies before the next fold is materialised.
        del x_train, x_val, y_train, y_val, model, ds_train, ds_val
        gc.collect()
    # Compute out of folds Pearson Correlation Coefficient (for each time_id)
    oof_df = pd.DataFrame({'time_id': train['time_id'], 'target': train['target'], 'prediction': oof_predictions})
    # Save out of folds csv for blending
    oof_df.to_csv(os.path.join(output_dir, 'model_3_pred.csv'), index=False)
    score = pearson_correlation(oof_df)
    print(f'Our out of folds mean pearson correlation coefficient is {score}')
    return oof_df
    
# Run 10-fold cross-validation and keep the out-of-fold prediction frame.
oof_df=train_and_evaluate(10)