In [None]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.matlib

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

from scipy import stats
from scipy.stats import norm
from joblib import Parallel, delayed

import shutil
import glob
import gc 
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from scipy.stats import pearsonr as p
import psutil
import warnings
warnings.filterwarnings("ignore")

In [None]:
%%time
# Load the pickled training frame and keep only the rows whose time_id is
# strictly greater than 1000.
train = (
    pd.read_pickle('../input/ump-train-picklefile/train.pkl')
    .loc[lambda frame: frame["time_id"] > 1000]
)

In [None]:
# Read the example test CSV into `test`.
example_test_csv = '../input/ubiquant-market-prediction/example_test.csv'
test = pd.read_csv(example_test_csv)

In [None]:
# Feature matrix: every training column except the identifiers and the label.
X = train.drop(columns=['row_id', 'target', 'time_id'])
# Regression label.
y = train['target']
# Example-test features: a new frame without the identifier columns
# (`test` itself is left unmodified).
X_test = test.drop(columns=['time_id', 'row_id'])

In [None]:
# Per-column summaries of the training features. They are not referenced by the
# rest of this cell; kept so the names remain available for inspection.
nunique = X.nunique()
types = X.dtypes

# Name of the single categorical feature. This is deliberately a plain string
# (not a list): it is used directly as a column key, e.g. X[categorical_columns].
categorical_columns = 'investment_id'
# Maps categorical column name -> number of distinct encoded classes.
categorical_dims = {}

# Encode investment_id as consecutive integer codes 0..n_classes-1.
# The encoder is fitted on the training frame only and then reused for X_test.
l_enc = LabelEncoder()
X[categorical_columns] = l_enc.fit_transform(X[categorical_columns].values)
X_test[categorical_columns] = l_enc.transform(X_test[categorical_columns].values)
categorical_dims[categorical_columns] = len(l_enc.classes_)

# Column positions and cardinalities of the categorical feature(s), in the
# order the columns appear in X.
# Compare with `==`: because `categorical_columns` is a string, `name in
# categorical_columns` would be a *substring* test and could match unrelated
# columns whose names happen to be contained in 'investment_id'.
feature_names = X.columns.tolist()
cat_idxs = [idx for idx, name in enumerate(feature_names) if name == categorical_columns]

cat_dims = [categorical_dims[name] for name in feature_names if name == categorical_columns]

In [None]:
# Keyword arguments unpacked into TabNetRegressor(**tabnet_params) for every fold.
tabnet_params = {
    # Categorical feature positions / cardinalities computed in the encoding cell.
    "cat_idxs": cat_idxs,
    "cat_dims": cat_dims,
    "cat_emb_dim": 1,
    # Network size settings.
    "n_d": 16,
    "n_a": 16,
    "n_steps": 2,
    "gamma": 2,
    "n_independent": 2,
    "n_shared": 2,
    "lambda_sparse": 0,
    # Optimiser and its learning rate.
    "optimizer_fn": Adam,
    "optimizer_params": {"lr": 2e-2},
    "mask_type": "entmax",
    # Learning-rate schedule: cosine annealing with warm restarts.
    "scheduler_params": {
        "T_0": 200,
        "T_mult": 1,
        "eta_min": 1e-4,
        "last_epoch": -1,
        "verbose": False,
    },
    "scheduler_fn": CosineAnnealingWarmRestarts,
    "seed": 42,
    "verbose": 10,
}

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between two sets of values.

    Note: this is the plain RMSE, ``sqrt(mean((y_true - y_pred) ** 2))``; no
    percentage / relative scaling is applied.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values. Anything ``np.asarray`` accepts
        (lists, tuples, NumPy arrays, ...) works; the two inputs must be
        broadcastable against each other.

    Returns
    -------
    numpy.float64
        The mean is taken over all elements of the squared difference.
    """
    # Coerce to float arrays so plain Python sequences are supported as well
    # (list - list is not defined).
    residual = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(np.square(residual)))

class RMSE(Metric):
    """Root-mean-squared error wrapped as a ``Metric`` subclass.

    The class is passed to ``fit`` through ``eval_metric`` in the training cell.
    """

    def __init__(self):
        # `_maximize = False`: presumably tells the library that lower values
        # are better -- confirm against the pytorch_tabnet Metric docs.
        self._maximize = False
        self._name = "rmse"

    def __call__(self, y_true, y_score):
        """Return sqrt(mean((y_true - y_score) ** 2)) over all elements."""
        residual = y_true - y_score
        mean_squared = np.mean(np.square(residual))
        return np.sqrt(mean_squared)
    


def RMSELoss(y_pred, y_true):
    """Root-mean-squared-error loss on torch tensors.

    Note the argument order: predictions first, targets second.
    Returns a scalar tensor (a clone of the computed value).
    """
    residual = y_true - y_pred
    mean_squared = torch.mean(residual ** 2)
    loss = torch.sqrt(mean_squared)
    return loss.clone()

In [None]:
# 5-fold shuffled cross-validation with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions (one column) and a zero placeholder sized like X_test.
oof_predictions = np.zeros((X.shape[0], 1))
test_predictions = np.zeros(X_test.shape[0])

# One row per feature; a per-fold importance column is added inside the loop.
feature_importances = pd.DataFrame({"feature": X.columns.tolist()})

# NOTE: `stats` rebinds the name imported by `from scipy import stats`.
# These three containers are not filled anywhere in this cell.
stats = pd.DataFrame()
explain_matrices = []
masks_ = []

# Fitted models, one per fold, kept for inference.
models = []

for fold, (trn_ind, val_ind) in enumerate(kfold.split(X)):
    print(f'Training fold {fold + 1}')

    # Slice this fold's rows and convert to NumPy; targets are reshaped to (n, 1).
    X_train = X.iloc[trn_ind].values
    X_val = X.iloc[val_ind].values
    y_train = y.iloc[trn_ind].values.reshape(-1, 1)
    y_val = y.iloc[val_ind].values.reshape(-1, 1)

    clf = TabNetRegressor(**tabnet_params)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=[RMSE],
        loss_fn=RMSELoss,
        max_epochs=100,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128,
        num_workers=4,
        drop_last=False,
    )

    # Persist the fitted model for this fold.
    saving_path_name = f"./tabnet_fold{fold}"
    saved_filepath = clf.save_model(saving_path_name)

    # Store validation predictions and report their Pearson correlation with the target.
    oof_predictions[val_ind] = clf.predict(X_val)
    fold_score = p(y_val.squeeze(-1), oof_predictions[val_ind].squeeze(-1))[0]
    print('>'*20, f'fold_{fold} score: {fold_score}', '<'*20)

    models.append(clf)

    feature_importances[f"importance_fold{fold}+1"] = clf.feature_importances_

    # Drop the loop-local reference (the model stays alive in `models`) and
    # release cached memory before the next fold.
    del clf
    torch.cuda.empty_cache()
    gc.collect()


# Pearson correlation over all out-of-fold predictions.
oof_score = p(y, oof_predictions.flatten())[0]
print(f'OOF score across folds: {oof_score}')

In [None]:
# Average the importances over *every* per-fold column produced by the training
# loop, instead of a hard-coded subset of folds. The per-fold columns are the
# ones whose name starts with "importance_fold"; 'feature' and
# 'mean_importance' do not match, so re-running this cell gives the same result.
fold_importance_cols = [
    col for col in feature_importances.columns
    if str(col).startswith("importance_fold")
]
feature_importances['mean_importance'] = feature_importances[fold_importance_cols].mean(axis=1)
feature_importances.sort_values(by='mean_importance', ascending=False, inplace=True)

# Plot the 25 features with the highest mean importance.
top_features = feature_importances.head(25)
sns.barplot(y=top_features['feature'], x=top_features['mean_importance'], palette='inferno')
plt.title('Mean Feature Importance by Folds')
plt.show()

In [None]:
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()

# Lookup from raw investment_id to the integer code assigned by the fitted
# encoder (a label's code is its position in `classes_`). A dict lookup is used
# instead of `l_enc.transform`, which raises ValueError as soon as a batch
# contains an investment_id that was not present when the encoder was fitted.
investment_id_to_code = {raw_id: code for code, raw_id in enumerate(l_enc.classes_)}
# Fallback code for ids never seen in training. Any value in
# [0, len(classes_)) is a valid embedding index; 0 is an arbitrary choice.
UNSEEN_INVESTMENT_CODE = 0

for (test_df, sample_prediction_df) in iter_test:
    # Keep only model features: the training matrix excluded both identifier
    # columns, so drop `time_id` too whenever the batch carries it.
    X_test = test_df.drop(columns=["row_id", "time_id"], errors="ignore")
    X_test[categorical_columns] = (
        X_test[categorical_columns]
        .map(investment_id_to_code)
        .fillna(UNSEEN_INVESTMENT_CODE)
        .astype("int64")
    )
    # Average the predictions of all fold models.
    y_preds = [model.predict(X_test.values).squeeze(-1) for model in models]
    sample_prediction_df["target"] = sum(y_preds) / len(y_preds)
    # Never submit NaN: replace any missing prediction with 0.0.
    sample_prediction_df["target"] = sample_prediction_df["target"].fillna(0.0)
    env.predict(sample_prediction_df)

In [None]:
# Sanity check: preview the first rows of the last prediction frame submitted
# by the inference loop rather than rendering the whole frame.
sample_prediction_df.head()