# Introduction
This notebook performs EDA and trains a model with TabNet.  
RAPIDS (cuDF) is a GPU DataFrame library for loading, aggregating, filtering, and otherwise manipulating data.  (<a href='https://docs.rapids.ai/api/cudf/stable/'>Ref</a>)  <br>
  
ver.14: cuDF dataframe is used in EDA.

# Package

In [None]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl

from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric

In [None]:
import copy
import gc
import glob
import os
import pickle
import random

import argparse
import cudf
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from tqdm.auto import tqdm

import cudf
print('RAPIDS version',cudf.__version__)

In [None]:
# Paths
# NOTE(review): INPUT_PATH is not referenced by any cell visible in this part of the notebook — confirm it is still needed.
INPUT_PATH= '../input/ubiquant-market-prediction'
TRAIN_PICKLE = '../input/fast-read-data-ubiquant/train_reduced.pkl' # train.csv reduced memory usage

# Training parameters
# DEBUG turns the run into a quick smoke test; its effect is described in the string below.
DEBUG = False 
'''
If DEBUG==True, 
reduced sampling is performed for train data,
and the fold calculation is stopped at the first fold.  
'''
# Run settings gathered in one namespace so they can be passed around as `args`.
args = argparse.Namespace(
    seed = 2022, # used by seed_everything, the holdout split, debug sampling and TabNet
    patience = 20, # forwarded to model.fit as `patience`
    batch_size = 1024*20, # forwarded to model.fit as `batch_size`
    virtual_batch_size = 128*20, # forwarded to model.fit as `virtual_batch_size`
    drop_last = True, # forwarded to model.fit as `drop_last`
    reduced_sampling = None if not DEBUG else 0.10, # fraction of rows kept when DEBUG is on (10%); None otherwise
    max_epochs = 200 if not DEBUG else 5, # shortened to 5 epochs when DEBUG is on
    n_folds = 5, # number of StratifiedKFold splits; with DEBUG only the first fold is trained
    n_steps = 2, # TabNet `n_steps`; equals the number of masks
    n_workers = 2, # forwarded to model.fit as `num_workers`
    n_bins = 16 # number of bins used by cudf.cut on `time_span` for stratification
)

# Random seed
def seed_everything(seed: int) -> None:
    """Seed the random number generators used by this notebook.

    Covers Python's ``random`` module, NumPy's legacy global generator,
    the ``PYTHONHASHSEED`` environment variable and PyTorch.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects hash randomisation of Python processes started after this
    # point (e.g. worker subprocesses); the running interpreter is unchanged.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The notebook trains a PyTorch model, so its generators must be seeded
    # as well; torch.manual_seed seeds the generators on all devices.
    torch.manual_seed(seed)

# Apply the configured seed to the generators handled by seed_everything.
seed_everything(args.seed)

# Matplotlib style
# Start from the 'ggplot' style sheet, then override the grid appearance:
# grid always on, drawn as gray, half-transparent dashed lines.
plt.style.use('ggplot')
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.color'] = 'gray'
plt.rcParams['grid.alpha'] = 0.5
plt.rcParams['grid.linestyle'] = '--'

# Input data

In [None]:
%%time
# Load the training frame from the pickle at TRAIN_PICKLE and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only read pickles from a trusted dataset.
df_train = pd.read_pickle(TRAIN_PICKLE)
display(df_train.head())

# Short EDA

### There are records with "target==0". If you use a loss function like RMSPE, the value becomes inf.

### From the above figures, almost all of the features have mean ≃ 0.0 and std ≃ 1.0, except for some features:
Small std  : f_124, f_170, f_175, f_272  
Small mean : f_170, f_175  
Large mean : f_41, f_182, f_246  

### Investigate these distributions as follows

## Change the range of the histograms to between -10 and 10.

### From the above,
- Most of the features are not normally distributed, and some are multimodal.
- Singular peaks exist in some distributions. In particular, f_124 looks like a set of delta functions.

# Preprocessing

### KFold

In [None]:
def add_fold_column(df):
    """Set aside a holdout split and label the remaining rows with a ``fold``.

    For each ``investment_id`` the spread between its largest and smallest
    ``time_id`` is computed as ``time_span`` and merged back onto every row.
    ``time_span`` is binned into ``args.n_bins`` bins and those bins are the
    stratification labels, first for a ``train_test_split`` holdout (the
    holdout rows are not part of the returned frame) and then for a
    ``StratifiedKFold`` with ``args.n_folds`` splits.

    Args:
        df: frame with at least ``time_id`` and ``investment_id`` columns.
            The code calls ``.to_pandas()`` and ``cudf.cut`` on it, so a
            cuDF DataFrame is presumably expected.

    Returns:
        The non-holdout rows, sorted by ``time_id`` and ``investment_id``
        with a fresh index, carrying the extra ``time_span`` and ``fold``
        columns.
    """
    # First and last time_id per investment; the aggregation produces
    # two-level column labels ('time_id', 'min') / ('time_id', 'max').
    span_per_investment = (
        df.loc[:, ['time_id', 'investment_id']]
        .groupby('investment_id')
        .agg({'time_id': ['min', 'max']})
        .reset_index()
    )
    time_id_range = span_per_investment['time_id']
    span_per_investment['time_span'] = time_id_range['max'] - time_id_range['min']
    display(span_per_investment.head())

    # Flatten the column labels in pandas, keep only investment_id and
    # time_span, then go back to cuDF for the merge onto every row.
    flat_span = span_per_investment.to_pandas().droplevel(level=1, axis=1)
    flat_span = pd.DataFrame(flat_span.drop('time_id', axis=1))
    df = df.merge(cudf.DataFrame.from_pandas(flat_span), on=['investment_id'])

    # Holdout: stratify on the binned time_span and keep only the train part.
    holdout_strata = cudf.cut(df['time_span'], args.n_bins, labels=False)
    kept_part, holdout_part = train_test_split(
        holdout_strata,
        stratify=holdout_strata.values.get(),
        random_state=args.seed,
    )
    print(f'Number of holdout records: {len(holdout_part)}')
    df = df.iloc[kept_part.index]
    df = df.sort_values(by=['time_id', 'investment_id']).reset_index(drop=True)

    # StratifiedKFold: every row starts at -1 and receives the number of the
    # fold in which it belongs to the validation part.
    df["fold"] = -1
    fold_strata = cudf.cut(df['time_span'], args.n_bins, labels=False)
    splitter = StratifiedKFold(n_splits=args.n_folds)
    fold_splits = splitter.split(fold_strata, fold_strata.values.get())
    for fold_no, (_, held_out_rows) in enumerate(fold_splits):
        df.loc[held_out_rows, 'fold'] = fold_no

    return df

In [None]:
# NOTE(review): `cudf_train` is not created in any cell visible here — presumably
# a cuDF version of the training data built in the EDA section; confirm.
# The name is rebound to the function's result, so re-running this cell feeds
# an already-processed frame back into add_fold_column.
cudf_train = add_fold_column(cudf_train)
display(cudf_train.head())

### Create features
ref. https://www.kaggle.com/valleyzw/ubiquant-lgbm-baseline

In [None]:
def create_features(df):
    """Add pairwise-sum features and return the frame with its feature list.

    Each name in the combination list has the form ``'<a>-<b>'``; a column
    with that name is written onto ``df`` holding ``df[a] + df[b]`` (the
    input frame itself gains these columns). A fixed set of raw features is
    then removed from the returned frame.

    Args:
        df: frame containing ``investment_id`` and the source ``f_*`` columns.

    Returns:
        (frame, features): the frame without the dropped columns, and the
        alphabetically sorted list of model input columns (the raw ``f_0`` ..
        ``f_299``, ``investment_id`` and the combination columns, minus the
        dropped ones).
    """
    dropped = ['f_148', 'f_72', 'f_49', 'f_205', 'f_228', 'f_97', 'f_262', 'f_258']
    pair_sums = ['f_231-f_250', 'f_118-f_280', 'f_155-f_297','f_25-f_237',
                 'f_179-f_265', 'f_119-f_270', 'f_71-f_197', 'f_21-f_65']

    # The combination name doubles as the recipe: split on '-' to get operands.
    for combo in pair_sums:
        left, right = combo.split('-')
        df[combo] = df[left] + df[right]

    candidates = [f'f_{i}' for i in range(300)] + ['investment_id'] + pair_sums
    features = sorted(set(candidates) - set(dropped))

    return df.drop(dropped, axis=1), features

In [None]:
# Add the combination features and obtain the list of model input columns.
cudf_train, features = create_features(cudf_train)
# Neither time_id nor time_span appears in `features`, so drop them from the frame.
cudf_train = cudf_train.drop(['time_id', 'time_span'], axis=1)
# Show how many input columns there are, their names, and a preview of the frame.
print('len(features):', len(features))
print(features)
display(cudf_train.head())

# Training

In [None]:
# Tabnet parameters
# Keyword arguments for TabNetRegressor, shared by every fold.
tabnet_params = dict(
    # Positions of the categorical columns within the model input matrix.
    # The training matrix is built as df.loc[..., features], so the positions
    # must be looked up in `features`, not in the raw frame's column order.
    # NOTE(review): no `cat_dims` is supplied — confirm whether the installed
    # pytorch-tabnet version builds embeddings from `cat_idxs` alone.
    cat_idxs = [i for i, f in enumerate(features) if f in ['investment_id']],
    cat_emb_dim = 1,
    n_d = 16,
    n_a = 16,
    n_steps = args.n_steps,
    gamma = 2,
    n_independent = 2,
    n_shared = 2,
    lambda_sparse = 0,
    optimizer_fn = Adam,
    optimizer_params = dict(lr = (2e-2)),
    mask_type = 'entmax',
    # Keyword arguments handed to `scheduler_fn` below.
    scheduler_params = dict(T_0=200, T_mult=1, eta_min=1e-4,
                            last_epoch=-1, verbose=False),
    scheduler_fn = CosineAnnealingWarmRestarts,
    seed = args.seed,
    verbose = 10
)

# Metric
class PearsonCorrelation(Metric):
    """Pearson correlation coefficient, used as a TabNet evaluation metric.

    ``_name`` is the label of the metric and ``_maximize`` marks it as a
    score where larger values are better.
    """

    def __init__(self):
        self._name = 'pearson_corr'
        self._maximize = True

    def __call__(self, x, y):
        """Return the Pearson correlation between ``x`` and ``y``.

        Args:
            x: array-like of values; flattened to one dimension.
            y: array-like of values with the same number of elements as ``x``.

        Returns:
            The correlation as a Python float, or ``0.0`` when either input
            has zero variance (the coefficient is undefined there and would
            otherwise come out as NaN).
        """
        # Work in float64 so sums over many low-precision values stay accurate.
        x = np.asarray(x, dtype=np.float64).ravel()
        y = np.asarray(y, dtype=np.float64).ravel()
        x_diff = x - x.mean()
        y_diff = y - y.mean()
        # Vectorised sums of squares; the built-in sum() would loop over the
        # array element by element in Python on every evaluation.
        denominator = np.sqrt(np.dot(x_diff, x_diff) * np.dot(y_diff, y_diff))
        if denominator == 0:
            return 0.0
        return float(np.dot(x_diff, y_diff) / denominator)

# Train run
def train(df_train, features, args, tabnet_params, debug=DEBUG):
    """Train one TabNetRegressor per fold and gather out-of-fold results.

    Args:
        df_train: frame holding ``row_id``, ``target``, ``fold`` and every
            column named in ``features``. The code calls ``.values.get()``
            and ``.to_pandas()`` on it, so a cuDF DataFrame is presumably
            expected.
        features: column names used as model inputs, in input order.
        args: namespace providing ``seed``, ``reduced_sampling``, ``n_folds``,
            ``max_epochs``, ``patience``, ``batch_size``,
            ``virtual_batch_size``, ``n_workers`` and ``drop_last``.
        tabnet_params: keyword arguments forwarded to ``TabNetRegressor``.
        debug: when true, subsample the rows (if ``args.reduced_sampling`` is
            set) and stop after the first fold.

    Returns:
        dict with the keys ``histories`` (``model.history`` per fold),
        ``feature_importances`` (DataFrame with one importance column per
        fold) and ``oof_predictions`` (DataFrame with ``row_id`` and the
        validation prediction ``pred``).

    Side effects:
        Each fold's model is saved via ``model.save_model`` under
        ``./tabnet_fold{fold}``.
    """
    if debug:
        print('Run as DEBUG')
        # args.reduced_sampling is None unless the notebook-level DEBUG flag
        # was on when `args` was built; subsample only when a fraction exists.
        if args.reduced_sampling is not None:
            n_samples = len(df_train)
            df_train = (df_train
                        .sample(int(n_samples*args.reduced_sampling), random_state=args.seed)
                        .reset_index(drop=True))
        print('len(df_train):', len(df_train))
        _ = gc.collect()

    ###### Outputs ######
    histories = {}
    oof_predictions = pd.DataFrame({'row_id': df_train['row_id'].to_pandas()}) # predictions for validation data
    feature_importances = pd.DataFrame({'features': features})
    ####################

    for fold in tqdm(range(args.n_folds)):

        print(f"\n{'>'*15} Fold {fold} {'<'*15}")

        # Prepare train and valid data: rows labelled with the current fold
        # are validation, all others are training. `.values.get()` copies the
        # selected values to host-side arrays for TabNet.
        train_idx = df_train['fold']!=fold
        valid_idx = df_train['fold']==fold
        X_train = df_train.loc[train_idx, features].values.get()
        X_valid = df_train.loc[valid_idx, features].values.get()
        y_train = df_train.loc[train_idx,'target'].values.get().reshape(-1,1)
        y_valid = df_train.loc[valid_idx,'target'].values.get().reshape(-1,1)
        print(f'len(X_train): {len(X_train)}, len(X_valid): {len(X_valid)},')

        # Model initialize
        model = TabNetRegressor(**tabnet_params)

        # Train
        model.fit(
            X_train, y_train,
            eval_set = [(X_valid, y_valid)],
            max_epochs = args.max_epochs,
            patience = args.patience,
            batch_size = args.batch_size,
            virtual_batch_size = args.virtual_batch_size,
            num_workers = args.n_workers,
            drop_last = args.drop_last,
            eval_metric = [PearsonCorrelation],
            loss_fn = torch.nn.MSELoss() # torch.nn.L1Loss()
        )

        # Save model
        model.save_model(f'./tabnet_fold{fold}')
        # Scores
        histories[f'fold{fold}'] = model.history
        # Predict for validation data (stored as float16 to save memory)
        oof_predictions.loc[valid_idx.values.get(), 'pred'] = model.predict(X_valid).astype('float16')
        # Feature importances
        feature_importances[f'importance_fold{fold}'] = model.feature_importances_
        # Explain matrices and masks are not collected; model.explain(X_valid)
        # could be called here if they are needed.

        # Release the fold's model and arrays before the next fold starts.
        del model, X_train, X_valid, y_train, y_valid
        torch.cuda.empty_cache()
        _ = gc.collect()

        if debug:
            print('Debug: stop calculation at first fold')
            break

    outputs = dict(
        histories = histories,
        feature_importances = feature_importances,
        oof_predictions = oof_predictions,
    )

    return outputs

In [None]:
# Run the per-fold training and list the type of each returned output.
outputs = train(cudf_train, features, args, tabnet_params)
print('outputs:')
for k, v in outputs.items():
    print(f'  {k}: {type(v)}')

# Evaluation

## Score

## Feature importance (Top50)

## Mask

## Explain matrix

## True-Prediction correlation

# Submission

In [None]:
# def predict(models, df_test, features):
#     # df_test['time_id'] = df_test.row_id.str.extract(r'(\d+)_.*').astype(np.uint16) # extract time_id from row_id # remove.ver2
#     preds = []
#     for model in models:
#         pred = model.predict(df_test[features].values)
#         preds.append(pred)
        
#     mean_pred_by_folds = np.mean(np.stack(preds), axis=0)
#     return mean_pred_by_folds

# def load_trained_models():
#     model_paths = glob.glob('./tabnet_fold*.zip')
#     model =  TabNetRegressor(**tabnet_params)
#     models = []
#     for model_path in model_paths:    
#         model.load_model(model_path)
#         model_ = copy.deepcopy(model)
#         models.append(model_)
    
#     return models

In [None]:
# # Load model
# models = load_trained_models()

# # Make submission
# import ubiquant
# # env = ubiquant.make_env()
# # iter_test = env.iter_test()
# for (df_test, df_submission) in iter_test:
#     df_test, features = create_features(df_test)
#     df_submission['target'] = predict(models, df_test, features)
#     env.predict(df_submission) 

In [None]:
# Restore a saved fold model for inference.
# NOTE(review): fold 1 is hard-coded here; nothing visible in this notebook
# selects it as the best fold — confirm the choice.
best_model =  TabNetRegressor()
best_model.load_model('./tabnet_fold1.zip')

In [None]:
# Inference loop over the competition's test iterator.
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()
for (df_test, df_submission) in iter_test:
    # Build the same engineered columns as in training; note that this
    # rebinds the notebook-level `features` name on every iteration.
    df_test, features = create_features(df_test)
    pred = best_model.predict(df_test[features].values)
    df_submission['target']=pred
    # Hand the filled submission frame back to the environment.
    env.predict(df_submission) 

In [None]:
# Read ./submission.csv back for inspection.
# NOTE(review): presumably this file is written by the submission loop above — confirm.
sub=pd.read_csv('./submission.csv')

In [None]:
# Column dtypes and non-null counts of the submission frame.
sub.info()

In [None]:
# Display the submission frame as the cell output.
sub

## Please upvote if this notebook was useful for you