In [None]:

import gc
import importlib
import io
import math
import pickle

import category_encoders
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import opendatasets as od
import pandas as pd
import scipy
import seaborn as sns
import sklearn as skl
import torch
import torchmetrics
import yaml
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

import utils.mlp as mlp
import utils.mlp_pipeline as mlp_pipeline
import utils.embedding_pipeline as embedding_pipeine  # original (misspelled) alias, kept for compatibility
import utils.embedding_pipeline as embedding_pipeline  # spelling used by every later cell

from bank_account_fraud.notebooks.random_search import RandomValueTrial, suggest_callable_hyperparams  # from repository https://github.com/feedzai/bank-account-fraud.git

# Data Loading

In [None]:


class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that loads pickled torch storages onto the CPU.

    Only the lookup of ``torch.storage._load_from_bytes`` is intercepted; every
    other global is resolved by ``pickle.Unpickler.find_class``.
    """

    def find_class(self, module, name):
        """Resolve ``module.name``, substituting a CPU-mapped loader for torch storages."""
        wants_storage_loader = (module, name) == ('torch.storage', '_load_from_bytes')
        if not wants_storage_loader:
            return super().find_class(module, name)

        def load_on_cpu(b):
            # map_location='cpu' lets a GPU-saved object be read without CUDA.
            return torch.load(io.BytesIO(b), map_location='cpu')

        return load_on_cpu

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")


Using cuda device


In [None]:
# Download the Bank Account Fraud (NeurIPS 2022) dataset from Kaggle.
# opendatasets asks for Kaggle credentials interactively (see the recorded output).
BAF_KAGGLE_URL = "https://www.kaggle.com/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022?select=Base.csv"
od.download(BAF_KAGGLE_URL)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: ritaleite
Your Kaggle Key: ··········
Downloading bank-account-fraud-dataset-neurips-2022.zip to ./bank-account-fraud-dataset-neurips-2022


100%|██████████| 546M/546M [00:28<00:00, 20.3MB/s]





In [None]:
import glob
import pandas as pd

extension = "csv"  # or "parquet", depending on the downloaded file
# "</path/to/datasets/>" is a placeholder: point it at the folder holding the dataset files.
data_paths = glob.glob(f"</path/to/datasets/>*.{extension}")


def read_dataset(path, ext=extension):
    """Load one dataset file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the file to read.
    ext : str
        "csv" (the first column becomes the index) or "parquet".

    Raises
    ------
    ValueError
        If ``ext`` is neither "csv" nor "parquet".
    """
    if ext == "parquet":
        return pd.read_parquet(path)
    if ext != "csv":
        raise ValueError(f"Invalid extension: '{ext}'.")
    return pd.read_csv(path, index_col=0)


def get_variant(path):
    """Return the last "/"-separated component of ``path`` up to its first "."."""
    file_name = path.rsplit("/", 1)[-1]
    return file_name.partition(".")[0]


# Variant name -> DataFrame for every file matched by the glob pattern.
dataframes = dict((get_variant(p), read_dataset(p)) for p in data_paths)

In [None]:

# Load the hyperparameter search space for the LightGBM models from YAML.
# NOTE(review): yaml.FullLoader can construct more than plain data types; if this
# file is ever not fully trusted, consider yaml.safe_load instead.
hyperparam_space_path = "/content/gdrive/MyDrive/tab_norm_folder/bank_account_fraud/notebooks/lightgbm_hyperparameter_space.yaml"
with open(hyperparam_space_path, "r") as file:
    hyperparam_space = yaml.load(file, Loader=yaml.FullLoader)


# Location of the downloaded datasets. Replace `base_path` with the appropriate value.
base_path = "/content/bank-account-fraud-dataset-neurips-2022/"

# Variant name -> CSV path; only the "Base" variant is listed.
datasets_paths = {"Base": f"{base_path}Base.csv"}

# Read every listed CSV into memory, keyed by variant name.
datasets = {variant: pd.read_csv(csv_path) for variant, csv_path in datasets_paths.items()}

In [None]:
# Summary statistics of the Base variant.
print(datasets['Base'].describe())

# Seed for the row shuffles, so that Restart & Run All reproduces the same row order.
SEED = 42

# Create the train, validation and test sets, split by the `month` column:
# month < 6 -> train, month == 6 -> validation, month == 7 -> test.
# `sample(frac=1)` shuffles the rows of each split.
train_dfs = {key: df[df["month"] < 6].sample(frac=1, replace=False, random_state=SEED) for key, df in datasets.items()}
val_dfs = {key: df[df["month"] == 6].sample(frac=1, replace=False, random_state=SEED) for key, df in datasets.items()}
test_dfs = {key: df[df["month"] == 7].sample(frac=1, replace=False, random_state=SEED) for key, df in datasets.items()}

# Name of the target column.
label = "fraud_bool"

categorical_features = [
    "payment_type",
    "employment_status",
    "housing_status",
    "source",
    "device_os",
]

# Integer-encode the categorical features with one encoder per feature, fitted on
# the train split only.
# NOTE(review): LabelEncoder.transform is expected to raise on a category that did
# not occur in the train split - confirm this cannot happen for val/test.
for name in datasets.keys():  # For each dataset in the suite
    train = train_dfs[name]
    val = val_dfs[name]
    test = test_dfs[name]
    for feat in categorical_features:
        encoder = LabelEncoder()
        encoder.fit(train[feat])  # Fit an encoder to the train set.
        train[feat] = encoder.transform(train[feat])  # Transform train set.
        val[feat] = encoder.transform(val[feat])  # Transform val set.
        test[feat] = encoder.transform(test[feat])  # Transform test set.

# Separate features from the target. The X_*/y_* names are overwritten on every
# iteration, so they end up holding the last variant in `datasets`.
for dataset_name in datasets.keys():
    X_train = train_dfs[dataset_name].drop(columns=[label])
    y_train = train_dfs[dataset_name][label]
    X_val = val_dfs[dataset_name].drop(columns=[label])
    y_val = val_dfs[dataset_name][label]
    X_test = test_dfs[dataset_name].drop(columns=[label])
    y_test = test_dfs[dataset_name][label]

           fraud_bool          income  name_email_similarity  \
count  1000000.000000  1000000.000000         1000000.000000   
mean         0.011029        0.562696               0.493694   
std          0.104438        0.290343               0.289125   
min          0.000000        0.100000               0.000001   
25%          0.000000        0.300000               0.225216   
50%          0.000000        0.600000               0.492153   
75%          0.000000        0.800000               0.755567   
max          1.000000        0.900000               0.999999   

       prev_address_months_count  current_address_months_count  \
count             1000000.000000                1000000.000000   
mean                   16.718568                     86.587867   
std                    44.046230                     88.406599   
min                    -1.000000                     -1.000000   
25%                    -1.000000                     19.000000   
50%                    -1.0

In [None]:
# Split the feature names into categorical and continuous groups.
cat_cols = categorical_features
# Continuous features: every column of `train` that is neither categorical nor the
# 'fraud_bool' target.
cont_cols = train.columns.difference(cat_cols).difference(['fraud_bool'])

# Embeddings
Defining Embeddings for Categorical Data

In [None]:

# Re-bind train/val/test to the frames of the last dataset variant in the suite.
for name in datasets.keys():  # For each dataset in the suite
    train = train_dfs[name]
    val = val_dfs[name]
    test = test_dfs[name]

# Reload so edits to utils/embedding_pipeline.py are picked up without a kernel restart.
embedding_pipeline = importlib.reload(embedding_pipeline)
x_train, x_test, x_val = embedding_pipeline.aggregate_low_card_BAF(X_train, X_test, X_val, cat_cols)

# Embedding size per categorical feature, using the 'log' rule of get_emb_dim.
dims = embedding_pipeline.get_emb_dim(x_train, 'log', cat_cols)
dims = torch.tensor(dims, dtype=int).to(device)

# `mlp` and `mlp_pipeline` are the utils.mlp / utils.mlp_pipeline aliases bound in
# the import cell; they are deliberately not re-imported here under bare module
# names, which would rebind the aliases and requires `utils/` itself on sys.path.
# NOTE(review): 20 is presumably the number of sampled configurations and 7 a
# sampler setting - confirm against utils.mlp.mlp_param_sampler.
param_list = mlp.mlp_param_sampler(20, len(train.columns), 7, device)

# Continuous / categorical views of each split.
x_train_cont = x_train[cont_cols]
x_train_cat = x_train[cat_cols]
x_test_cont = x_test[cont_cols]
x_test_cat = x_test[cat_cols]
x_val_cont = x_val[cont_cols]
x_val_cat = x_val[cat_cols]


        payment_type  employment_status  housing_status  source  device_os
765570             1                  0               1       0          2
491426             1                  5               1       0          0
358679             2                  3               1       0          2
689347             0                  0               4       0          3
115017             2                  0               0       0          0
...              ...                ...             ...     ...        ...
768054             3                  2               1       0          0
389252             2                  0               4       0          2
963569             1                  2               0       0          4
799012             0                  0               4       0          0
159124             0                  0               2       0          2

[794989 rows x 5 columns]
payment_type
employment_status
housing_status
source
device_os


In [None]:
# Normalise the continuous features with the 'Zscore' option of normalization_transform.
normalization = 'Zscore'
xtrain_aux_cont, xval_aux_cont = (
    mlp_pipeline.normalization_transform(part, normalization, cont_cols)
    for part in (x_train_cont, x_val_cont)
)

# Categorical features go through the same helper with 'None' and no columns
# (presumably a pass-through - confirm in utils.mlp_pipeline).
xtrain_aux_cat, xval_aux_cat = (
    mlp_pipeline.normalization_transform(part, 'None', [])
    for part in (x_train_cat, x_val_cat)
)


In [None]:

# Experiment: 'log' embedding dimensions with Z-score normalised continuous features.
normalization = 'Zscore'
method = 'log'

xtrain_aux_cont = mlp_pipeline.normalization_transform(x_train_cont, normalization, cont_cols)
xval_aux_cont = mlp_pipeline.normalization_transform(x_val_cont, normalization, cont_cols)

xtrain_aux_cat = mlp_pipeline.normalization_transform(x_train_cat, 'None', [])
xval_aux_cat = mlp_pipeline.normalization_transform(x_val_cat, 'None', [])

# Embedding size per categorical feature, as a tensor on the training device.
dims = torch.tensor(embedding_pipeline.get_emb_dim(x_train, method, cat_cols), dtype=int).to(device)

# `i` is both the first configuration to run and the index written into the file name.
i = 0

for params in param_list[i:]:
    print('iteration: ', i)

    # Fit pipeline
    model = embedding_pipeline.pipeline(
        device,
        xtrain_aux_cat, xtrain_aux_cont,
        xval_aux_cat, xval_aux_cont,
        y_train, y_val,
        params, 'log', dims,
    )

    # Save the model
    model_path = f'tab_norm_folder/embedding/mlp_simple{method}{normalization}{i}.pkl'
    joblib.dump(model, model_path)

    i += 1

gc.collect()


In [None]:


# Experiment: 'sqrt' embedding dimensions with Z-score normalised continuous features.
embedding_pipeline = importlib.reload(embedding_pipeline)

normalization = 'Zscore'
method = 'sqrt'

xtrain_aux_cont = mlp_pipeline.normalization_transform(x_train_cont, normalization, cont_cols)
xval_aux_cont = mlp_pipeline.normalization_transform(x_val_cont, normalization, cont_cols)

xtrain_aux_cat = mlp_pipeline.normalization_transform(x_train_cat, 'None', [])
xval_aux_cat = mlp_pipeline.normalization_transform(x_val_cat, 'None', [])

# Embedding size per categorical feature, as a tensor on the training device.
dims = embedding_pipeline.get_emb_dim(x_train, method, cat_cols)
dims = torch.tensor(dims, dtype=int).to(device)

# `i` is both the first configuration to run and the index written into the file name.
i = 0

for params in param_list[i:]:

    # Fit pipeline
    print('iteration: ', i)

    # Pass `method` (not a hard-coded 'log') so the method label the pipeline
    # receives matches the rule used for `dims` and for the file name.
    model = embedding_pipeline.pipeline(device, xtrain_aux_cat, xtrain_aux_cont, xval_aux_cat, xval_aux_cont, y_train, y_val, params, method, dims)

    # Save the model
    joblib.dump(model, 'tab_norm_folder/embedding/mlp_simple{}{}{}.pkl'.format(method, normalization, i))

    i = i + 1

gc.collect()

In [None]:


# Experiment: 'sqrt' embedding dimensions with MinMax normalised continuous features.
embedding_pipeline = importlib.reload(embedding_pipeline)

# Resume index: the first normalisation starts at configuration 15; `i` is reset to 0
# after each normalisation, so any further entry in the list would start from 0.
i = 15
for normalization in ['MinMax']:
    xtrain_aux_cont = mlp_pipeline.normalization_transform(x_train_cont, normalization, cont_cols)
    xval_aux_cont = mlp_pipeline.normalization_transform(x_val_cont, normalization, cont_cols)

    xtrain_aux_cat = mlp_pipeline.normalization_transform(x_train_cat, 'None', [])
    xval_aux_cat = mlp_pipeline.normalization_transform(x_val_cat, 'None', [])
    method = 'sqrt'

    # Embedding size per categorical feature.
    dims = embedding_pipeline.get_emb_dim(x_train, method, cat_cols)
    dims = torch.tensor(dims, dtype=int)

    for params in param_list[i:]:

        # Fit pipeline
        print('iteration: ', i)

        # Pass `method` (not a hard-coded 'log') so the method label the pipeline
        # receives matches the rule used for `dims` and for the file name.
        model = embedding_pipeline.pipeline(device, xtrain_aux_cat, xtrain_aux_cont, xval_aux_cat, xval_aux_cont, y_train, y_val, params, method, dims)

        # Save the model
        joblib.dump(model, 'tab_norm_folder/embedding/mlp_simple{}{}{}.pkl'.format(method, normalization, i))

        i = i + 1
    i = 0

    # Release the per-normalisation copies before the next pass.
    del xtrain_aux_cat
    del xtrain_aux_cont
    del xval_aux_cat
    del xval_aux_cont
    gc.collect()

In [None]:

# Experiment: 'sqrt' categorical embeddings combined with piecewise-linear encoded
# continuous features.
embedding_pipeline = importlib.reload(embedding_pipeline)

# `data` (rtdl) is otherwise only imported by the "Numerical Encodings" cell further
# down; importing it here lets this cell run on a fresh kernel in notebook order.
from rtdl.rtdl import data

mlp_params = mlp.mlp_param_sampler(20, len(train.columns), 7, device)


xtrain_aux_cat = mlp_pipeline.normalization_transform(x_train_cat, 'None', [])
xval_aux_cat = mlp_pipeline.normalization_transform(x_val_cat, 'None', [])
method = 'sqrt'

# Embedding size per categorical feature, as a tensor on the training device.
dims = embedding_pipeline.get_emb_dim(x_train, method, cat_cols)
dims = torch.tensor(dims, dtype=int).to(device)

# The device_fraud_count feature is constant, and so cannot be fed to this encoder.
encoded_cols = cont_cols.difference(['device_fraud_count'])

# `regression` is the boolean False: the string 'False' would be truthy, and the
# target here is the binary fraud label.
encoder = data.PiecewiseLinearEncoder(
    'decision_tree',
    dict(n_bins=10, regression=False, tree_kwargs={'min_samples_leaf': 128}),
    stack=False,
)
encoder.fit(torch.tensor(X_train[encoded_cols].values), torch.tensor(y_train.values))

# Stored under their own names so the x_train_cont / x_val_cont DataFrames that the
# normalisation cells read are not overwritten.
x_train_cont_enc = encoder.transform(torch.tensor(X_train[encoded_cols].values))
x_val_cont_enc = encoder.transform(torch.tensor(X_val[encoded_cols].values))
del encoder
gc.collect()

# `i` is both the first configuration to run and the index written into the file name.
i = 0
# The loop variable is `params`, so the global `param_list` sampled earlier is left intact.
for params in mlp_params[i:]:

    # Fit pipeline
    print('iteration: ', i)

    model = embedding_pipeline.pipeline(device, xtrain_aux_cat, x_train_cont_enc, xval_aux_cat, x_val_cont_enc, y_train, y_val, params, method, dims)

    # Save the model
    joblib.dump(model, 'tab_norm_folder/embedding/mlp_num_enc10{}{}.pkl'.format(method, i))

    i = i + 1

    gc.collect()

gc.collect()

payment_type
employment_status
housing_status
source
device_os
iteration:  0
----------Method: Embedding for Categorical sqrt -------------
Train size:  794989 ; Number of 0:  786838 ; Number of 1: 8151
Val size:  108168 ; Number of 0:  106718 ; Number of 1: 1450
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 5 out of 5
Early stopping
iteration:  1
----------Method: Embedding for Categorical sqrt -------------
Train size:  794989 ; Number of 0:  786838 ; Number of 1: 8151
Val size:  108168 ; Number of 0:  106718 ; Number of 1: 1450
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 5 out of 5
Early stopping
iteration:  2
----------Method: Embedding for Categorical sqrt 

0

In [None]:
# Attempting embeddings with dimension 1

embedding_pipeline = importlib.reload(embedding_pipeline)

# Resume index: the first normalisation starts at configuration 10; `i` is reset to 0
# once a normalisation has been processed.
i = 10
for normalization in ['Median']:
    xtrain_aux_cont = mlp_pipeline.normalization_transform(x_train_cont, normalization, cont_cols)
    xval_aux_cont = mlp_pipeline.normalization_transform(x_val_cont, normalization, cont_cols)

    xtrain_aux_cat = mlp_pipeline.normalization_transform(x_train_cat, 'None', [])
    xval_aux_cat = mlp_pipeline.normalization_transform(x_val_cat, 'None', [])

    # Embedding sizes from the 'ones' rule of get_emb_dim.
    dims = torch.tensor(embedding_pipeline.get_emb_dim(x_train, 'ones', cat_cols), dtype=int)

    for params in param_list[i:]:
        print('iteration: ', i)

        # Fit pipeline
        model = embedding_pipeline.pipeline(
            device,
            xtrain_aux_cat, xtrain_aux_cont,
            xval_aux_cat, xval_aux_cont,
            y_train, y_val,
            params, 'dim=1', dims,
        )

        # Save the model
        model_path = f'tab_norm_folder/embedding/mlp_simple1dimensional{normalization}{i}.pkl'
        joblib.dump(model, model_path)

        i += 1
    i = 0

    # Release the per-normalisation copies before the next pass.
    del xtrain_aux_cat, xtrain_aux_cont, xval_aux_cat, xval_aux_cont
    gc.collect()

payment_type
employment_status
housing_status
source
device_os
iteration:  10
----------Method: Embedding for Categorical dim=1 -------------
Train size:  794989 ; Number of 0:  786838 ; Number of 1: 8151
Val size:  108168 ; Number of 0:  106718 ; Number of 1: 1450
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 5 out of 5
Early stopping
iteration:  11
----------Method: Embedding for Categorical dim=1 -------------
Train size:  794989 ; Number of 0:  786838 ; Number of 1: 8151
Val size:  108168 ; Number of 0:  106718 ; Number of 1: 1450
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 1 out of 5
EarlyS

# Numerical Encodings

In [None]:
# The numerical embeddings come from rtdl: https://github.com/Yura52/rtdl
import sys

# NOTE(review): this path is relative (no leading "/"), unlike the
# "/content/gdrive/MyDrive/..." paths used elsewhere in the notebook - confirm it
# resolves to the folder that contains `rtdl` from the working directory in use.
RTDL_PARENT_DIR = "content/gdrive/MyDrive"
sys.path.append(RTDL_PARENT_DIR)

from rtdl.rtdl import data


In [None]:
# Experiment: MLP on piecewise-linear encoded continuous features plus categorical
# features transformed by the encoder stored in target_encoderBAF.pkl.
# NOTE: joblib.load unpickles the file - only load artifacts you created yourself.
link = "tab_norm_folder/target_encoderBAF.pkl"
# joblib.load opens the path itself, so no separate file handle is opened (and leaked).
cat_encoder = joblib.load(link)
x_train_cat = cat_encoder.transform(X_train)[cat_cols]
x_val_cat = cat_encoder.transform(X_val)[cat_cols]

# The device_fraud_count feature is constant, and so cannot be fed to this encoder.
encoded_cols = cont_cols.difference(['device_fraud_count'])

# `regression` is the boolean False: the string 'False' would be truthy, and the
# target here is the binary fraud label.
encoder = data.PiecewiseLinearEncoder(
    'decision_tree',
    dict(n_bins=10, regression=False, tree_kwargs={'min_samples_leaf': 128}),
    stack=False,
)

# Fitting the encoder: some binary features are also encoded, this line will raise a
# warning due to the amount of distinct values.
encoder.fit(torch.tensor(X_train[encoded_cols].values), torch.tensor(y_train.values))
x_train_cont = encoder.transform(torch.tensor(X_train[encoded_cols].values))
x_val_cont = encoder.transform(torch.tensor(X_val[encoded_cols].values))

# Model input: encoded continuous features followed by the categorical features.
x_train_aux = torch.cat([torch.tensor(x_train_cont), torch.tensor(x_train_cat.values)], 1)
x_val_aux = torch.cat([torch.tensor(x_val_cont), torch.tensor(x_val_cat.values)], 1)

joblib.dump(encoder, "tab_norm_folder/encoders/numerical_encoder_dim10.pkl")

del cat_encoder, encoder
gc.collect()

mlp_params = mlp.mlp_param_sampler(20, len(train.columns), 7, device)
# Resume index: start at configuration 15; also written into the file name.
i = 15
# The loop variable is `params`, so the global `param_list` is left intact.
for params in mlp_params[i:]:
    print(" Iteration:", i)

    model = mlp_pipeline.pipeline(device, x_train_aux, x_val_aux, y_train, y_val, params, 'None', encoded_cols)
    joblib.dump(model, "tab_norm_folder/numerical_embeddings/num_enc_mlp_target{}_dim.pkl".format(i))
    i = i + 1



In [None]:
# Experiment: MLP on piecewise-linear encoded continuous features plus categorical
# features transformed by the encoder stored in cat_encoderBAF.pkl.
# NOTE: joblib.load unpickles the file - only load artifacts you created yourself.
link = "tab_norm_folder/cat_encoderBAF.pkl"
# joblib.load opens the path itself, so no separate file handle is opened (and leaked).
cat_encoder = joblib.load(link)
x_train_cat = cat_encoder.transform(X_train)[cat_cols]
x_val_cat = cat_encoder.transform(X_val)[cat_cols]

# The device_fraud_count feature is constant, and so cannot be fed to this encoder.
encoded_cols = cont_cols.difference(['device_fraud_count'])

# `regression` is the boolean False: the string 'False' would be truthy, and the
# target here is the binary fraud label.
encoder = data.PiecewiseLinearEncoder(
    'decision_tree',
    dict(n_bins=10, regression=False, tree_kwargs={'min_samples_leaf': 128}),
    stack=False,
)

# Fitting the encoder: some binary features are also encoded, this line will raise a
# warning due to the amount of distinct values.
encoder.fit(torch.tensor(X_train[encoded_cols].values), torch.tensor(y_train.values))
x_train_cont = encoder.transform(torch.tensor(X_train[encoded_cols].values))
x_val_cont = encoder.transform(torch.tensor(X_val[encoded_cols].values))

# Model input: encoded continuous features followed by the categorical features.
x_train_aux = torch.cat([torch.tensor(x_train_cont), torch.tensor(x_train_cat.values)], 1)
x_val_aux = torch.cat([torch.tensor(x_val_cont), torch.tensor(x_val_cat.values)], 1)

joblib.dump(encoder, "tab_norm_folder/encoders/numerical_encoder_dim10.pkl")

del cat_encoder, encoder
gc.collect()

mlp_params = mlp.mlp_param_sampler(20, len(train.columns), 7, device)
# Resume index: start at configuration 15; also written into the file name.
i = 15
# The loop variable is `params`, so the global `param_list` is left intact.
for params in mlp_params[i:]:
    print(" Iteration:", i)

    model = mlp_pipeline.pipeline(device, x_train_aux, x_val_aux, y_train, y_val, params, 'None', encoded_cols)
    joblib.dump(model, "tab_norm_folder/numerical_embeddings/num_enc_mlp_cat{}_dim.pkl".format(i))
    i = i + 1

In [None]:
# Experiment: MLP on piecewise-linear encoded continuous features plus categorical
# features transformed by the encoder stored in count_encoderBAF.pkl.
# NOTE: joblib.load unpickles the file - only load artifacts you created yourself.
link = "tab_norm_folder/count_encoderBAF.pkl"
# joblib.load opens the path itself, so no separate file handle is opened (and leaked).
cat_encoder = joblib.load(link)
x_train_cat = cat_encoder.transform(X_train)[cat_cols]
x_val_cat = cat_encoder.transform(X_val)[cat_cols]

# The device_fraud_count feature is constant, and so cannot be fed to this encoder.
encoded_cols = cont_cols.difference(['device_fraud_count'])

# `regression` is the boolean False: the string 'False' would be truthy, and the
# target here is the binary fraud label.
encoder = data.PiecewiseLinearEncoder(
    'decision_tree',
    dict(n_bins=10, regression=False, tree_kwargs={'min_samples_leaf': 128}),
    stack=False,
)

# Fitting the encoder: some binary features are also encoded, this line will raise a
# warning due to the amount of distinct values.
encoder.fit(torch.tensor(X_train[encoded_cols].values), torch.tensor(y_train.values))
x_train_cont = encoder.transform(torch.tensor(X_train[encoded_cols].values))
x_val_cont = encoder.transform(torch.tensor(X_val[encoded_cols].values))

# Model input: encoded continuous features followed by the categorical features.
x_train_aux = torch.cat([torch.tensor(x_train_cont), torch.tensor(x_train_cat.values)], 1)
x_val_aux = torch.cat([torch.tensor(x_val_cont), torch.tensor(x_val_cat.values)], 1)

joblib.dump(encoder, "tab_norm_folder/encoders/numerical_encoder_dim10.pkl")

del cat_encoder, encoder
gc.collect()

mlp_params = mlp.mlp_param_sampler(20, len(train.columns), 7, device)
# Resume index: start at configuration 16; also written into the file name.
i = 16
# The loop variable is `params`, so the global `param_list` is left intact.
for params in mlp_params[i:]:
    print(" Iteration:", i)

    model = mlp_pipeline.pipeline(device, x_train_aux, x_val_aux, y_train, y_val, params, 'None', encoded_cols)
    joblib.dump(model, "/content/gdrive/MyDrive/tab_norm_folder/numerical_embeddings/num_enc_mlp_count{}_dim.pkl".format(i))
    i = i + 1


 Iteration: 16
----------Method:  None -------------
Train size:  794989 ; Number of 0:  786838 ; Number of 1: 8151
Val size:  108168 ; Number of 0:  106718 ; Number of 1: 1450
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 5 out of 5
Early stopping
 Iteration: 17
----------Method:  None -------------
Train size:  794989 ; Number of 0:  786838 ; Number of 1: 8151
Val size:  108168 ; Number of 0:  106718 ; Number of 1: 1450
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping count

# TABTRANSFORMER

In [None]:
import utils.tabtransformer as tabtransformer

# Feature groups handed to the TabTransformer pipeline.
cat_cols = categorical_features
cont_cols = [
    'bank_branch_count_8w',
    'bank_months_count',
    'credit_risk_score',
    'current_address_months_count',
    'customer_age',
    'date_of_birth_distinct_emails_4w',
    'days_since_request',
    'device_distinct_emails_8w',
    'device_fraud_count',
    'email_is_free',
    'foreign_request',
    'has_other_cards',
    'income',
    'intended_balcon_amount',
    'keep_alive_session',
    'month',
    'name_email_similarity',
    'phone_home_valid',
    'phone_mobile_valid',
    'prev_address_months_count',
    'proposed_credit_limit',
    'session_length_in_minutes',
    'velocity_24h',
    'velocity_4w',
    'velocity_6h',
    'zip_count_4w',
]

params = tabtransformer.tabtransformer_param_sampler(20, 7, device)

# Each frame is passed to z_score on its own.
# NOTE(review): presumably this standardises `cont_cols` with the statistics of the
# frame it receives - confirm in utils.mlp_pipeline.
X_train_norm = mlp_pipeline.z_score(X_train, cont_cols)
X_val_norm = mlp_pipeline.z_score(X_val, cont_cols)

# Resume index: start at configuration 18; also written into the file name.
i = 18
for param in params[i:]:
    # The pipeline is given 'cpu' explicitly instead of the global `device`.
    model = tabtransformer.transformer_pipeline(
        param, cat_cols, cont_cols, ['fraud_bool'],
        X_train_norm, X_val_norm, y_train, y_val, 'cpu',
    )
    joblib.dump(model, f"tab_norm_folder/tabtransformer/tabtransformer_zscore{i}.pkl")
    i += 1
    # Stop after a single configuration per run of this cell.
    break


2023-02-05 21:07:33,027 - {pytorch_tabular.tabular_model:102} - INFO - Experiment Tracking is turned off
INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
INFO:lightning_lite.utilities.seed:Global seed set to 42
2023-02-05 21:07:33,132 - {pytorch_tabular.tabular_model:465} - INFO - Preparing the DataLoaders
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
2023-02-05 21:07:33,272 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for classification task
2023-02-05 21:07:36,707 - {pytorch_tabular.tabular_model:508} - INFO - Preparing the Model: TabTransformerModel
INFO:pytorch_tabular.tabular_model:Preparing the Model: TabTransformerModel
2023-02-05 21:07:36,772 - {pytorch_tabular.tabular_model:264} - INFO - Preparing the Trainer
INFO:pytorch_tabular.tabular_model:Preparing the Trainer
INFO:pytorch_lightning.utilities.rank_zero:GPU available:

Output()

2023-02-05 21:25:42,621 - {pytorch_tabular.tabular_model:568} - INFO - Training the model completed
INFO:pytorch_tabular.tabular_model:Training the model completed
2023-02-05 21:25:42,626 - {pytorch_tabular.tabular_model:1207} - INFO - Loading the best model
INFO:pytorch_tabular.tabular_model:Loading the best model


In [None]:
# Normalised validation and test features for the prediction step; each frame is
# passed to z_score on its own.
X_val_norm, X_test_norm = (
    mlp_pipeline.z_score(frame, cont_cols) for frame in (X_val, X_test)
)


In [None]:
# Score saved TabTransformer models on the validation and test sets and store the
# predicted probability of the positive class.
# NOTE: unpickling executes code from the file - only load models you saved yourself.
# NOTE(review): models are read from 'tabtransformer/...', while the training cell
# writes to 'tab_norm_folder/tabtransformer/...' - confirm the working directory.
for i in range(0, 8):
    # The context manager closes the file handle once the model is loaded;
    # CPU_Unpickler maps pickled torch storages to the CPU.
    with open('tabtransformer/tabtransformer_zscore{}.pkl'.format(i), 'rb') as model_file:
        model = CPU_Unpickler(model_file).load()
    predict = model.predict(X_val_norm)['1_probability']
    joblib.dump(predict, 'res_prediction/yval{}.pkl'.format(i))
    predict = model.predict(X_test_norm)['1_probability']
    joblib.dump(predict, 'res_prediction/yhat{}.pkl'.format(i))
    # Only the first model (i == 0) is scored; drop the break to score all eight.
    break