In [None]:
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import gc
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Directory holding train.csv, test.csv and labels.csv. With the empty string,
# os.path.join yields the bare file names, i.e. paths relative to the current
# working directory.
DATA_DIRECTORY = ""

In [None]:
# Read the three input tables (in this order) from DATA_DIRECTORY.
train, test, labels = (
    pd.read_csv(os.path.join(DATA_DIRECTORY, file_name))
    for file_name in ('train.csv', 'test.csv', 'labels.csv')
)

In [None]:
# Keep the row identifiers of both tables before the column is dropped.
train_id = train['SK_ID_CURR']
test_id = test['SK_ID_CURR']
# The cross-validation code indexes labels positionally, so use a NumPy array.
labels = labels.to_numpy()

In [None]:
# Remove the identifier column from both tables before imputation and scaling.
train, test = (frame.drop(columns=['SK_ID_CURR']) for frame in (train, test))

In [None]:
# Median imputation: statistics are learned from train only and then applied
# to both train and test.
imputer = SimpleImputer(strategy='median').fit(train)
train, test = imputer.transform(train), imputer.transform(test)

In [None]:
# Min-max scaling to [0, 1]: ranges are learned from train only and then
# applied to both train and test.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train)
train, test = scaler.transform(train), scaler.transform(test)

In [None]:
def model(features, test_features, labels, test_ids, n_folds = 5):
    """Train an XGBoost classifier with shuffled K-fold cross-validation.

    One classifier is fitted per fold. Each fold model scores `test_features`
    (the fold predictions are averaged) and its own held-out rows (stored as
    out-of-fold predictions).

    Parameters
    ----------
    features : array supporting integer-array row indexing
        Training feature matrix.
    test_features : array
        Rows to score with every fold model.
    labels : np.ndarray
        Binary (0/1) targets aligned with the rows of `features`.
    test_ids : array-like
        Identifiers placed in the ``SK_ID_CURR`` column of the submission;
        must align with the rows of `test_features`.
    n_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    submission : pd.DataFrame
        Columns ``SK_ID_CURR`` and ``TARGET`` (fold-averaged probability of
        the positive class).
    metrics : pd.DataFrame
        One row per fold with train/valid AUC, plus an ``'overall'`` row whose
        valid value is the AUC of all out-of-fold predictions and whose train
        value is the mean of the per-fold train AUCs.
    """

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 8888)

    # Accumulator for the fold-averaged test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Out-of-fold validation predictions (each row is filled exactly once)
    out_of_fold = np.zeros(features.shape[0])

    # Per-fold validation and training AUC
    valid_scores = []
    train_scores = []

    # Negative-to-positive ratio over all labels, used to up-weight positives
    ratio = (labels == 0).sum()/ (labels == 1).sum()

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the estimator. It is named `clf` so it does not shadow this
        # function's own name.
        # NOTE(review): random_state and seed are both supplied with different
        # values; confirm which one the installed xgboost version honours.
        clf = XGBClassifier(
                  n_estimators=5000,
                  learning_rate=0.01,
                  max_depth=11,
                  objective='binary:logistic',
                  gamma=0.098,
                  subsample=0.708,
                  reg_alpha=3.564,
                  reg_lambda=4.930,
                  random_state=8888,
                  seed=88888,
                  scale_pos_weight=ratio,
                  colsample_bytree= 0.613,
                  min_child_weight= 6,
                  tree_method='gpu_hist',
                  predictor='gpu_predictor'
                  )

        # Train the model; the validation set is last in eval_set
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(train_features, train_labels), (valid_features, valid_labels)],
                early_stopping_rounds = 2500, verbose = 1000)

        # Make predictions
        test_predictions += clf.predict_proba(test_features)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features)[:, 1]

        # Score both sets at the round with the best validation AUC so the two
        # columns describe the same model. Taking max() of each history
        # independently would pair the validation peak with the train AUC of a
        # different (typically later, more over-fitted) round.
        evals_result = clf.evals_result()
        train_history = evals_result['validation_0']['auc']
        valid_history = evals_result['validation_1']['auc']
        best_round = int(np.argmax(valid_history))

        valid_scores.append(valid_history[best_round])
        train_scores.append(train_history[best_round])

        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, metrics

In [None]:
# `train` is passed as both the training and the scoring matrix, so TARGET in
# the returned frame holds fold-averaged probabilities for the training rows.
submission, metrics = model(
    features=train,
    test_features=train,
    labels=labels,
    test_ids=train_id,
)

In [None]:
# Keep only the predicted probabilities as a plain NumPy array.
submission = submission.loc[:, 'TARGET'].to_numpy()

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# ROC operating points of the XGBoost scores against the labels.
fpr, tpr, thresholds = roc_curve(labels, submission)

# G-mean = sqrt(TPR * TNR), where TNR = 1 - FPR.
gmean = np.sqrt(np.multiply(tpr, np.subtract(1, fpr)))

# Threshold whose operating point maximises the G-mean.
index = gmean.argmax()
bestThreshold = thresholds[index]

In [None]:
# roc_curve defines the operating point for thresholds[i] as the predictions
# with score >= thresholds[i], so the comparison must be inclusive to reproduce
# the point selected by the G-mean. A strict ">" would flip every row whose
# score equals the threshold.
target = np.where(submission >= bestThreshold, 1, 0)

In [None]:
# Drop the intermediates of the XGBoost stage and reclaim their memory.
del (
    fpr, tpr, thresholds, gmean, index, bestThreshold,
    submission, metrics, imputer, DATA_DIRECTORY,
)
gc.collect()

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

In [None]:
def model(features, test_features, labels, test_ids, n_folds = 5):
    """Train a TabNet classifier with shuffled K-fold cross-validation.

    One classifier is fitted per fold. Each fold model scores `test_features`
    (the fold predictions are averaged) and its own held-out rows (stored as
    out-of-fold predictions).

    Parameters
    ----------
    features : array supporting integer-array row indexing
        Training feature matrix.
    test_features : array
        Rows to score with every fold model.
    labels : np.ndarray
        Binary (0/1) targets aligned with the rows of `features`.
    test_ids : array-like
        Identifiers placed in the ``SK_ID_CURR`` column of the submission;
        must align with the rows of `test_features`.
    n_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    submission : pd.DataFrame
        Columns ``SK_ID_CURR`` and ``TARGET`` (fold-averaged probability of
        the positive class).
    metrics : pd.DataFrame
        One row per fold with train/valid AUC, plus an ``'overall'`` row whose
        valid value is the AUC of all out-of-fold predictions and whose train
        value is the mean of the per-fold train AUCs.
    """

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 88)

    # Accumulator for the fold-averaged test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Out-of-fold validation predictions (each row is filled exactly once)
    out_of_fold = np.zeros(features.shape[0])

    # Per-fold validation and training AUC
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the estimator. It is named `clf` so it does not shadow this
        # function's own name.
        # NOTE(review): pytorch-tabnet documents gamma as ranging from 1.0 to
        # 2.0; 0.098 equals the XGBoost gamma used earlier in this notebook and
        # may have been copied over by mistake - confirm before relying on it.
        clf = TabNetClassifier(
                    n_d=32,
                    n_a=32,
                    n_steps=10,
                    gamma=0.098,
                    n_independent=2,
                    n_shared=2,
                    lambda_sparse=1e-3,
                    momentum=0.4,
                    clip_value=2.,
                    optimizer_fn=torch.optim.Adam,
                    scheduler_params = {"gamma": 0.95,
                                    "step_size": 20},
                    optimizer_params=dict(lr=2e-2),
                    scheduler_fn=torch.optim.lr_scheduler.StepLR,
                    epsilon=1e-15, verbose = 0,
                    device_name='cuda'
                )

        # Train the model
        clf.fit(
            train_features, train_labels,
            eval_set=[(train_features, train_labels), (valid_features, valid_labels)],
            eval_name=['train', 'valid'],
            eval_metric=['auc'],
            max_epochs=1000 , patience=50,
            batch_size=1024, virtual_batch_size=128,
            num_workers=0,
            weights=1,
            drop_last=False
        )

        print(clf)

        # Make predictions
        test_predictions += clf.predict_proba(test_features)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        valid_proba = clf.predict_proba(valid_features)[:, 1]
        out_of_fold[valid_indices] = valid_proba

        # AUC is a ranking metric and must be computed from probabilities.
        # Hard class labels collapse the ROC curve to a single operating point
        # and would make the per-fold scores incomparable with the overall
        # out-of-fold AUC computed below.
        valid_score = roc_auc_score(valid_labels, valid_proba)
        train_score = roc_auc_score(train_labels,
                                    clf.predict_proba(train_features)[:, 1])

        valid_scores.append(valid_score)
        train_scores.append(train_score)

        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, metrics

In [None]:
# TabNet is fitted on `target` (the thresholded XGBoost predictions) and, like
# the XGBoost run, scores the training rows themselves.
submission, metrics = model(
    features=train,
    test_features=train,
    labels=target,
    test_ids=train_id,
)

In [None]:
# Heading on the first line, per-fold metrics table on the following lines.
print('TabNet metrics', metrics, sep='\n')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the TabNet probabilities to span [0, 1]. MinMaxScaler expects a 2-D
# input, hence the reshape to a single column.
tn = submission['TARGET'].to_numpy().reshape(-1, 1)
scaler = MinMaxScaler()
tg = scaler.fit_transform(tn)

In [None]:
# `tg` holds one rescaled score per row of the frame returned by model(), so
# reuse the identifiers already attached to those rows. Pairing the scores with
# test_id would either fail on a length mismatch or attach them to the wrong
# rows, because the predictions were produced for the ids passed to model().
submission = pd.DataFrame({'SK_ID_CURR': submission['SK_ID_CURR'], 'TARGET': tg[:,0]})

In [None]:
# Persist the id/score pairs without the DataFrame index column.
submission.to_csv(path_or_buf='xgboost-tabnet.csv', index=False)

In [None]:
# ROC curve of the final scores against the first column of `labels`.
# NOTE(review): the scored rows are the same rows the fold models were trained
# on (model() was called with `train` as test_features), so this is an
# in-sample evaluation - confirm that is intended.
pred = submission['TARGET'].to_numpy()
import matplotlib.pyplot as plt

fpr, tpr, _ = roc_curve(labels[:,0],  pred)
auc = roc_auc_score(labels[:,0], pred)
# Show the AUC in the legend so the computed value is actually reported.
plt.plot(fpr,tpr,label=f"XGBoost-TabNet (AUC = {auc:.4f})")
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc=4)
plt.show()