In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
import time
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

# Seed passed as `random_state` by the (currently commented-out) down-sampling
# and train/validation/test split cells further down.
SEED = 42

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence every warning category for the rest of the kernel session. This is a
# blanket filter: it is not limited to the ConvergenceWarning imported above.
warnings.filterwarnings("ignore")

In [None]:
# Dataset key; it is interpolated into the data paths (data/X_y/{DATASET}/...)
# and the model paths (model/{DATASET}/...) used below.
DATASET = "amex_credit_card"

# Available scenarios: original, standardization, imbalance_handle
#   - "standardization" and "imbalance_handle" both read the "scaled_" data files.
#   - "imbalance_handle" loads the "classweight_" models, "standardization" the
#     "scaled_" models.
#   - any other value falls through to the unprefixed data and model files.
SCENARIO = "imbalance_handle"

## 1. Read dataset

In [None]:
# train_data_df = pd.read_csv(
#     "data/amex-default-prediction/train_data.csv"
# )

# train_label_df = pd.read_csv(
#     "data/amex-default-prediction/train_labels.csv"
# )

In [None]:
# train_data_df.shape, train_label_df.shape

In [None]:
# # Perform left join
# df = pd.merge(train_data_df, train_label_df, on='customer_ID', how='left')

In [None]:
# FEATURES = [i for i in df.columns if i != "target" and i != "customer_ID"]

# NUMERIC_FEATURES = [i for i in df.select_dtypes(include=['float64', 'int64']).columns if i != "target"]

# CATEGORICAL_FEATURES = [i for i in FEATURES if i not in NUMERIC_FEATURES]

# LABEL = "target"

In [None]:
# df[df.isna().any(axis=1)].shape

In [None]:
# # Handle null values
# fill_values = {}
# for i in NUMERIC_FEATURES:
#     fill_values[i] = 0
# for i in CATEGORICAL_FEATURES:
#     fill_values[i] = "NULL"

# # Fill null values in the DataFrame using the specified fill values
# df = df.fillna(fill_values)

In [None]:
# df[df.isna().any(axis=1)].shape

In [None]:
# # Due to lack of resources, downsampled to 300k, 10x compared to Taiwan dataset
# df = df.sample(n=300000, random_state=SEED)

In [None]:
# df.shape

In [None]:
# df.to_csv(f"data/X_y/{DATASET}/df.csv")

In [None]:
# X = df[FEATURES]
# y = df[LABEL]

## 2. Preprocess data

### Prepare numeric and categorical features

In [None]:
# from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler

In [None]:
# # Apply label encoder to object columns
# label_encoder = LabelEncoder()

# for col in CATEGORICAL_FEATURES:
#     X.loc[:, f"enc_{col}"] = label_encoder.fit_transform(X.loc[:, col])

In [None]:
# # Apply normalizer
# if SCENARIO in ("standardization", "imbalance_handle"):
#     scaler = StandardScaler()
    
#     # Normalize the numeric columns
#     X.loc[:, NUMERIC_FEATURES] = scaler.fit_transform(X.loc[:, NUMERIC_FEATURES])

### Split train-test

In [None]:
# # Split the data into training, validation, and testing sets: 70, 10, 20%
# X_train_val, X_test, y_train_val, y_test = train_test_split(
#     X, y, test_size=0.2, stratify=y, random_state=SEED
# )

# X_train, X_val, y_train, y_val = train_test_split(
#     X_train_val, y_train_val, test_size=0.125, stratify=y_train_val, random_state=SEED
# )

In [None]:
# if SCENARIO in ("standardization", "imbalance_handle"):
#     X_train.to_csv(f"data/X_y/{DATASET}/scaled_X_train.csv")
#     y_train.to_csv(f"data/X_y/{DATASET}/scaled_y_train.csv")
    
#     X_val.to_csv(f"data/X_y/{DATASET}/scaled_X_val.csv")
#     y_val.to_csv(f"data/X_y/{DATASET}/scaled_y_val.csv")
    
#     X_test.to_csv(f"data/X_y/{DATASET}/scaled_X_test.csv")
#     y_test.to_csv(f"data/X_y/{DATASET}/scaled_y_test.csv")
    
# else:
#     X_train.to_csv(f"data/X_y/{DATASET}/X_train.csv")
#     y_train.to_csv(f"data/X_y/{DATASET}/y_train.csv")
    
#     X_val.to_csv(f"data/X_y/{DATASET}/X_val.csv")
#     y_val.to_csv(f"data/X_y/{DATASET}/y_val.csv")
    
#     X_test.to_csv(f"data/X_y/{DATASET}/X_test.csv")
#     y_test.to_csv(f"data/X_y/{DATASET}/y_test.csv")

In [None]:
# Load the pre-split feature/label CSVs. The "standardization" and
# "imbalance_handle" scenarios read the "scaled_" files; every other scenario
# reads the unprefixed ones.
_csv_prefix = "scaled_" if SCENARIO in ("standardization", "imbalance_handle") else ""


def _read_split(kind, part):
    """Read one split CSV (`kind` is "X" or "y"), using its first column as the index."""
    return pd.read_csv(f"data/X_y/{DATASET}/{_csv_prefix}{kind}_{part}.csv", index_col=0)


# Same read order as before: all feature frames first, then all label frames.
X_train, X_val, X_test = (_read_split("X", part) for part in ("train", "val", "test"))
y_train, y_val, y_test = (_read_split("y", part) for part in ("train", "val", "test"))

In [None]:
# Report the size of every split, then the class balance of its labels
# (the label counts show that the classes are imbalanced).
_split_summary = (
    ("Training", "y_train", X_train, y_train),
    ("Validation", "y_val", X_val, y_val),
    ("Testing", "y_test", X_test, y_test),
)

for split_title, label_name, split_features, split_labels in _split_summary:
    print(f"{split_title} set shape:", split_features.shape, split_labels.shape)
print()

for split_title, label_name, split_features, split_labels in _split_summary:
    print(f"Label Distribution in {label_name}:")
    print(split_labels.value_counts())

In [None]:
# Column roles, derived from the columns of the training frame.
LABEL = "target"

# Every column except the label and the customer identifier.
FEATURES = [col for col in X_train.columns if col not in (LABEL, "customer_ID")]

# Columns stored as float64 / int64 (the label is excluded if present).
NUMERIC_FEATURES = [
    col
    for col in X_train.select_dtypes(include=["float64", "int64"]).columns
    if col != LABEL
]

# Whatever is a feature but not numeric is treated as categorical.
_numeric_lookup = set(NUMERIC_FEATURES)
CATEGORICAL_FEATURES = [col for col in FEATURES if col not in _numeric_lookup]

In [None]:
# Handle null values: numeric columns are filled with 0, categorical columns
# with the literal string "NULL".
fill_values = dict.fromkeys(NUMERIC_FEATURES, 0)
fill_values.update(dict.fromkeys(CATEGORICAL_FEATURES, "NULL"))

# Apply the same per-column fill values to every split.
X_train, X_val, X_test = (
    frame.fillna(fill_values) for frame in (X_train, X_val, X_test)
)

In [None]:
# Check null values: shape of the rows that still contain at least one NaN,
# for the train, validation and test features respectively.
tuple(frame[frame.isna().any(axis=1)].shape for frame in (X_train, X_val, X_test))

In [None]:
# Number of all / numeric / categorical feature columns.
tuple(map(len, (FEATURES, NUMERIC_FEATURES, CATEGORICAL_FEATURES)))

## 3. Modeling

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

import numpy as np
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Input, LSTM, Dropout, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# For the imbalance handling
from sklearn.utils.class_weight import compute_class_weight

# Training labels as a 1-D array. The set of classes is derived from this same
# array so that `classes` and `y` can never disagree.
_train_labels = y_train[LABEL].to_numpy()
_classes = np.unique(_train_labels)

# "balanced" weights are n_samples / (n_classes * count(class)), so the rarer
# class receives the larger weight. The result is ordered like `_classes`.
class_weights = compute_class_weight(class_weight="balanced", classes=_classes, y=_train_labels)

# Key every weight by its actual class label rather than by its position in
# the array; the two only coincide when the labels are exactly 0..k-1.
class_weight_dict = dict(zip(_classes.tolist(), class_weights.tolist()))

In [None]:
# Show the class-weight mapping built in the previous cell.
print(f"class_weight_dict: {class_weight_dict}")

### Scikit-learn

In [None]:
# Model input columns: each categorical feature is swapped for its "enc_"-prefixed
# counterpart; every other feature keeps its own name. Order follows FEATURES.
_categorical_lookup = set(CATEGORICAL_FEATURES)
ENC_FEATURES = [
    (f"enc_{name}" if name in _categorical_lookup else name) for name in FEATURES
]

**Warning:** the training cell below takes about 50–60 minutes to run, mostly because of the SVM.

In [None]:
# # Initialize the classifiers
# if SCENARIO == "imbalance_handle":
#     logistic_regression = LogisticRegression(class_weight=class_weight_dict)
#     decision_tree = DecisionTreeClassifier(class_weight=class_weight_dict)
#     random_forest = RandomForestClassifier(class_weight=class_weight_dict)
#     svm_classifier = SVC(class_weight=class_weight_dict)
# else:  
#     logistic_regression = LogisticRegression()
#     decision_tree = DecisionTreeClassifier()
#     random_forest = RandomForestClassifier()
#     svm_classifier = SVC()

# # Train the classifiers
# logistic_regression.fit(X_train[ENC_FEATURES], y_train)
# decision_tree.fit(X_train[ENC_FEATURES], y_train)
# random_forest.fit(X_train[ENC_FEATURES], y_train)
# svm_classifier.fit(X_train[ENC_FEATURES], y_train)

# print("Ok!")

In [None]:
# Load the existing classifiers. The file-name prefix depends on the scenario:
# "classweight_" for imbalance_handle, "scaled_" for standardization, none otherwise.
# NOTE(review): joblib files are pickles -- only load artifacts this project produced.
_sklearn_prefix = {
    "imbalance_handle": "classweight_",
    "standardization": "scaled_",
}.get(SCENARIO, "")


def _load_classifier(name):
    """Load one persisted scikit-learn estimator for the active scenario."""
    return joblib.load(f"model/{DATASET}/{_sklearn_prefix}{name}.joblib")


logistic_regression = _load_classifier("logistic_regression")
decision_tree = _load_classifier("decision_tree")
random_forest = _load_classifier("random_forest")
svm_classifier = _load_classifier("svm_classifier")

In [None]:
# Evaluate on test data
models = {
    "Logistic Regression": logistic_regression,
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "SVM": svm_classifier
}


def _positive_class_scores(estimator, features):
    """Return a continuous positive-class score for ranking metrics such as ROC AUC.

    Prefers `decision_function`, then the second column of `predict_proba`
    (the class with the greater label), and only falls back to the hard
    predictions when the estimator exposes neither.
    """
    if hasattr(estimator, "decision_function"):
        return estimator.decision_function(features)
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(features)[:, 1]
    return estimator.predict(features)


# Slice the model inputs once instead of once per model.
X_test_enc = X_test[ENC_FEATURES]

for model_name, model in models.items():
    y_pred = model.predict(X_test_enc)
    # ROC AUC needs scores, not thresholded labels: with 0/1 predictions the
    # curve collapses to a single operating point. This costs one extra
    # inference pass per model.
    y_score = _positive_class_scores(model, X_test_enc)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)
    confusion_mat = confusion_matrix(y_test, y_pred)

    print("Model:", model_name)
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1-score: {f1:.3f}")
    print(f"ROC AUC: {roc_auc:.3f}")
    print(confusion_mat)
    print("-"*50)

In [None]:
# # Save the classifiers
# if SCENARIO == "imbalance_handle":
#     joblib.dump(logistic_regression, f"model/{DATASET}/classweight_logistic_regression.joblib")
#     joblib.dump(decision_tree, f"model/{DATASET}/classweight_decision_tree.joblib")
#     joblib.dump(random_forest, f"model/{DATASET}/classweight_random_forest.joblib")
#     joblib.dump(svm_classifier, f"model/{DATASET}/classweight_svm_classifier.joblib")
# elif SCENARIO == "standardization":
#     joblib.dump(logistic_regression, f"model/{DATASET}/scaled_logistic_regression.joblib")
#     joblib.dump(decision_tree, f"model/{DATASET}/scaled_decision_tree.joblib")
#     joblib.dump(random_forest, f"model/{DATASET}/scaled_random_forest.joblib")
#     joblib.dump(svm_classifier, f"model/{DATASET}/scaled_svm_classifier.joblib")
# else:  
#     joblib.dump(logistic_regression, f"model/{DATASET}/logistic_regression.joblib")
#     joblib.dump(decision_tree, f"model/{DATASET}/decision_tree.joblib")
#     joblib.dump(random_forest, f"model/{DATASET}/random_forest.joblib")
#     joblib.dump(svm_classifier, f"model/{DATASET}/svm_classifier.joblib")

### Deep-learning

#### Neural networks

In [None]:
# # Train Neural networks
# model = keras.Sequential()
# model.add(keras.layers.Dense(32, activation='relu', input_shape=(len(FEATURES),)))
# model.add(keras.layers.Dense(8, activation='relu'))
# model.add(keras.layers.Dense(1, activation='sigmoid'))

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# early_stopper = EarlyStopping(monitor="val_accuracy", mode="max", patience=5, restore_best_weights=True)

# if SCENARIO == "imbalance_handle":
#     history = model.fit(
#         X_train[ENC_FEATURES],
#         y_train, 
#         epochs=100, 
#         batch_size=1024,
#         validation_data=(X_val[ENC_FEATURES], y_val), 
#         callbacks=[early_stopper], 
#         class_weight=class_weight_dict
#     )
# else:
#     history = model.fit(
#         X_train[ENC_FEATURES],
#         y_train, 
#         epochs=100, 
#         batch_size=1024,
#         validation_data=(X_val[ENC_FEATURES], y_val), 
#         callbacks=[early_stopper]
#     )

# # Plot the accuracy history
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.legend(['Train', 'Validation'], loc='upper right')
# plt.show()

In [None]:
# Load the classifiers: pick the saved dense network that matches the scenario,
# falling back to the unprefixed file for any other scenario value.
_nn_paths = {
    "imbalance_handle": f"model/{DATASET}/classweight_neural_network.h5",
    "standardization": f"model/{DATASET}/scaled_neural_network.h5",
}
model = tf.keras.models.load_model(
    _nn_paths.get(SCENARIO, f"model/{DATASET}/neural_network.h5")
)

In [None]:
# Raw model outputs for the test features (presumably sigmoid probabilities,
# as in the commented-out training cell -- confirm against the saved model).
predictions = model.predict(X_test[ENC_FEATURES])

# Continuous scores, flattened to 1-D, for the ranking metric.
y_score = predictions.ravel()

# Define the threshold for binary classification
threshold = 0.5
y_pred = (predictions >= threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is computed from the scores; using the thresholded labels would
# reduce the curve to the single operating point at `threshold`.
roc_auc = roc_auc_score(y_test, y_score)
confusion_mat = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")
print(confusion_mat)

In [None]:
# # Save the classifiers
# if SCENARIO == "imbalance_handle":
#     model.save(f"model/{DATASET}/classweight_neural_network.h5")
# elif SCENARIO == "standardization":
#     model.save(f"model/{DATASET}/scaled_neural_network.h5")
# else:  
#     model.save(f"model/{DATASET}/neural_network.h5")

#### CNN

In [None]:
# # Train CNN
# cnn_model = Sequential()
# cnn_model.add(Conv1D(32, 3, activation='relu', input_shape=(len(FEATURES), 1)))
# cnn_model.add(MaxPooling1D(2))
# cnn_model.add(Flatten())
# cnn_model.add(Dense(128, activation='relu'))
# cnn_model.add(Dense(1, activation='sigmoid'))
# cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Convert the data to the required input shape for CNN (num_samples, num_features, num_channels)
# X_train_cnn = np.expand_dims(X_train[ENC_FEATURES], axis=2)
# X_val_cnn = np.expand_dims(X_val[ENC_FEATURES], axis=2)

# early_stopper = EarlyStopping(patience=5, restore_best_weights=True)

# if SCENARIO == "imbalance_handle":
#     history = cnn_model.fit(
#         X_train_cnn, y_train, 
#         epochs=100, 
#         batch_size=1024,
#         validation_data=(X_val_cnn, y_val), 
#         callbacks=[early_stopper],
#         class_weight=class_weight_dict
#     )
# else:
#     history = cnn_model.fit(
#         X_train_cnn, y_train, 
#         epochs=100, 
#         batch_size=1024,
#         validation_data=(X_val_cnn, y_val), 
#         callbacks=[early_stopper]
#     )

# # Plot the accuracy history
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.legend(['Train', 'Validation'], loc='upper right')
# plt.show()

In [None]:
# Load the classifiers: pick the saved CNN that matches the scenario, falling
# back to the unprefixed file for any other scenario value.
_cnn_paths = {
    "imbalance_handle": f"model/{DATASET}/classweight_cnn.h5",
    "standardization": f"model/{DATASET}/scaled_cnn.h5",
}
cnn_model = tf.keras.models.load_model(
    _cnn_paths.get(SCENARIO, f"model/{DATASET}/cnn.h5")
)

In [None]:
# Evaluate the model on the test data.
# A trailing axis is added so the input is (num_samples, num_features, 1).
X_test_cnn = np.expand_dims(X_test[ENC_FEATURES], axis=2)
predictions = cnn_model.predict(X_test_cnn)

# Continuous scores, flattened to 1-D, for the ranking metric.
y_score = predictions.ravel()

# Define the threshold for binary classification
threshold = 0.5
y_pred = (predictions >= threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is computed from the scores; using the thresholded labels would
# reduce the curve to the single operating point at `threshold`.
roc_auc = roc_auc_score(y_test, y_score)
confusion_mat = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")
print(confusion_mat)
print("-"*50)

In [None]:
# # Save the classifiers
# if SCENARIO == "imbalance_handle":
#     cnn_model.save(f"model/{DATASET}/classweight_cnn.h5")
# elif SCENARIO == "standardization":
#     cnn_model.save(f"model/{DATASET}/scaled_cnn.h5")
# else:  
#     cnn_model.save(f"model/{DATASET}/cnn.h5")

#### LSTM

In [None]:
# Prepare the input data for the LSTM model: every row becomes a sequence of
# `input_dim` steps with a single value per step.
input_dim = len(FEATURES)
input_shape = (input_dim, 1)  # trailing axis of size 1 added for the LSTM input

# Reshape the train and validation features to (num_samples, input_dim, 1).
X_train_lstm, X_val_lstm = (
    frame[ENC_FEATURES].to_numpy().reshape((-1,) + input_shape)
    for frame in (X_train, X_val)
)

In [None]:
# # Train the LSTMs
# inputs = Input(shape=input_shape)
# lstm_layer = LSTM(32, activation='relu')(inputs)
# dropout_layer = Dropout(0.1)(lstm_layer)
# outputs = Dense(1, activation='sigmoid')(dropout_layer)
# lstm_model = Model(inputs=inputs, outputs=outputs)

# # Compile the model
# lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Train the model
# early_stopper = EarlyStopping(patience=3, restore_best_weights=True)

# if SCENARIO == "imbalance_handle":
#     history = lstm_model.fit(
#         X_train_lstm, y_train, 
#         epochs=20, 
#         validation_data=(X_val_lstm, y_val), 
#         callbacks=[early_stopper],
#         class_weight=class_weight_dict
#     )
# else:
#     history = lstm_model.fit(
#         X_train_lstm, y_train, 
#         epochs=20, 
#         validation_data=(X_val_lstm, y_val), 
#         callbacks=[early_stopper]
#     )

# # Plot the accuracy history
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.legend(['Train', 'Validation'], loc='upper right')
# plt.show()

In [None]:
# Load the classifiers: pick the saved LSTM that matches the scenario, falling
# back to the unprefixed file for any other scenario value.
_lstm_paths = {
    "imbalance_handle": f"model/{DATASET}/classweight_lstm.h5",
    "standardization": f"model/{DATASET}/scaled_lstm.h5",
}
lstm_model = tf.keras.models.load_model(
    _lstm_paths.get(SCENARIO, f"model/{DATASET}/lstm.h5")
)

In [None]:
# Evaluate the model on the test data.
# The test features are reshaped to (num_samples, input_dim, 1).
X_test_lstm = X_test[ENC_FEATURES].values.reshape((-1, input_dim, 1))
predictions = lstm_model.predict(X_test_lstm)

# Continuous scores, flattened to 1-D, for the ranking metric.
y_score = predictions.ravel()

# Define the threshold for binary classification
threshold = 0.5
y_pred = (predictions >= threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is computed from the scores; using the thresholded labels would
# reduce the curve to the single operating point at `threshold`.
roc_auc = roc_auc_score(y_test, y_score)
confusion_mat = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")
print(confusion_mat)
print("-"*50)

In [None]:
# # Save the classifiers
# if SCENARIO == "imbalance_handle":
#     lstm_model.save(f"model/{DATASET}/classweight_lstm.h5")
# elif SCENARIO == "standardization":
#     lstm_model.save(f"model/{DATASET}/scaled_lstm.h5")
# else:  
#     lstm_model.save(f"model/{DATASET}/lstm.h5")

### TabTransformer

In [None]:
import tensorflow_addons as tfa

from tabtransformertf.models.fttransformer import FTTransformerEncoder, FTTransformer
from tabtransformertf.models.tabtransformer import TabTransformer
from tabtransformertf.utils.preprocessing import df_to_dataset, build_categorical_prep

In [None]:
def df_to_dataset(
    dataframe: pd.DataFrame,
    target: str = None,
    shuffle: bool = True,
    batch_size: int = 512,
):
    """Convert a DataFrame into a batched ``tf.data.Dataset`` of per-column tensors.

    NOTE: this local definition shadows the ``df_to_dataset`` imported from
    ``tabtransformertf.utils.preprocessing`` earlier in the notebook.

    Args:
        dataframe: Source frame; it is copied, so the caller's frame is not modified.
        target: Name of the label column. When given, the column is removed from
            the features and the dataset yields ``(features, label)`` pairs;
            otherwise it yields the feature dict only.
        shuffle: Shuffle the rows with a buffer as large as the frame.
        batch_size: Number of rows per batch.

    Returns:
        A batched, prefetching ``tf.data.Dataset`` whose features are a dict
        mapping each column name to a tensor with an extra trailing axis.
    """
    features = dataframe.copy()
    labels = features.pop(target) if target else None

    # One tensor per column, expanded on axis 1 (similar to value[:, tf.newaxis]).
    columns = {
        name: tf.expand_dims(values, axis=1) for name, values in features.items()
    }

    if labels is not None:
        dataset = tf.data.Dataset.from_tensor_slices((columns, labels))
    else:
        dataset = tf.data.Dataset.from_tensor_slices(columns)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(dataframe))
    dataset = dataset.batch(batch_size)
    # After batch() the dataset elements are whole batches, so prefetch(batch_size)
    # would buffer up to `batch_size` batches. Let tf.data size the buffer instead.
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [None]:
def _assemble_frame(features, labels):
    """Join the feature columns with the labels and set the column data types."""
    frame = pd.concat([features[FEATURES], labels], axis=1)
    # Set data types: categorical columns as strings, numeric columns as floats.
    frame[CATEGORICAL_FEATURES] = frame[CATEGORICAL_FEATURES].astype(str)
    frame[NUMERIC_FEATURES] = frame[NUMERIC_FEATURES].astype(float)
    return frame


train_df = _assemble_frame(X_train, y_train)
val_df = _assemble_frame(X_val, y_val)
test_df = _assemble_frame(X_test, y_test)

In [None]:
# To TF Dataset. No split is shuffled; for the test split this keeps the
# prediction order aligned with y_test in the evaluation cell.
_dataset_options = dict(batch_size=1024, shuffle=False)
train_dataset, val_dataset, test_dataset = (
    df_to_dataset(frame, LABEL, **_dataset_options)
    for frame in (train_df, val_df, test_df)
)

#### fttransformer

In [None]:
# # Train the tab transformer
# ft_linear_encoder = FTTransformerEncoder(
#     numerical_features=NUMERIC_FEATURES,
#     categorical_features=CATEGORICAL_FEATURES,
#     numerical_data=train_df[NUMERIC_FEATURES].values,
#     categorical_data=train_df[CATEGORICAL_FEATURES].values,
#     y=None,
#     numerical_embedding_type='linear',
#     embedding_dim=16,
#     depth=4,
#     heads=8,
#     attn_dropout=0.2,
#     ff_dropout=0.2,
#     explainable=True
# )

# # Pass the encoder to the model
# ft_linear_transformer = FTTransformer(
#     encoder=ft_linear_encoder,
#     out_dim=1,
#     out_activation='sigmoid',
# )

# ft_linear_transformer.compile(
#     optimizer="adam",
#     loss="binary_crossentropy",
#     metrics=["accuracy"]
# )

# early = EarlyStopping(monitor="val_output_loss", mode="min", patience=10, restore_best_weights=True)
# callback_list = [early]

# if SCENARIO == "imbalance_handle":
#     ft_linear_history = ft_linear_transformer.fit(
#         train_dataset, 
#         epochs=20, 
#         validation_data=val_dataset,
#         callbacks=callback_list,
#         class_weight=class_weight_dict
#     )
# else:
#     ft_linear_history = ft_linear_transformer.fit(
#         train_dataset, 
#         epochs=20, 
#         validation_data=val_dataset,
#         callbacks=callback_list
#     )


# # Plot the accuracy history
# plt.plot(ft_linear_history.history['output_accuracy'])
# plt.plot(ft_linear_history.history['val_output_accuracy'])
# plt.title('Model Acc')
# plt.xlabel('Epoch')
# plt.ylabel('Acc')
# plt.legend(['Train', 'Validation'], loc='upper right')
# plt.show()

In [None]:
# Load the classifiers: pick the saved FT-Transformer directory that matches
# the scenario, falling back to the unprefixed one for any other scenario value.
_ft_paths = {
    "imbalance_handle": f"model/{DATASET}/classweight_ft_linear_transformer",
    "standardization": f"model/{DATASET}/scaled_ft_linear_transformer",
}
ft_linear_transformer = tf.keras.models.load_model(
    _ft_paths.get(SCENARIO, f"model/{DATASET}/ft_linear_transformer")
)

In [None]:
# Evaluate the model on the test data.
# `test_dataset` is built without shuffling, so the predictions are in the
# same row order as `y_test`.
predictions = ft_linear_transformer.predict(test_dataset)
y = y_test

# The model returns a dict; the "output" entry holds the scores used here.
# They are flattened to 1-D for the ranking metric.
y_score = np.ravel(predictions["output"])

# Define the threshold for binary classification
threshold = 0.5
y_pred = (predictions["output"] >= threshold).astype(int)

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
# ROC AUC is computed from the scores; using the thresholded labels would
# reduce the curve to the single operating point at `threshold`.
roc_auc = roc_auc_score(y, y_score)
confusion_mat = confusion_matrix(y, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")
print(confusion_mat)
print("-"*50)

In [None]:
# # Save the classifiers
# if SCENARIO == "imbalance_handle":
#     ft_linear_transformer.save(f"model/{DATASET}/classweight_ft_linear_transformer", save_format="tf")
# elif SCENARIO == "standardization":
#     ft_linear_transformer.save(f"model/{DATASET}/scaled_ft_linear_transformer", save_format="tf")
# else:  
#     ft_linear_transformer.save(f"model/{DATASET}/ft_linear_transformer", save_format="tf")