In [1]:
%%capture
!pip install pytorch-tabnet

This is a demo notebook that provides an example of how to classify between competitive and cooperative environments. In this notebook we use the [TabNet model](https://pypi.org/project/pytorch-tabnet/).

This is a binary classification task, where 1 is a label for facultative cooperation and 0 is a label for competition.

Data is stored in $\texttt{.csv}$ format as a table, where the columns correspond to chemical compounds and each row represents an environment.

In [2]:
import numpy as np
import pandas as pd
import os
import shutil
import joblib
import pickle
import json

# ml frameworks
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification # for test of funcs
import matplotlib.pyplot as plt
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    precision_recall_curve,
    auc,
    RocCurveDisplay,
    PrecisionRecallDisplay,
)

import torch
#time management
from tqdm import tqdm
import time

#stats
from scipy.stats import wasserstein_distance
from scipy.spatial.distance import cdist
from itertools import combinations
#download data from hub
from huggingface_hub import hf_hub_download

In [3]:
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.callbacks import EarlyStopping

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cpu


Downloading the data from our [Hugging Face repo](https://huggingface.co/datasets/powidla/Friend-Or-Foe).

In [17]:
REPO_ID = "powidla/Friend-Or-Foe"

# Feature tables: paths inside the dataset repository
X_train_ID = "Classification/AGORA/100/BC-I/X_train_BC-I-100.csv"
X_val_ID = "Classification/AGORA/100/BC-I/X_val_BC-I-100.csv"
X_test_ID = "Classification/AGORA/100/BC-I/X_test_BC-I-100.csv"

# Label tables: paths inside the dataset repository
y_train_ID = "Classification/AGORA/100/BC-I/y_train_BC-I-100.csv"
y_val_ID = "Classification/AGORA/100/BC-I/y_val_BC-I-100.csv"
y_test_ID = "Classification/AGORA/100/BC-I/y_test_BC-I-100.csv"


def _read_hub_csv(filename):
    """Download one file from the dataset repo and load it as a pandas DataFrame."""
    local_path = hf_hub_download(repo_id=REPO_ID, filename=filename, repo_type="dataset")
    return pd.read_csv(local_path)


# Files are fetched in the order train, val, test: features first, then labels.
X_train, X_val, X_test = (_read_hub_csv(name) for name in (X_train_ID, X_val_ID, X_test_ID))
y_train, y_val, y_test = (_read_hub_csv(name) for name in (y_train_ID, y_val_ID, y_test_ID))

Print the first 10 environments from the DataFrame.

In [None]:
# Display the first ten rows of the training features.
X_train.iloc[:10]

The abbreviated compound names serve as the column names.

In [None]:
# Display the column labels of the training features as an array.
X_train.columns.values

Extract values from DataFrame and fix shape of labels

In [18]:
# np.asarray accepts DataFrames as well as ndarrays, so this cell can be re-run
# safely; calling .values on an already-converted ndarray raises AttributeError.
X_train = np.asarray(X_train)
X_val = np.asarray(X_val)
X_test = np.asarray(X_test)

# Flatten the label tables into 1-D vectors.
y_train = np.asarray(y_train).reshape(-1)
y_val = np.asarray(y_val).reshape(-1)
y_test = np.asarray(y_test).reshape(-1)

In [6]:
def create_confusion_matrix(y_true, y_pred, labels=None):
    '''
    Description: Create a 2x2 confusion matrix for binary classification.
    Arguments: y_true (array-like): Ground truth labels;
               y_pred (array-like): Predicted labels;
               labels (sequence, optional): The two class labels, negative class first.
                   When omitted and every observed label is 0 or 1, [0, 1] is used so
                   the matrix stays 2x2 even if one class is absent from the inputs;
                   otherwise the labels are inferred by scikit-learn.
    Outputs:
        pd.DataFrame: A confusion matrix as a pandas DataFrame. Rows correspond to the
                      actual class and columns to the predicted class.
    '''
    if labels is None:
        observed = set(np.unique(y_true)) | set(np.unique(y_pred))
        # Without explicit labels, sklearn returns a 1x1 matrix when only one class
        # is present, which does not fit the 2x2 DataFrame built below.
        if observed <= {0, 1}:
            labels = [0, 1]

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # Row labels are kept as in the original output; they denote the actual class.
    cm_df = pd.DataFrame(cm, index=["True Negative", "True Positive"],
                             columns=["Predicted Negative", "Predicted Positive"])
    return cm_df


def score_metrics(y_true, y_pred, y_prob):
    '''
    Description: Compute a set of binary-classification scores.
    Arguments: y_true (array-like): Ground truth labels;
               y_pred (array-like): Predicted labels;
               y_prob (array-like): Predicted probabilities for the positive class.
    Outputs:
        dict: score name -> value, with keys "Accuracy", "ROC AUC", "Precision",
              "Recall", "F1 Score", "MCC" and "PR AUC".
    '''
    results = dict()
    results["Accuracy"] = accuracy_score(y_true, y_pred)
    results["ROC AUC"] = roc_auc_score(y_true, y_prob)

    # The remaining label-based scores all share the (y_true, y_pred) signature.
    label_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
        ("MCC", matthews_corrcoef),
    )
    for score_name, scorer in label_scorers:
        results[score_name] = scorer(y_true, y_pred)

    # Area under the precision-recall curve (recall on the x axis).
    curve_precision, curve_recall, _ = precision_recall_curve(y_true, y_prob)
    results["PR AUC"] = auc(curve_recall, curve_precision)
    return results


def train_and_evaluate_tabnet(X_train, y_train, X_val, y_val, X_test, y_test,
                             output_dir="tabnet_results", seed=4221,
                             max_epochs=100, patience=10):
    '''
    Description: Train a TabNet classifier for binary classification, evaluate it on
                 the test split, and save the metrics and the fitted model to disk.

    Arguments:
        X_train, y_train: Training data
        X_val, y_val: Validation data (used as the evaluation set during fitting)
        X_test, y_test: Test data
        output_dir: Directory to save results (created if missing)
        seed: Random seed passed to TabNetClassifier
        max_epochs: Maximum number of training epochs passed to fit
        patience: Patience value passed to fit

    Outputs:
        dict: Test metrics with keys "Accuracy", "Precision", "ROC AUC", "Recall",
              "F1" and "MCC". The same dict is written to
              <output_dir>/tabnet_metrics.json and the model is saved as
              <output_dir>/tabnet_model.zip.

    Note: relies on the module-level `device` variable for the TabNet device name.
    '''

    os.makedirs(output_dir, exist_ok=True)

    # Init
    clf = TabNetClassifier(
        cat_idxs=[],
        cat_dims=[],
        cat_emb_dim=1,
        optimizer_fn=torch.optim.AdamW,
        optimizer_params=dict(lr=1e-4, weight_decay=0.02),
        scheduler_params={"step_size":50, "gamma":0.99},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax',
        n_d=64,
        n_a=64,
        seed=seed,
        device_name=device
    )

    # Train; validation accuracy is the monitored metric
    clf.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=['accuracy'],
        max_epochs=max_epochs,
        patience=patience,
        batch_size=1024
    )

    # Test: hard labels for threshold metrics, positive-class probability for ROC AUC
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_proba),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

    with open(os.path.join(output_dir, "tabnet_metrics.json"), 'w') as f:
        json.dump(metrics, f, indent=4)

    # save_model appends ".zip" to the given path itself; passing a name that already
    # ends in ".zip" produced "tabnet_model.zip.zip".
    clf.save_model(os.path.join(output_dir, "tabnet_model"))

    print("\nTest Metrics:")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"MCC: {metrics['MCC']:.4f}")

    return metrics

Run the baseline TabNet model and score it on the test set. The final output is a $\texttt{.json}$ file with the metrics for binary classification.

In [20]:
# Fit the baseline with default settings; the returned metrics dict is shown as the cell output.
train_and_evaluate_tabnet(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
)



epoch 0  | loss: 1.79555 | val_0_accuracy: 0.57577 |  0:00:56s
epoch 1  | loss: 1.01109 | val_0_accuracy: 0.644   |  0:01:54s
epoch 2  | loss: 0.85879 | val_0_accuracy: 0.65267 |  0:02:53s
epoch 3  | loss: 0.81467 | val_0_accuracy: 0.65717 |  0:03:55s
epoch 4  | loss: 0.78939 | val_0_accuracy: 0.65788 |  0:04:55s
epoch 5  | loss: 0.769   | val_0_accuracy: 0.65998 |  0:05:52s
epoch 6  | loss: 0.7535  | val_0_accuracy: 0.66097 |  0:06:54s
epoch 7  | loss: 0.7441  | val_0_accuracy: 0.66209 |  0:07:55s
epoch 8  | loss: 0.73    | val_0_accuracy: 0.66402 |  0:08:58s
epoch 9  | loss: 0.72586 | val_0_accuracy: 0.66664 |  0:09:58s
epoch 10 | loss: 0.71568 | val_0_accuracy: 0.66868 |  0:10:54s
epoch 11 | loss: 0.70933 | val_0_accuracy: 0.66983 |  0:11:52s
epoch 12 | loss: 0.70222 | val_0_accuracy: 0.66637 |  0:12:48s
epoch 13 | loss: 0.69702 | val_0_accuracy: 0.66735 |  0:13:43s
epoch 14 | loss: 0.69188 | val_0_accuracy: 0.66984 |  0:14:39s
epoch 15 | loss: 0.68661 | val_0_accuracy: 0.67067 |  0



Successfully saved model at tabnet_results/tabnet_model.zip.zip

Test Metrics:
Accuracy: 0.6962
MCC: 0.1901


{'Accuracy': 0.6962477209003018,
 'Precision': 0.7013915379894847,
 'ROC AUC': np.float64(0.6500135335647999),
 'Recall': 0.9587168442530039,
 'F1': 0.8101108205702902,
 'MCC': np.float64(0.19006778546753422)}