# feature generation

In [None]:
from pathlib import Path

import pandas as pd
from mordred import Calculator, descriptors
from padelpy import padeldescriptor
from rdkit import Chem

# Placeholder: replace with the directory that holds the input structures.
sdf_dir = "path to SDF or SMILES files"

# PaDEL
# The descriptor table is written to this CSV and read back in right after.
padel_csv = "PaDEL.csv"
padel_options = {
    # which descriptor families to compute
    "d_2d": True,
    "d_3d": True,
    # structure handling
    "detectaromaticity": True,
    "retain3d": True,
    # input / output
    "mol_dir": sdf_dir,
    "d_file": padel_csv,
    "usefilenameasmolname": True,
    # runtime behaviour
    "maxruntime": 120000,
    "headless": True,
}
padeldescriptor(**padel_options)

# First CSV column becomes the index; the "p_" prefix marks the columns as
# PaDEL features.
df_PaDEL = pd.read_csv(padel_csv, index_col=0).add_prefix("p_")


# mordred
# Compute all mordred descriptors (3D included) for every *.sdf file in sdf_dir.
calc = Calculator(descriptors, ignore_3D=False)
# glob() yields files in filesystem order; sort so that the row order of the
# resulting table is reproducible between runs and machines.
files = sorted(Path(sdf_dir).glob("*.sdf"))
# RDKit returns None (instead of raising) for a file it cannot parse. Keep every
# molecule paired with its file stem so that dropping failures cannot shift the
# IDs relative to the descriptor rows.
parsed = [(f.stem, Chem.MolFromMolFile(str(f))) for f in files]
unreadable = [mol_id for mol_id, mol in parsed if mol is None]
if unreadable:
    print(f"Skipping {len(unreadable)} unreadable SDF file(s): {unreadable}")
IDs = [mol_id for mol_id, mol in parsed if mol is not None]
mols = [mol for _, mol in parsed if mol is not None]
df = calc.pandas(mols)
# Any entry that is not a number becomes NaN.
df = df.apply(pd.to_numeric, errors="coerce")
# Index rows by file stem; the "m_" prefix marks the columns as mordred features.
df.index = IDs
df_mordred = df.add_prefix("m_")

# concatenation of all features and preprocessing/normalisation

# model training

In [None]:
import itertools
import numpy as np
import pandas as pd
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from prototorch import initializers, models
from prototorch.datasets.abstract import NumpyDataset
from pytorch_lightning import Trainer
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split


# BEFORE: split data into test, training and validation subsets
# according to the ADMETLab2.0 data or randomly.
# Only the random split is implemented here; when randomCV is false,
# x_train/y_train, x_val/y_val and x_test/y_test must already be defined,
# because the code below reads them unconditionally.
if randomCV:
    # Hold out the test subset first, keeping the class proportions of `targets`.
    x_train, x_test, y_train, y_test = train_test_split(
        np.array(data),
        targets,
        test_size=test_size,  # set to similar values like in ADMETLab's original CV
        random_state=random_state,
        shuffle=True,
        stratify=targets,
    )

    # Carve the validation subset out of the remaining training data.
    # The results must be named x_val / y_val: those are the names the
    # validation dataset is built from further down.
    # NOTE(review): unlike the split above, this one is not stratified — confirm
    # that this is intended.
    x_train, x_val, y_train, y_val = train_test_split(
        x_train,
        y_train,
        test_size=valid_size,  # set to similar values like in ADMETLab's original CV
        random_state=random_state,
        shuffle=True,
    )


# Wrap the training and validation splits as datasets and loaders.
# No batch_size/shuffle arguments are passed, so the DataLoader defaults apply.
train_ds = NumpyDataset(x_train, y_train)
val_ds = NumpyDataset(x_val, y_val)
train_loader, val_loader = DataLoader(train_ds), DataLoader(val_ds)

# Hyperparameters (the ___ entries are placeholders to be filled in)
hparams = {
    # number of feature columns in the training data
    "input_dim": train_ds.data.shape[1],
    "latent_dim": ___,
    "distribution": {"num_classes": ___, "per_class": ___},
}

# Initialize the model: prototypes are initialised from the training dataset,
# the omega matrix from the training feature array.
prototypes_init = initializers.SMCI(train_ds)
omega_init = initializers.PCALTI(train_ds.data)
model = models.GMLVQ(
    hparams,
    optimizer=Adam,
    prototypes_initializer=prototypes_init,
    omega_initializer=omega_init,
)

# Trainer settings (device, logging, ...) go into trainer_kwargs.
trainer_kwargs = {}
trainer = Trainer(**trainer_kwargs)

# Training loop
trainer.fit(model, train_loader, val_loader)

# test

In [None]:
# ...continues
# roc_auc_score is not among the metrics imported in the training cell, so it
# is imported here; without it the AUC line raises NameError.
from sklearn.metrics import roc_auc_score

# Predict the held-out test subset and convert the result to a plain list.
preds = model.predict(torch.Tensor(np.array(x_test))).tolist()

# NOTE(review): `preds` is also fed to accuracy_score, i.e. it holds class
# labels rather than continuous scores, so this AUC reflects a single
# operating point — confirm that this is the intended metric.
auc = roc_auc_score(y_test, preds)
acc = accuracy_score(y_test, preds)
mcc = matthews_corrcoef(y_test, preds)

# Unpacking four values requires a 2x2 confusion matrix (binary task).
cm = confusion_matrix(y_test, preds)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (fp + tn)  # true-negative rate
sensitivity = tp / (tp + fn)  # true-positive rate