# Build PIH classification model with AutoGluon

In [1]:
import os
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Prepare data

In [2]:
# Alternative input CSV files; activate exactly one assignment.
# in_file = "./data/pih_rdkit_fps.csv"
# in_file = "./data/pih_flatring_fps.csv"
# in_file = "./data/pih_flatring_rdkit_fps.csv"
# in_file = "./data/pih_cats.csv"
in_file = "./data/pih_fps.csv"

# File name without directory or extension (reused to name the saved model)
base = os.path.basename(in_file)
in_name, _ext = os.path.splitext(base)

data = TabularDataset(data=in_file)

# Partition the rows by the "Set" column, dropping that column from each split
df_train = data.loc[data["Set"] == "Train"].drop(columns=["Set"])
df_test = data.loc[data["Set"] == "Test"].drop(columns=["Set"])
df_ext = data.loc[data["Set"] == "Ext"].drop(columns=["Set"])

id_columns = ["Substance", "Canonical_Smiles"]
# All remaining columns minus the identifiers and one more column (the label)
num_features = len(df_train.columns) - len(id_columns) - 1

# Report the shape and class balance of every split
print("-" * 30)
print(f"#Features: {num_features}")
for title, frame in (
    ("Train data", df_train),
    ("Test data", df_test),
    ("External data", df_ext),
):
    print(title, frame.shape)
    print(frame.Photosensitation.value_counts())

------------------------------
#Features: 1025
Train data (998, 1027)
no     602
yes    396
Name: Photosensitation, dtype: int64
Test data (306, 1027)
no     183
yes    123
Name: Photosensitation, dtype: int64
External data (104, 1027)
yes    66
no     38
Name: Photosensitation, dtype: int64


## Training

In [None]:
label_column = 'Photosensitation'
save_path = "./models/ag-binary-model-" + in_name

# Learner settings: "yes" is the positive class, and the identifier columns
# are excluded from the model inputs.
learner_options = {
    'positive_class': 'yes',
    'ignored_columns': id_columns,
}

predictor = TabularPredictor(
    label=label_column,
    path=save_path,
    problem_type='binary',
    eval_metric='accuracy',
    learner_kwargs=learner_options,
)

# Optional restricted model zoo. It is NOT handed to fit() at the moment
# (the corresponding fit option below is commented out).
# Further candidates that are currently disabled:
#   "NN": {}
#   "GBM": [{"extra_trees": True, "ag_args": {"name_suffix": "XT"}}, {}, "GBMLarge"]
#   "FASTAI": {}
hyperparameters = {
    "CAT": {},
    "XGB": {},
    "RF": [{"criterion": name} for name in ("gini", "entropy")],
    "XT": [{"criterion": name} for name in ("gini", "entropy")],
}

fit_options = {
    # 'time_limit': 600,
    'presets': 'best_quality',
    # 'hyperparameters': hyperparameters,
    'verbosity': 2,
    'num_bag_folds': 5,
    'num_bag_sets': 20,
}

predictor.fit(train_data=df_train, **fit_options)

In [None]:
# Request a summary of the completed fit and keep the returned object in `results`.
results = predictor.fit_summary(verbosity=1)

## Inference

In [None]:
# Evaluate on the test split and print the predictor's configured metric
output = predictor.evaluate(df_test)
metric_name = predictor.eval_metric.name
print(f"{metric_name}: {output[metric_name]}")
# output

In [None]:
label_column = 'Photosensitation'

# Score predicted class probabilities for the test split against the
# observed labels, requesting the detailed report.
y_true = df_test[label_column]
test_features = df_test.drop(columns=[label_column])
y_pred = predictor.predict_proba(test_features)

perf = predictor.evaluate_predictions(
    y_true=y_true,
    y_pred=y_pred,
    auxiliary_metrics=True,
    silent=True,
    detailed_report=True,
)
# Tag the results with the name of the input dataset
perf["dataset"] = in_name
print('Performance of model on test data')
perf
  

In [None]:
def calc_sensitivity(df):
    """Return the sensitivity (true-positive rate) of a confusion matrix.

    df contains the confusion_matrix as pandas DataFrame, labelled
    "yes"/"no" on both axes. Rows are the observed classes and columns
    the predicted classes; "yes" is the positive class.

    Returns TP / (TP + FN), or NaN when there are no observed positives.
    """
    tp = df.loc["yes", "yes"]
    # False negatives: observed "yes" (row) but predicted "no" (column).
    fn = df.loc["yes", "no"]
    observed_positives = tp + fn
    if observed_positives == 0:
        return float("nan")
    return tp / observed_positives


def calc_specifity(df):
    """Return the specificity (true-negative rate) of a confusion matrix.

    df contains the confusion_matrix as pandas DataFrame, labelled
    "yes"/"no" on both axes. Rows are the observed classes and columns
    the predicted classes; "yes" is the positive class.

    Returns TN / (TN + FP), or NaN when there are no observed negatives.
    """
    # False positives: observed "no" (row) but predicted "yes" (column).
    fp = df.loc["no", "yes"]
    tn = df.loc["no", "no"]
    observed_negatives = tn + fp
    if observed_negatives == 0:
        return float("nan")
    return tn / observed_negatives

# Pull the confusion matrix out of the detailed performance report
confusion_matrix = perf["confusion_matrix"]

sensitivity = calc_sensitivity(confusion_matrix)
specifity = calc_specifity(confusion_matrix)
print(f"Sensitivity: {sensitivity:.4}")
print(f"Specifity  : {specifity:.4}")

# Relabel both axes for display: rows are observed, columns are predicted
print("Confusion matrix")
cm = confusion_matrix.rename(
    index={'yes': 'observed positive', 'no': 'observed negative'},
    columns={"yes": 'predicted positive', 'no': 'predicted negative'},
)
cm

In [None]:
# Ask the predictor for its model leaderboard, passing the test split as data.
# NOTE(review): silent=True presumably suppresses the printed table; the
# result is shown below as the cell's value instead.
df_leaderboard = predictor.leaderboard(df_test, silent=True)
df_leaderboard

In [None]:
# Run the test split through the predictor's transform_features() and display
# the result (presumably the feature table as the models receive it — confirm).
test_data_transformed = predictor.transform_features(df_test)
test_data_transformed

In [None]:
# df_importance = predictor.feature_importance(
#     data=df_test,
#     # subsample_size=5000,
#     # num_shuffle_sets=10,
# )


In [None]:
# df_importance