# Build PIH classification model with Autogluon

In [1]:
import os
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Prepare data

In [2]:
rdkit_file = "./data/pih_rdkit.csv"
flatring_file = "./data/pih_flatring.csv"
fps_file = "./data/pih_fps.csv"
rdkit_fps_file = "./data/pih_rdkit_fps.csv"
flatring_fps_file = "./data/pih_flatring_fps.csv"
flatring_rdkit_fps_file = "./data/pih_flatring_rdkit_fps.csv"

# Select feature file
in_file = fps_file 
base = os.path.basename(in_file)
in_name = os.path.splitext(base)[0]

data = TabularDataset(data=in_file)
df_train = data[data.Set == "Train"].copy()
df_test = data[data.Set == "Test"].copy()

df_train.drop(columns=["Set"], inplace=True)
df_test.drop(columns=["Set"], inplace=True)

print("train dataset", df_train.shape)
print(df_train.Photosensitation.value_counts())
print("test dataset", df_test.shape)
print(df_test.Photosensitation.value_counts())

train dataset (996, 1034)
no     600
yes    396
Name: Photosensitation, dtype: int64
test dataset (306, 1034)
no     183
yes    123
Name: Photosensitation, dtype: int64


## Training

In [3]:
label_column = 'Photosensitation'
save_path = f"./models/ag-binary-model-{in_name}"
id_columns = ["Substance", "Canonical_Smiles"]

predictor = TabularPredictor(
    label=label_column,
    path=save_path,
    problem_type='binary',
    eval_metric='roc_auc',
    learner_kwargs={
        'positive_class': 'yes',
        'ignored_columns': id_columns
    },
)
predictor.fit(
    train_data=df_train,
    presets='best_quality',
    #auto_stack=True,
    verbosity=0,
)

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7ff5e8086890>

In [4]:
results = predictor.fit_summary(verbosity=1)

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3   0.729478       7.932170  222.414333                0.001377           1.295963            3       True         28
1     ExtraTreesGini_BAG_L2   0.728083       7.091479  205.047055                0.217727           0.826044            2       True         22
2       WeightedEnsemble_L2   0.722066       3.107671  132.034148                0.001089           1.364263            2       True         14
3     ExtraTreesEntr_BAG_L2   0.721879       7.097965  204.945586                0.224213           0.724575            2       True         23
4   RandomForestGini_BAG_L2   0.721425       7.092590  205.034567                0.218838           0.813556            2       True         19
5   RandomForestEntr_BAG_L2   0.720495       7.077029  204.971667         

## Inference

In [5]:
label_column = 'Photosensitation'

# Evaluate model on test data
y_pred = predictor.predict_proba(df_test.drop(columns=[label_column]))
y_true = df_test[label_column]
perf = predictor.evaluate_predictions(
    y_true=y_true,
    y_pred=y_pred,
    auxiliary_metrics=True,
    silent=True,
    detailed_report=True,
)
perf["dataset"] = in_name
perf
  

{'roc_auc': 0.8171398107423697,
 'accuracy': 0.7581699346405228,
 'balanced_accuracy': 0.7245101959216314,
 'mcc': 0.48804793706257626,
 'f1': 0.6476190476190476,
 'precision': 0.7816091954022989,
 'recall': 0.5528455284552846,
 'confusion_matrix':       no  yes
 no   164   19
 yes   55   68,
 'classification_report': {'no': {'precision': 0.7488584474885844,
   'recall': 0.8961748633879781,
   'f1-score': 0.8159203980099503,
   'support': 183},
  'yes': {'precision': 0.7816091954022989,
   'recall': 0.5528455284552846,
   'f1-score': 0.6476190476190476,
   'support': 123},
  'accuracy': 0.7581699346405228,
  'macro avg': {'precision': 0.7652338214454417,
   'recall': 0.7245101959216314,
   'f1-score': 0.731769722814499,
   'support': 306},
  'weighted avg': {'precision': 0.7620229638068422,
   'recall': 0.7581699346405228,
   'f1-score': 0.7482698552057641,
   'support': 306}},
 'dataset': 'pih_flatring_fps'}

In [10]:
df_confusion_matrix = perf["confusion_matrix"]
tp = df_confusion_matrix.iloc[0,0]
fn = df_confusion_matrix.iloc[0,1]
fp = df_confusion_matrix.iloc[1,0]
tn = df_confusion_matrix.iloc[1,1]
sensitivity = tp/(tp+fn)
specifity   = tn/(tn+fp)
print(f"Sensitivity: {sensitivity}")
print(f"Specifity  : {specifity}")
print("Confusion matrix")
(df_confusion_matrix.rename(columns={"yes": 'predicted positive','no': 'predicted negative'})
.rename({'yes': 'observed positive','no': 'observed negative'})
)

Sensitivity: 0.8961748633879781
Specifity  : 0.5528455284552846
Confusion matrix


Unnamed: 0,predicted negative,predicted positive
observed negative,164,19
observed positive,55,68


In [7]:
df_leaderboard = predictor.leaderboard(df_test, silent=True)
df_leaderboard

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestEntr_BAG_L1,0.831734,0.705831,0.125345,0.209043,0.717152,0.125345,0.209043,0.717152,1,True,6
1,WeightedEnsemble_L2,0.826736,0.722066,5.123621,3.107671,132.034148,0.006348,0.001089,1.364263,2,True,14
2,ExtraTreesEntr_BAG_L1,0.826469,0.710894,0.116973,0.216543,0.731285,0.116973,0.216543,0.731285,1,True,9
3,CatBoost_BAG_L2,0.825003,0.708952,9.874318,7.007465,216.329121,0.220928,0.133713,12.10811,2,True,21
4,ExtraTreesEntr_BAG_L2,0.823404,0.721879,9.777328,7.097965,204.945586,0.123938,0.224213,0.724575,2,True,23
5,RandomForestGini_BAG_L1,0.823271,0.707883,0.127844,0.21927,0.815525,0.127844,0.21927,0.815525,1,True,5
6,WeightedEnsemble_L3,0.81714,0.729478,10.357458,7.93217,222.414333,0.006711,0.001377,1.295963,3,True,28
7,NeuralNetFastAI_BAG_L2,0.815318,0.668148,13.573646,9.844595,237.817552,3.920256,2.970843,33.596541,2,True,24
8,CatBoost_BAG_L1,0.814963,0.700476,0.161445,0.142692,19.367333,0.161445,0.142692,19.367333,1,True,7
9,ExtraTreesGini_BAG_L1,0.814807,0.704398,0.128863,0.223692,0.725974,0.128863,0.223692,0.725974,1,True,8


In [8]:
# df_importance = predictor.feature_importance(
#     data=df_test,
#     subsample_size=5000,
#     num_shuffle_sets=10,
# )


In [9]:
# df_importance