# Build a PIV classification model with AutoGluon

In [None]:
import os
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Prepare data

In [None]:
# Available input CSV files.
rdkit_file = "./data/piv_rdkit.csv"
flatring_file = "./data/piv_flatring.csv"
fps_file = "./data/piv_fps.csv"
rdkit_fps_file = "./data/piv_rdkit_fps.csv"
flatring_fps_file = "./data/piv_flatring_fps.csv"
flatring_rdkit_fps_file = "./data/piv_flatring_rdkit_fps.csv"

# Select input file; `in_name` is its file name without the extension.
in_file = fps_file
base = os.path.basename(in_file)
in_name = os.path.splitext(base)[0]

# Load the table and split it on its predefined "Set" column. `drop` returns a
# new frame, so `data` itself keeps every column.
data = TabularDataset(data=in_file)
dropped_columns = ["Set", "Rating"]
df_train = data[data.Set == "Train"].drop(columns=dropped_columns)
df_test = data[data.Set == "Test"].drop(columns=dropped_columns)

# Report the shape and the label ("Value") distribution of both partitions.
for title, frame in (("train dataset", df_train), ("test dataset", df_test)):
    print(title, frame.shape)
    print(frame.Value.value_counts())

## Training

In [None]:
label_column = 'Value'
save_path = f"./models/ag-binary-model-{in_name}"
# Identifier columns; they are handed to the learner as columns to ignore.
id_columns = ["Substance", "Canonical_Smiles"]

# Options forwarded to the underlying learner: class 1 is treated as the
# positive class and the identifier columns are excluded.
learner_options = {
    'positive_class': 1,
    'ignored_columns': id_columns,
}

predictor = TabularPredictor(
    label=label_column,
    path=save_path,
    problem_type='binary',
    eval_metric='roc_auc',
    learner_kwargs=learner_options,
)

# Train silently (verbosity=0) with the 'best_quality' preset.
# auto_stack=True is intentionally left commented out, as in the original cell.
predictor.fit(
    train_data=df_train,
    presets='best_quality',
    #auto_stack=True,
    verbosity=0,
)

In [None]:
# Summarize the completed fit; the returned summary is kept in `results`.
results = predictor.fit_summary(verbosity=1)

## Inference

In [None]:
label_column = 'Value'

# Evaluate model on test data: separate the label from the remaining columns,
# predict class probabilities, and score them against the observed labels.
test_features = df_test.drop(columns=[label_column])
y_true = df_test[label_column]
y_pred = predictor.predict_proba(test_features)

perf = predictor.evaluate_predictions(
    y_true=y_true,
    y_pred=y_pred,
    auxiliary_metrics=True,
    silent=True,
    detailed_report=True,
)

# Tag the metrics with the name of the input data set, then show them as the
# cell output.
perf["dataset"] = in_name
perf
  

In [None]:
# Derive sensitivity and specificity from the confusion matrix produced by the
# test evaluation.
#
# Per AutoGluon's confusion-matrix helper, the DataFrame has the observed
# classes as rows and the predicted classes as columns, both ordered by class
# label ([negative, positive]). Positional access therefore puts the NEGATIVE
# class at [0, 0]; the cells are addressed by label here so that the positive
# class (1, matching the predictor's `positive_class`) is always the one used
# for TP/FN.
# NOTE(review): assumes the labels are the integers 0 and 1 - confirm against
# the "Value" column.
positive_label, negative_label = 1, 0

df_confusion_matrix = perf["confusion_matrix"]
tp = df_confusion_matrix.loc[positive_label, positive_label]
fn = df_confusion_matrix.loc[positive_label, negative_label]
fp = df_confusion_matrix.loc[negative_label, positive_label]
tn = df_confusion_matrix.loc[negative_label, negative_label]

sensitivity = tp / (tp + fn)  # true-positive rate
specificity = tn / (tn + fp)  # true-negative rate
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")
print("Confusion matrix")

# Show the matrix with the positive class first (TP in the top-left corner)
# and human-readable row/column labels; this expression is the cell output.
class_order = [positive_label, negative_label]
(
    df_confusion_matrix.loc[class_order, class_order]
    .rename(columns={positive_label: 'predicted positive', negative_label: 'predicted negative'})
    .rename(index={positive_label: 'observed positive', negative_label: 'observed negative'})
)

In [None]:
# Build the model leaderboard using the test data (without printing it) and
# show the resulting table as the cell output.
df_leaderboard = predictor.leaderboard(data=df_test, silent=True)
df_leaderboard

In [None]:
# df_importance = predictor.feature_importance(
#     data=df_test,
#     subsample_size=5000,
#     num_shuffle_sets=10,
# )


In [None]:
# df_importance