# Build a classification model with AutoGluon

In [1]:
import os
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Prepare data

In [2]:
# Candidate feature tables; exactly one of them is selected as `in_file` below.
rdkit_file = "./data/piv_rdkit.csv"
flatring_file = "./data/piv_flatring.csv"
fps_file = "./data/piv_fps.csv"
rdkit_fps_file = "./data/piv_rdkit_fps.csv"
flatring_fps_file = "./data/piv_flatring_fps.csv"
flatring_rdkit_fps_file = "./data/piv_flatring_rdkit_fps.csv"

# Pick the input table and derive a short name from it (file name without
# extension); the name is reused later for the model directory and the report.
in_file = flatring_rdkit_fps_file
base = os.path.basename(in_file)
in_name, _in_ext = os.path.splitext(base)

data = TabularDataset(data=in_file)

# The "Set" column carries the predefined Train/Test assignment. Both "Set"
# and "Rating" are removed from the frames handed to the model.
_dropped_columns = ["Set", "Rating"]
df_train = data.loc[data["Set"] == "Train"].drop(columns=_dropped_columns)
df_test = data.loc[data["Set"] == "Test"].drop(columns=_dropped_columns)

# Report the size and class balance of each split.
for _title, _split in (("train dataset", df_train), ("test dataset", df_test)):
    print(_title, _split.shape)
    print(_split["Value"].value_counts())

train dataset (347, 1229)
0    205
1    142
Name: Value, dtype: int64
test dataset (100, 1229)
0    50
1    50
Name: Value, dtype: int64


## Training

In [3]:
# Target column, identifier columns to keep out of the feature set, and the
# directory where the fitted predictor is stored (one per input table).
label_column = 'Value'
id_columns = ["Substance", "Canonical_Smiles"]
save_path = f"./models/ag-binary-model-{in_name}"

# Options forwarded to the underlying learner: class 1 is the positive class
# and the identifier columns are ignored.
learner_options = {
    'positive_class': 1,
    'ignored_columns': id_columns,
}

predictor = TabularPredictor(
    label=label_column,
    path=save_path,
    problem_type='binary',
    eval_metric='roc_auc',
    learner_kwargs=learner_options,
)

# Fit quietly with the 'best_quality' preset; the call is the last expression
# of the cell, so the returned predictor is displayed.
predictor.fit(train_data=df_train, presets='best_quality', verbosity=0)

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7ffb013e9450>

In [4]:
# Print the summary of the fit and keep the returned summary object in
# `results` for later inspection.
results = predictor.fit_summary(
    verbosity=1,
)

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.835520       1.753232  43.984796                0.000900           1.182164            2       True         14
1   RandomForestGini_BAG_L1   0.814188       0.107207   0.867380                0.107207           0.867380            1       True          5
2   RandomForestEntr_BAG_L1   0.813105       0.124419   0.765953                0.124419           0.765953            1       True          6
3     ExtraTreesEntr_BAG_L1   0.812178       0.112353   0.748501                0.112353           0.748501            1       True          9
4     ExtraTreesGini_BAG_L1   0.809498       0.107384   0.744062                0.107384           0.744062            1       True          8
5         LightGBMXT_BAG_L1   0.805909       0.045587   3.933429                

## Inference

In [5]:
label_column = 'Value'

# Split the held-out test frame into model inputs and ground-truth labels.
test_features = df_test.drop(columns=[label_column])
y_true = df_test[label_column]

# Score predicted probabilities against the true labels, requesting the
# auxiliary metrics and the detailed report in addition to the main metric.
y_pred = predictor.predict_proba(test_features)
perf = predictor.evaluate_predictions(
    y_true=y_true,
    y_pred=y_pred,
    silent=True,
    auxiliary_metrics=True,
    detailed_report=True,
)

# Tag the metrics with the name of the input table they were computed on.
perf["dataset"] = in_name
perf
  

{'roc_auc': 0.7879999999999999,
 'accuracy': 0.7,
 'balanced_accuracy': 0.7,
 'mcc': 0.41204282171516454,
 'f1': 0.6590909090909091,
 'precision': 0.7631578947368421,
 'recall': 0.58,
 'confusion_matrix':     0   1
 0  41   9
 1  21  29,
 'classification_report': {'0': {'precision': 0.6612903225806451,
   'recall': 0.82,
   'f1-score': 0.7321428571428572,
   'support': 50},
  '1': {'precision': 0.7631578947368421,
   'recall': 0.58,
   'f1-score': 0.6590909090909091,
   'support': 50},
  'accuracy': 0.7,
  'macro avg': {'precision': 0.7122241086587436,
   'recall': 0.7,
   'f1-score': 0.6956168831168832,
   'support': 100},
  'weighted avg': {'precision': 0.7122241086587436,
   'recall': 0.7,
   'f1-score': 0.6956168831168832,
   'support': 100}},
 'dataset': 'piv_flatring_rdkit_fps'}

In [6]:
# Build the per-model leaderboard using the test split (adds the test-side
# columns such as `score_test` next to the validation ones) and display it.
df_leaderboard = predictor.leaderboard(
    data=df_test,
    silent=True,
)
df_leaderboard

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesEntr_BAG_L1,0.7934,0.812178,0.11696,0.112353,0.748501,0.11696,0.112353,0.748501,1,True,9
1,WeightedEnsemble_L2,0.788,0.83552,2.028779,1.753232,43.984796,0.006578,0.0009,1.182164,2,True,14
2,XGBoost_BAG_L1,0.7844,0.757712,0.130005,0.043039,3.255929,0.130005,0.043039,3.255929,1,True,11
3,ExtraTreesGini_BAG_L1,0.7798,0.809498,0.120517,0.107384,0.744062,0.120517,0.107384,0.744062,1,True,8
4,NeuralNetMXNet_BAG_L1,0.7792,0.795706,1.386051,1.170837,25.211353,1.386051,1.170837,25.211353,1,True,12
5,RandomForestEntr_BAG_L1,0.777,0.813105,0.120036,0.124419,0.765953,0.120036,0.124419,0.765953,1,True,6
6,CatBoost_BAG_L1,0.7732,0.80213,0.1188,0.085547,11.138315,0.1188,0.085547,11.138315,1,True,7
7,LightGBM_BAG_L1,0.7724,0.802783,0.04421,0.044584,3.327068,0.04421,0.044584,3.327068,1,True,4
8,LightGBMLarge_BAG_L1,0.764,0.771865,0.048028,0.050031,10.707973,0.048028,0.050031,10.707973,1,True,13
9,RandomForestGini_BAG_L1,0.7596,0.814188,0.115627,0.107207,0.86738,0.115627,0.107207,0.86738,1,True,5


In [7]:
# Feature importance on the test split.
#
# The feature table has over a thousand columns and the predictor is an
# ensemble of many models, so an unbounded run can take a very long time; a
# previous run of this cell had to be interrupted by hand, which left
# `df_importance` undefined for the cells below. A time budget keeps the cell
# finishing on "Restart & Run All". If the budget is exhausted, fewer than
# `num_shuffle_sets` shuffle sets may be evaluated.
IMPORTANCE_TIME_LIMIT_S = 600  # seconds; raise for tighter importance estimates

df_importance = predictor.feature_importance(
    data=df_test,
    subsample_size=5000,
    time_limit=IMPORTANCE_TIME_LIMIT_S,
    num_shuffle_sets=10,
)


KeyboardInterrupt: 

In [None]:
# Display the feature-importance result produced by the preceding cell.
df_importance