# How to build a basic classification model with AutoGluon

In [None]:
import warnings

# Keep the notebook output readable: hide DeprecationWarning messages emitted
# by the libraries imported below.
warnings.simplefilter("ignore", DeprecationWarning)

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

## Prepare data

In [None]:
# Available feature tables (presumably fingerprints, RDKit descriptors, and
# both combined, judging by the file names). Only the combined table is
# loaded below; swap the variable passed to TabularDataset to try another.
fps_file = "./data/phototox_fps.csv"
rdkit_file = "./data/phototox_rdkit.csv"
rdkit_fps_file = "./data/phototox_rdkit_fps.csv"

data = TabularDataset(data=rdkit_fps_file)

# Hold out 20% of the rows for testing. Stratify on the label column of the
# table that is actually being split so both parts keep the same class ratio;
# the fixed random_state makes the split reproducible.
df_train, df_test = train_test_split(
    data,
    test_size=0.2,
    random_state=0,
    stratify=data[['Value']],
)

# Sanity check: class balance in each split.
print("Counts of train classes:")
print(df_train.Value.value_counts())
print("Counts of test classes:")
print(df_test.Value.value_counts())

## Training

In [None]:
# --- Training configuration -------------------------------------------------
label_column = "Value"                               # column to predict
save_path = "./models/ag-model-binary-phototox"      # where the predictor is stored
metric = "roc_auc"                                   # evaluation metric

# Identifier columns are handed to the learner as `ignored_columns`.
id_columns = ["Substance", "Canonical_Smiles", "Rating"]
learner_kwargs = dict(
    positive_class=1,
    ignored_columns=id_columns,
)

# --- Fit ----------------------------------------------------------------------
predictor = TabularPredictor(
    label=label_column,
    path=save_path,
    problem_type="binary",
    eval_metric=metric,
    learner_kwargs=learner_kwargs,
)
# No time limit is set; pass `time_limit=<seconds>` to `fit` to cap the run.
predictor.fit(
    train_data=df_train,
    presets="best_quality",
)

In [None]:
# Summarize the completed fit (verbosity=1) and keep the returned summary in
# `results` for later inspection.
results = predictor.fit_summary(verbosity=1)

## Inference

In [None]:
label_column = 'Value'

# Separate the ground-truth labels from the features used for prediction.
y_test = df_test[label_column]
df_test_nolabel = df_test.drop(columns=[label_column])

# Reload the predictor from the directory the training cell saved it to
# ('./models/ag-model-binary-phototox'); loading any other path would fail or
# pick up an unrelated model.
predictor = TabularPredictor.load('./models/ag-model-binary-phototox')

# The predictor is evaluated with 'roc_auc', which needs class probabilities
# rather than hard labels, so `predict_proba` output is scored.
y_pred = predictor.predict_proba(df_test_nolabel)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

In [None]:
# Show the predicted class probabilities (`predict_proba` output) for the
# test rows.
print(y_pred)

In [None]:
# Build the model leaderboard using the test split; `df_test` still contains
# the label column. Leaving the frame as the last expression lets the
# notebook render it.
df_leaderboard = predictor.leaderboard(df_test, silent=True)
df_leaderboard