## Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import joblib
import pickle

from sklearn.metrics import classification_report

In [2]:
#!pip install mljar-supervised
from supervised.automl import AutoML

## Load data

In [4]:
def _load_pickle(name, folder='Saved data'):
    """Unpickle and return the object stored at ``<folder>/<name>.pickle``.

    Parameters
    ----------
    name : str
        File stem, without the ``.pickle`` extension.
    folder : str
        Directory holding the pickle files (relative to the working directory).

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    FileNotFoundError
        If the expected pickle file does not exist.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising. Only point this at files produced by this project.
    with open(f'{folder}/{name}.pickle', 'rb') as handle:
        return pickle.load(handle)


# One helper call per artefact instead of five copy-pasted `with` blocks.
X_train = _load_pickle('X_train')
X_test = _load_pickle('X_test')
y_train = _load_pickle('y_train')
y_test = _load_pickle('y_test')
labels = _load_pickle('labels')

## AutoML compete mode 

In [4]:
# Baseline run: AutoML in "Compete" mode with no other constructor arguments.
automl = AutoML(mode="Compete")

# The training inputs are handed over as NumPy arrays; the target is
# flattened to one dimension first. The fit call stays the last expression
# of the cell so its return value is what the notebook displays.
automl.fit(
    X_train.to_numpy(),
    y_train.to_numpy().flatten(),
)

Linear algorithm was disabled.
Disable stacking for split validation
AutoML directory: AutoML_1
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Decision Tree', 'LightGBM', 'Neural Network', 'Nearest Neighbors']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree logloss 0.59591 trained in 813.44 seconds
2_DecisionTree logloss 0.652404 trained in 860.55 seconds
3_DecisionTree logloss 0.652404 trained in 808.13 seconds
Skip default_algorithms because of the time limit.
Skip not_so_random because of the time limit.
Skip golden_features because no parameters were generated.
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were genera

NameError: name 'X_test' is not defined

In [5]:
# Predict on the test split and print a per-class report.
# The 'label' column of the predict_all result is flattened to 1-D and
# compared against y_test; `labels` supplies the class names for the rows.
automlPredictions = automl.predict_all(X_test.to_numpy())
automlPredictedLabels = automlPredictions['label'].to_numpy().flatten()
print(
    classification_report(
        y_test,
        automlPredictedLabels,
        target_names=labels,
    )
)

              precision    recall  f1-score   support

       Basal       0.95      0.90      0.92        20
        Her2       0.87      0.59      0.70        22
        LumA       0.83      0.96      0.89        68
        LumB       0.82      0.78      0.80        46

    accuracy                           0.85       156
   macro avg       0.87      0.81      0.83       156
weighted avg       0.85      0.85      0.84       156



## AutoML compete mode advanced

In [4]:
# Second experiment: "Compete" mode again, this time with an explicit
# configuration (algorithm shortlist, time budget, ensembling/stacking flags
# and a higher explain level).
automlCompete = AutoML(
    # --- task definition ---
    mode="Compete",
    ml_task='multiclass_classification',
    # --- search space and budget ---
    algorithms=["LightGBM", "Xgboost", "Extra Trees"],
    total_time_limit=3 * 60 * 60,  # evaluates to 10800
    # --- model combination ---
    # NOTE: the recorded run logged "Disable stacking for split validation",
    # so stacking was requested here but not applied in that run.
    stack_models=True,
    train_ensemble=True,
    boost_on_errors=True,
    # --- reporting / verbosity ---
    explain_level=2,
    optuna_verbose=True,
)

# Same input preparation as the baseline run: NumPy feature matrix and a
# flattened 1-D target. Kept as the last expression so the cell displays
# the value returned by fit.
automlCompete.fit(
    X_train.to_numpy(),
    y_train.to_numpy().flatten(),
)

Disable stacking for split validation
AutoML directory: AutoML_3
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['LightGBM', 'Xgboost', 'Extra Trees']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_LightGBM logloss 0.462574 trained in 3580.5 seconds
* Step not_so_random will try to check up to 27 models
11_LightGBM logloss 0.35717 trained in 2746.73 seconds
There was an error during 2_Xgboost training.
Please check AutoML_3\errors.md for details.
Skip golden_features because no parameters were generated.
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
*

AutoML(algorithms=['LightGBM', 'Xgboost', 'Extra Trees'], boost_on_errors=True,
       explain_level=2, ml_task='multiclass_classification', mode='Compete',
       stack_models=True, total_time_limit=10800)

In [5]:
# Predict on the test split with the configured Compete-mode model and print
# a per-class report. The 'label' column of the predict_all result is
# flattened to 1-D and compared against y_test; `labels` names the classes.
automlCompetePredictions = automlCompete.predict_all(X_test.to_numpy())
automlCompetePredictedLabels = (
    automlCompetePredictions['label'].to_numpy().flatten()
)
print(
    classification_report(
        y_test,
        automlCompetePredictedLabels,
        target_names=labels,
    )
)

              precision    recall  f1-score   support

       Basal       1.00      0.95      0.97        20
        Her2       0.86      0.82      0.84        22
        LumA       0.93      1.00      0.96        68
        LumB       0.93      0.87      0.90        46

    accuracy                           0.93       156
   macro avg       0.93      0.91      0.92       156
weighted avg       0.93      0.93      0.93       156



In [6]:
# Show the training report of the fitted automlCompete object. This call is
# the last expression of the cell, so its return value is what gets rendered
# below (leaderboard plus per-model metrics and confusion matrices).
automlCompete.report()

Best model,name,model_type,metric_type,metric_value,train_time
,1_Default_LightGBM,LightGBM,logloss,0.462574,3582.49
,11_LightGBM,LightGBM,logloss,0.35717,2747.43
the best,12_LightGBM,LightGBM,logloss,0.339918,2619.86
,Ensemble,Ensemble,logloss,0.339918,0.15

Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.882353,0.846154,0.813953,0.857143,0.885615,0.861675,0.35717
recall,0.833333,0.75,0.901639,0.853659,0.857143,0.834658,0.857143,0.35717
f1-score,0.909091,0.810811,0.873016,0.833333,0.857143,0.856563,0.857146,0.35717
support,18.0,20.0,61.0,41.0,0.857143,140.0,140.0,0.35717

Unnamed: 0,Predicted as 0,Predicted as 1,Predicted as 2,Predicted as 3
Labeled as 0,15,2,1,0
Labeled as 1,0,15,3,2
Labeled as 2,0,0,55,6
Labeled as 3,0,0,6,35

Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.875,0.857143,0.782609,0.85,0.878688,0.856233,0.339918
recall,0.833333,0.7,0.885246,0.878049,0.85,0.824157,0.85,0.339918
f1-score,0.909091,0.777778,0.870968,0.827586,0.85,0.846356,0.849852,0.339918
support,18.0,20.0,61.0,41.0,0.85,140.0,140.0,0.339918

Unnamed: 0,Predicted as 0,Predicted as 1,Predicted as 2,Predicted as 3
Labeled as 0,15,2,1,0
Labeled as 1,0,14,3,3
Labeled as 2,0,0,54,7
Labeled as 3,0,0,5,36

Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.7,0.870968,0.782609,0.828571,0.838394,0.837257,0.462574
recall,0.666667,0.7,0.885246,0.878049,0.828571,0.78249,0.828571,0.462574
f1-score,0.8,0.7,0.878049,0.827586,0.828571,0.801409,0.8278,0.462574
support,18.0,20.0,61.0,41.0,0.828571,140.0,140.0,0.462574

Unnamed: 0,Predicted as 0,Predicted as 1,Predicted as 2,Predicted as 3
Labeled as 0,12,6,0,0
Labeled as 1,0,14,3,3
Labeled as 2,0,0,54,7
Labeled as 3,0,0,5,36

Model,Weight
12_LightGBM,1

Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.875,0.857143,0.782609,0.85,0.878688,0.856233,0.339918
recall,0.833333,0.7,0.885246,0.878049,0.85,0.824157,0.85,0.339918
f1-score,0.909091,0.777778,0.870968,0.827586,0.85,0.846356,0.849852,0.339918
support,18.0,20.0,61.0,41.0,0.85,140.0,140.0,0.339918

Unnamed: 0,Predicted as 0,Predicted as 1,Predicted as 2,Predicted as 3
Labeled as 0,15,2,1,0
Labeled as 1,0,14,3,3
Labeled as 2,0,0,54,7
Labeled as 3,0,0,5,36


In [7]:
# Serialise the fitted automlCompete object to disk with joblib.
# joblib.dump stays the last expression so the cell displays its return value.
automlCompeteModelPath = 'Saved models & utils/automlCompete93strong.pkl'
joblib.dump(automlCompete, automlCompeteModelPath)

['automlCompete93strong.pkl']