## Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import joblib
import pickle

from sklearn.metrics import classification_report

In [2]:
#!pip install mljar-supervised
from supervised.automl import AutoML

## Export data

In [5]:
with open('Saved data/X_train_scaled.pickle', 'rb') as data:
    X_train_scaled = pickle.load(data)
    
with open('Saved data/X_test_scaled.pickle', 'rb') as data:
    X_test_scaled = pickle.load(data)

with open('Saved data/y_train.pickle', 'rb') as data:
    y_train = pickle.load(data)
    
with open('Saved data/y_test.pickle', 'rb') as data:
    y_test = pickle.load(data)
    
with open('Saved data/labels.pickle', 'rb') as data:
    labels = pickle.load(data)

## AutoML compete mode 

In [4]:
automl = AutoML(mode="Compete")
automl.fit(X_train_scaled, y_train.to_numpy().flatten())

Linear algorithm was disabled.
Disable stacking for split validation
AutoML directory: AutoML_1
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Decision Tree', 'LightGBM', 'Neural Network', 'Nearest Neighbors']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree logloss 0.59591 trained in 813.44 seconds
2_DecisionTree logloss 0.652404 trained in 860.55 seconds
3_DecisionTree logloss 0.652404 trained in 808.13 seconds
Skip default_algorithms because of the time limit.
Skip not_so_random because of the time limit.
Skip golden_features because no parameters were generated.
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were genera

NameError: name 'X_test' is not defined

In [5]:
automlPredictions = automl.predict_all(X_test_scaled)
print(classification_report(y_test, automlPredictions['label'].to_numpy().flatten(), target_names=labels))

              precision    recall  f1-score   support

       Basal       0.95      0.90      0.92        20
        Her2       0.87      0.59      0.70        22
        LumA       0.83      0.96      0.89        68
        LumB       0.82      0.78      0.80        46

    accuracy                           0.85       156
   macro avg       0.87      0.81      0.83       156
weighted avg       0.85      0.85      0.84       156



## AutoML compete mode (on scaled data)

In [5]:
automlCompete = AutoML(
    mode="Compete",
    total_time_limit=3*3600,
    algorithms=["LightGBM", "Xgboost"],
    stack_models=True,
    train_ensemble=True,
    boost_on_errors=True,
    optuna_verbose=True,
    explain_level=2,
    ml_task = 'multiclass_classification',
)
automlCompete.fit(X_train_scaled, y_train.to_numpy().flatten())

Disable stacking for split validation
AutoML directory: AutoML_1
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['LightGBM', 'Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 2 models
1_Default_LightGBM logloss 0.450024 trained in 3957.97 seconds
* Step not_so_random will try to check up to 18 models
11_LightGBM logloss 0.343463 trained in 2646.03 seconds
2_Xgboost logloss 0.384169 trained in 4790.54 seconds
Skip golden_features because no parameters were generated.
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
Skip hill_climbing_1 because of the time limit.
Ski

AutoML(algorithms=['LightGBM', 'Xgboost'], boost_on_errors=True,
       explain_level=2, ml_task='multiclass_classification', mode='Compete',
       stack_models=True, total_time_limit=10800)

In [6]:
automlCompetePredictions = automlCompete.predict_all(X_test_scaled)
print(classification_report(y_test, automlCompetePredictions['label'].to_numpy().flatten(), target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.95      0.97        20
        Her2       0.81      0.77      0.79        22
        LumA       0.93      1.00      0.96        68
        LumB       0.91      0.85      0.88        46

    accuracy                           0.92       156
   macro avg       0.91      0.89      0.90       156
weighted avg       0.92      0.92      0.92       156



In [7]:
automlCompete.report()

Best model,name,model_type,metric_type,metric_value,train_time
,1_Default_LightGBM,LightGBM,logloss,0.450024,3960.8
the best,11_LightGBM,LightGBM,logloss,0.343463,2646.73
,2_Xgboost,Xgboost,logloss,0.384169,4791.23
,Ensemble,Ensemble,logloss,0.343463,0.14

Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.875,0.873016,0.804348,0.864286,0.888091,0.869516,0.343463
recall,0.833333,0.7,0.901639,0.902439,0.864286,0.834353,0.864286,0.343463
f1-score,0.909091,0.777778,0.887097,0.850575,0.864286,0.856135,0.863612,0.343463
support,18.0,20.0,61.0,41.0,0.864286,140.0,140.0,0.343463

Unnamed: 0,Predicted as 0,Predicted as 1,Predicted as 2,Predicted as 3
Labeled as 0,15,2,1,0
Labeled as 1,0,14,3,3
Labeled as 2,0,0,55,6
Labeled as 3,0,0,4,37

Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.7,0.859375,0.795455,0.828571,0.838707,0.835968,0.450024
recall,0.666667,0.7,0.901639,0.853659,0.828571,0.780491,0.828571,0.450024
f1-score,0.8,0.7,0.88,0.823529,0.828571,0.800882,0.827462,0.450024
support,18.0,20.0,61.0,41.0,0.828571,140.0,140.0,0.450024

Unnamed: 0,Predicted as 0,Predicted as 1,Predicted as 2,Predicted as 3
Labeled as 0,12,6,0,0
Labeled as 1,0,14,3,3
Labeled as 2,0,0,55,6
Labeled as 3,0,0,6,35

Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.777778,0.861538,0.818182,0.85,0.864375,0.854678,0.384169
recall,0.722222,0.7,0.918033,0.878049,0.85,0.804576,0.85,0.384169
f1-score,0.83871,0.736842,0.888889,0.847059,0.85,0.827875,0.848466,0.384169
support,18.0,20.0,61.0,41.0,0.85,140.0,140.0,0.384169

Unnamed: 0,Predicted as 0,Predicted as 1,Predicted as 2,Predicted as 3
Labeled as 0,13,4,1,0
Labeled as 1,0,14,3,3
Labeled as 2,0,0,56,5
Labeled as 3,0,0,5,36

Model,Weight
11_LightGBM,1

Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.875,0.873016,0.804348,0.864286,0.888091,0.869516,0.343463
recall,0.833333,0.7,0.901639,0.902439,0.864286,0.834353,0.864286,0.343463
f1-score,0.909091,0.777778,0.887097,0.850575,0.864286,0.856135,0.863612,0.343463
support,18.0,20.0,61.0,41.0,0.864286,140.0,140.0,0.343463

Unnamed: 0,Predicted as 0,Predicted as 1,Predicted as 2,Predicted as 3
Labeled as 0,15,2,1,0
Labeled as 1,0,14,3,3
Labeled as 2,0,0,55,6
Labeled as 3,0,0,4,37


In [8]:
joblib.dump(automlCompete, 'Saved models & utils/automlCompeteScaled.pkl')

['automlCompeteScaled.pkl']

In [9]:
automlCompete2 = AutoML(
    mode="Compete",
    total_time_limit=4.5*3600,
    explain_level=2,
    ml_task = 'multiclass_classification',
)
automlCompete2.fit(X_train_scaled, y_train.to_numpy().flatten())

ValueError: 'total_time_limit' must be an integer, got '<class 'float'>'.

In [None]:
automlCompetePredictions2 = automlCompete2.predict_all(X_test_scaled)
print(classification_report(y_test, automlCompetePredictions2['label'].to_numpy().flatten(), target_names=labels))

In [None]:
automlCompete2.report()

In [None]:
joblib.dump(automlCompete2, 'Saved models & utils/automlCompeteScaled2.pkl')

## Experimenting (not needed code)

In [7]:
model = joblib.load('automlCompete93strong.pkl')

In [8]:
modelPrediction = model.predict_all(X_test.to_numpy())

In [42]:
modelPredProba = model.predict_proba(X_test.to_numpy())

In [9]:
print(classification_report(y_test, modelPrediction['label'].to_numpy().flatten(), target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.95      0.97        20
        Her2       0.86      0.82      0.84        22
        LumA       0.93      1.00      0.96        68
        LumB       0.93      0.87      0.90        46

    accuracy                           0.93       156
   macro avg       0.93      0.91      0.92       156
weighted avg       0.93      0.93      0.93       156



In [25]:
model.score(X_test.to_numpy(), y_test)

0.9294871794871795

In [1]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [3]:
import pickle 
import joblib

In [4]:
with open('Saved data/X_train.pickle', 'rb') as data:
    X_train = pickle.load(data)
    
with open('Saved data/X_test.pickle', 'rb') as data:
    X_test = pickle.load(data)

with open('Saved data/y_train.pickle', 'rb') as data:
    y_train = pickle.load(data)
    
with open('Saved data/y_test.pickle', 'rb') as data:
    y_test = pickle.load(data)
    
with open('Saved data/labels.pickle', 'rb') as data:
    labels = pickle.load(data)

In [19]:
model = LGBMClassifier(n_jobs= -1, objective= "multiclass", num_leaves=15, learning_rate=0.1, feature_fraction = 0.8,
                      bagging_fraction=0.5, min_data_in_leaf=50, num_class=4, explain_level=2)

#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#n_scores = cross_val_score(model, X_train.to_numpy(), y_train.to_numpy().flatten(), scoring='recall_macro', cv=cv, n_jobs=-1)
## report performance
#print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))


In [20]:
np.mean(n_scores), np.std(n_scores)

NameError: name 'np' is not defined

In [21]:
model.fit(X_train.to_numpy(), y_train.to_numpy().flatten())


LGBMClassifier(bagging_fraction=0.5, explain_level=2, feature_fraction=0.8,
               min_data_in_leaf=50, num_class=4, num_leaves=15,
               objective='multiclass')

In [22]:
res = model.predict(X_test.to_numpy())

In [27]:
res_p = model.predict_proba(X_test.to_numpy())

In [24]:
from sklearn.metrics import classification_report

In [29]:
import pandas as pd
A = pd.DataFrame(res_p)

In [32]:
A['pred'] = res

In [33]:
A

Unnamed: 0,0,1,2,3,pred
0,5.420088e-02,1.036849e-01,0.009317,0.832798,3
1,3.084310e-07,8.149810e-07,0.999990,0.000009,2
2,3.313456e-05,2.388949e-04,0.928820,0.070908,2
3,9.634414e-01,3.567043e-02,0.000701,0.000187,0
4,5.755170e-06,1.037830e-03,0.998694,0.000263,2
...,...,...,...,...,...
151,9.999646e-01,1.525328e-05,0.000003,0.000018,0
152,2.181975e-04,6.846351e-04,0.633650,0.365448,2
153,9.999645e-01,5.526667e-06,0.000004,0.000026,0
154,9.999279e-01,1.271865e-05,0.000030,0.000029,0


In [25]:
print(classification_report(y_test, res, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.95      0.97        20
        Her2       0.83      0.86      0.84        22
        LumA       0.92      0.99      0.95        68
        LumB       0.95      0.85      0.90        46

    accuracy                           0.92       156
   macro avg       0.92      0.91      0.92       156
weighted avg       0.93      0.92      0.92       156



In [26]:
joblib.dump(model, 'Saved models & utils/LightGBM.pkl')

['Saved models & utils/LightGBM.pkl']