# Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pickle
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#!pip install mljar-supervised
from supervised.automl import AutoML

## Export data

In [2]:
with open('Saved data/X_train.pickle', 'rb') as data:
    X_train = pickle.load(data)
    
with open('Saved data/X_test.pickle', 'rb') as data:
    X_test = pickle.load(data)

with open('Saved data/X_train_scaled.pickle', 'rb') as data:
    X_train_scaled = pickle.load(data)
    
with open('Saved data/X_test_scaled.pickle', 'rb') as data:
    X_test_scaled = pickle.load(data)
    
with open('Saved data/X_train_pca.pickle', 'rb') as data:
    X_train_pca = pickle.load(data)
    
with open('Saved data/X_test_pca.pickle', 'rb') as data:
    X_test_pca = pickle.load(data)
    
with open('Saved data/X_train_autoencoder.pickle', 'rb') as data:
    X_train_autoencoder = pickle.load(data)
    
with open('Saved data/X_test_autoencoder.pickle', 'rb') as data:
    X_test_autoencoder = pickle.load(data)
    
with open('Saved data/y_train.pickle', 'rb') as data:
    y_train = pickle.load(data)
    
with open('Saved data/y_test.pickle', 'rb') as data:
    y_test = pickle.load(data)
    
with open('Saved data/labels.pickle', 'rb') as data:
    labels = pickle.load(data)

## Random Forest

In [11]:
# RandomForest
randforestClassifier = RandomForestClassifier()
randforestClassifier.fit(X_train, y_train)
randforestPredictions = randforestClassifier.predict(X_test)
print(classification_report(y_test, randforestPredictions, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.90      0.95        20
        Her2       0.80      0.73      0.76        22
        LumA       0.90      0.94      0.92        68
        LumB       0.81      0.83      0.82        46

    accuracy                           0.87       156
   macro avg       0.88      0.85      0.86       156
weighted avg       0.87      0.87      0.87       156



In [12]:
# RandomForest + Scaled
randforestClassifierScaled = RandomForestClassifier()
randforestClassifierScaled.fit(X_train_scaled, y_train)
randforestPredictionsScaled = randforestClassifierScaled.predict(X_test_scaled)
print(classification_report(y_test, randforestPredictionsScaled, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.85      0.92        20
        Her2       0.78      0.64      0.70        22
        LumA       0.92      0.96      0.94        68
        LumB       0.80      0.87      0.83        46

    accuracy                           0.87       156
   macro avg       0.87      0.83      0.85       156
weighted avg       0.87      0.87      0.87       156



In [14]:
# RandomForest + PCA
randforestClassifierPCA = RandomForestClassifier()
randforestClassifierPCA.fit(X_train_pca, y_train)
randforestPredictionsPCA = randforestClassifierPCA.predict(X_test_pca)
print(classification_report(y_test, randforestPredictionsPCA, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.35      0.52        20
        Her2       1.00      0.23      0.37        22
        LumA       0.51      1.00      0.68        68
        LumB       0.64      0.15      0.25        46

    accuracy                           0.56       156
   macro avg       0.79      0.43      0.45       156
weighted avg       0.68      0.56      0.49       156



## Logistic Regression

In [3]:
# LogisticRegression
logregClassifier = LogisticRegression()
logregClassifier.fit(X_train, y_train)
logregPredictions = logregClassifier.predict(X_test)
print(classification_report(y_test, logregPredictions, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.95      0.97        20
        Her2       0.85      0.77      0.81        22
        LumA       0.93      0.96      0.94        68
        LumB       0.87      0.89      0.88        46

    accuracy                           0.91       156
   macro avg       0.91      0.89      0.90       156
weighted avg       0.91      0.91      0.91       156



In [4]:
# LogisticRegression + Scaled
logregClassifierScaled = LogisticRegression()
logregClassifierScaled.fit(X_train_scaled, y_train)
logregPredictionsScaled = logregClassifierScaled.predict(X_test_scaled)
print(classification_report(y_test, logregPredictionsScaled, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.90      0.95        20
        Her2       0.86      0.82      0.84        22
        LumA       0.94      0.96      0.95        68
        LumB       0.85      0.89      0.87        46

    accuracy                           0.91       156
   macro avg       0.91      0.89      0.90       156
weighted avg       0.91      0.91      0.91       156



In [17]:
# LogisticRegression + PCA
logregClassifierPCA = LogisticRegression()
logregClassifierPCA.fit(X_train_pca, y_train)
logregPredictionsPCA = logregClassifierPCA.predict(X_test_pca)
print(classification_report(y_test, logregPredictionsPCA, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.90      0.95        20
        Her2       0.85      0.77      0.81        22
        LumA       0.94      0.94      0.94        68
        LumB       0.82      0.89      0.85        46

    accuracy                           0.90       156
   macro avg       0.90      0.88      0.89       156
weighted avg       0.90      0.90      0.90       156



In [4]:
# LogisticRegression + AutoEncoder
logregClassifierAE = LogisticRegression()
logregClassifierAE.fit(X_train_autoencoder, y_train)
logregPredictionsAE = logregClassifierAE.predict(X_test_autoencoder)
print(classification_report(y_test, logregPredictionsAE, target_names=labels))

              precision    recall  f1-score   support

       Basal       0.95      0.90      0.92        20
        Her2       0.90      0.82      0.86        22
        LumA       0.82      0.88      0.85        68
        LumB       0.75      0.72      0.73        46

    accuracy                           0.83       156
   macro avg       0.85      0.83      0.84       156
weighted avg       0.83      0.83      0.83       156



## Ridge Classifier

In [30]:
# RidgeClassifier
ridgeClassifier = RidgeClassifier()
ridgeClassifier.fit(X_train, y_train)
ridgePredictions = ridgeClassifier.predict(X_test)
print(classification_report(y_test, ridgePredictions, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.90      0.95        20
        Her2       0.88      0.64      0.74        22
        LumA       0.91      0.94      0.93        68
        LumB       0.81      0.91      0.86        46

    accuracy                           0.88       156
   macro avg       0.90      0.85      0.87       156
weighted avg       0.89      0.88      0.88       156



In [31]:
# RidgeClassifier + Scaled
ridgeClassifierScaled = RidgeClassifier()
ridgeClassifierScaled.fit(X_train_scaled, y_train)
ridgePredictionsScaled = ridgeClassifierScaled.predict(X_test_scaled)
print(classification_report(y_test, ridgePredictionsScaled, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.90      0.95        20
        Her2       0.88      0.64      0.74        22
        LumA       0.91      0.94      0.93        68
        LumB       0.81      0.91      0.86        46

    accuracy                           0.88       156
   macro avg       0.90      0.85      0.87       156
weighted avg       0.89      0.88      0.88       156



In [32]:
# RidgeClassifier + PCA
ridgeClassifierPCA = RidgeClassifier()
ridgeClassifierPCA.fit(X_train_pca, y_train)
ridgePredictionsPCA = ridgeClassifierPCA.predict(X_test_pca)
print(classification_report(y_test, ridgePredictionsPCA, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.90      0.95        20
        Her2       0.84      0.73      0.78        22
        LumA       0.88      0.90      0.89        68
        LumB       0.78      0.85      0.81        46

    accuracy                           0.86       156
   macro avg       0.88      0.84      0.86       156
weighted avg       0.86      0.86      0.86       156



## Naive Bayes

In [33]:
# BayesClassifier
bayesClassifier = GaussianNB()
bayesClassifier.fit(X_train, y_train)
bayesPredictions = bayesClassifier.predict(X_test)
print(classification_report(y_test, bayesPredictions, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.80      0.89        20
        Her2       0.69      0.82      0.75        22
        LumA       0.91      0.94      0.93        68
        LumB       0.82      0.78      0.80        46

    accuracy                           0.86       156
   macro avg       0.86      0.84      0.84       156
weighted avg       0.87      0.86      0.86       156



In [34]:
# BayesClassifier + Scaled
bayesClassifierScaled = GaussianNB()
bayesClassifierScaled.fit(X_train_scaled, y_train)
bayesPredictionsScaled = bayesClassifierScaled.predict(X_test_scaled)
print(classification_report(y_test, bayesPredictionsScaled, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.80      0.89        20
        Her2       0.69      0.82      0.75        22
        LumA       0.91      0.94      0.93        68
        LumB       0.82      0.78      0.80        46

    accuracy                           0.86       156
   macro avg       0.86      0.84      0.84       156
weighted avg       0.87      0.86      0.86       156



In [35]:
# BayesClassifier + PCA
bayesClassifierPCA = GaussianNB()
bayesClassifierPCA.fit(X_train_pca, y_train)
bayesPredictionsPCA = bayesClassifierPCA.predict(X_test_pca)
print(classification_report(y_test, bayesPredictionsPCA, target_names=labels))

              precision    recall  f1-score   support

       Basal       0.00      0.00      0.00        20
        Her2       0.00      0.00      0.00        22
        LumA       0.44      1.00      0.61        68
        LumB       0.00      0.00      0.00        46

    accuracy                           0.44       156
   macro avg       0.11      0.25      0.15       156
weighted avg       0.19      0.44      0.26       156



## SVM

In [36]:
# SVM
svmClassifier = LinearSVC()
svmClassifier.fit(X_train, y_train)
svmPredictions = svmClassifier.predict(X_test)
print(classification_report(y_test, svmPredictions, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.90      0.95        20
        Her2       0.85      0.77      0.81        22
        LumA       0.95      0.93      0.94        68
        LumB       0.81      0.91      0.86        46

    accuracy                           0.90       156
   macro avg       0.90      0.88      0.89       156
weighted avg       0.90      0.90      0.90       156



In [37]:
# SVM + Scaled
svmClassifierScaled = LinearSVC()
svmClassifierScaled.fit(X_train_scaled, y_train)
svmPredictionsScaled = svmClassifierScaled.predict(X_test_scaled)
print(classification_report(y_test, svmPredictionsScaled, target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.90      0.95        20
        Her2       0.84      0.73      0.78        22
        LumA       0.94      0.91      0.93        68
        LumB       0.79      0.91      0.85        46

    accuracy                           0.88       156
   macro avg       0.89      0.86      0.88       156
weighted avg       0.89      0.88      0.89       156



In [38]:
# SVM + PCA
svmClassifierPCA = LinearSVC()
svmClassifierPCA.fit(X_train_pca, y_train)
svmPredictionsPCA = svmClassifierPCA.predict(X_test_pca)
print(classification_report(y_test, svmPredictionsPCA, target_names=labels))

              precision    recall  f1-score   support

       Basal       0.95      0.95      0.95        20
        Her2       0.84      0.73      0.78        22
        LumA       0.93      0.94      0.93        68
        LumB       0.83      0.87      0.85        46

    accuracy                           0.89       156
   macro avg       0.89      0.87      0.88       156
weighted avg       0.89      0.89      0.89       156



## AutoML 

In [40]:
automl = AutoML(mode="Perform")
automl.fit(X_train_scaled.to_numpy(), y_train.to_numpy().flatten())

Linear algorithm was disabled.
AutoML directory: AutoML_1
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['LightGBM', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 2 models


2022-05-12 18:33:11,573 concurrent.futures ERROR exception calling callback for <Future at 0x11a15b923b0 state=finished raised BrokenProcessPool>
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "D:\Anaconda\envs\cancer\lib\site-packages\joblib\externals\loky\process_executor.py", line 407, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "D:\Anaconda\envs\cancer\lib\multiprocessing\queues.py", line 122, in get
    return _ForkingPickler.loads(res)
  File "D:\Anaconda\envs\cancer\lib\site-packages\lightgbm\basic.py", line 2690, in __setstate__
    _safe_call(_LIB.LGBM_BoosterLoadModelFromString(
  File "D:\Anaconda\envs\cancer\lib\site-packages\lightgbm\basic.py", line 125, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8'))
lightgbm.basic.LightGBMError: bad allocation
"""

The above exception was the direct cause of the following exception:

Traceback (most recent ca

A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
Problem during computing permutation importance. Skipping ...
1_Default_LightGBM logloss 0.462574 trained in 1476.02 seconds (1-sample predict time 12.6808 seconds)
* Step not_so_random will try to check up to 8 models
2_LightGBM logloss 0.35717 trained in 2857.26 seconds (1-sample predict time 12.9719 seconds)
Skip golden_features because no parameters were generated.
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
Skip hill_climbing_1 because of the time limit.
Skip hill_climbing_2 because of the time limit.
* Step ensemble will try to check up to 1 model
Ensemble not trained. Can't contruct ensemble with prediction time smaller than limit.
****************************************************************
There were no model with prediction time smaller than the limit.
Please increase the prediction time f

2022-05-12 19:21:53,331 supervised.exceptions ERROR Missing column: feature_1 in input data. Cannot predict


AutoML fit time: 4427.17 seconds
AutoML best model: 2_LightGBM


AutoMLException: Missing column: feature_1 in input data. Cannot predict

In [41]:
# Saved to AutoML1_perform/2_LightGBM
automlPredictions = automl.predict_all(X_test_scaled.to_numpy())
print(classification_report(y_test, automlPredictions['label'].to_numpy(), target_names=labels))

              precision    recall  f1-score   support

       Basal       1.00      0.95      0.97        20
        Her2       0.89      0.77      0.83        22
        LumA       0.93      1.00      0.96        68
        LumB       0.91      0.89      0.90        46

    accuracy                           0.93       156
   macro avg       0.93      0.90      0.92       156
weighted avg       0.93      0.93      0.93       156



In [45]:
joblib.dump(automl, 'Saved models & utils/autoMLPerform93.pkl')

['autoMLPerform93.pkl']