In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import log_loss, accuracy_score

In [4]:
df = pd.read_csv('../Cases/Wisconsin/BreastCancer.csv')
df.head()

Unnamed: 0,Code,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
0,61634,5,4,3,1,2,2,2,3,1,Benign
1,63375,9,1,2,6,4,10,7,7,2,Malignant
2,76389,10,4,7,2,2,8,6,1,1,Malignant
3,95719,6,10,10,10,8,10,7,10,7,Malignant
4,128059,1,1,1,1,2,5,5,1,1,Benign


In [5]:
X = df.iloc[:, 1:-1]
y = df['Class']

In [6]:
le = LabelEncoder()
y = le.fit_transform(y)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [16]:
svm = SVC(probability=True, random_state=42)
lr = LogisticRegression()
nb = GaussianNB()

voting = VotingClassifier(estimators=[('SVM', svm),
                                     ('LR', lr),
                                     ('NB', nb)],
                          voting='soft')
voting.fit(X_train, y_train)

In [17]:
y_pred_prob = voting.predict_proba(X_test)[:, 1]
log_loss(y_test, y_pred_prob)

0.08018728882045023

In [18]:
y_pred = voting.predict(X_test)
accuracy_score(y_test, y_pred)

0.9714285714285714

In [22]:
voting.get_params()

{'estimators': [('SVM', SVC(probability=True, random_state=42)),
  ('LR', LogisticRegression()),
  ('NB', GaussianNB())],
 'flatten_transform': True,
 'n_jobs': None,
 'verbose': False,
 'voting': 'soft',
 'weights': None,
 'SVM': SVC(probability=True, random_state=42),
 'LR': LogisticRegression(),
 'NB': GaussianNB(),
 'SVM__C': 1.0,
 'SVM__break_ties': False,
 'SVM__cache_size': 200,
 'SVM__class_weight': None,
 'SVM__coef0': 0.0,
 'SVM__decision_function_shape': 'ovr',
 'SVM__degree': 3,
 'SVM__gamma': 'scale',
 'SVM__kernel': 'rbf',
 'SVM__max_iter': -1,
 'SVM__probability': True,
 'SVM__random_state': 42,
 'SVM__shrinking': True,
 'SVM__tol': 0.001,
 'SVM__verbose': False,
 'LR__C': 1.0,
 'LR__class_weight': None,
 'LR__dual': False,
 'LR__fit_intercept': True,
 'LR__intercept_scaling': 1,
 'LR__l1_ratio': None,
 'LR__max_iter': 100,
 'LR__multi_class': 'auto',
 'LR__n_jobs': None,
 'LR__penalty': 'l2',
 'LR__random_state': None,
 'LR__solver': 'lbfgs',
 'LR__tol': 0.0001,
 'LR__v

In [23]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
params = {'SVM__gamma': ['scale', 'auto'],
          'SVM__C': np.linspace(0.001, 5, 10),
          'LR__penalty': ['l1', 'l2', 'elastic', None],
          'NB__var_smoothing': np.linspace(0.0001, 0.999, 10)}
gcv = GridSearchCV(voting, param_grid=params, verbose=3, cv=kfold, scoring='neg_log_loss')
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits
[CV 1/5] END LR__penalty=l1, NB__var_smoothing=0.0001, SVM__C=0.001, SVM__gamma=scale;, score=nan total time=   0.1s
[CV 2/5] END LR__penalty=l1, NB__var_smoothing=0.0001, SVM__C=0.001, SVM__gamma=scale;, score=nan total time=   0.1s
[CV 3/5] END LR__penalty=l1, NB__var_smoothing=0.0001, SVM__C=0.001, SVM__gamma=scale;, score=nan total time=   0.1s
[CV 4/5] END LR__penalty=l1, NB__var_smoothing=0.0001, SVM__C=0.001, SVM__gamma=scale;, score=nan total time=   0.1s
[CV 5/5] END LR__penalty=l1, NB__var_smoothing=0.0001, SVM__C=0.001, SVM__gamma=scale;, score=nan total time=   0.1s
[CV 1/5] END LR__penalty=l1, NB__var_smoothing=0.0001, SVM__C=0.001, SVM__gamma=auto;, score=nan total time=   0.1s
[CV 2/5] END LR__penalty=l1, NB__var_smoothing=0.0001, SVM__C=0.001, SVM__gamma=auto;, score=nan total time=   0.1s
[CV 3/5] END LR__penalty=l1, NB__var_smoothing=0.0001, SVM__C=0.001, SVM__gamma=auto;, score=nan total time=   0.1s
[CV

2000 fits failed out of a total of 4000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1000 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dai/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dai/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dai/anaconda3/lib/python3.11/site-packages/sklearn/ensemble/_voting.py", line 349, in fit
    return super().fit(X, transformed_y, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [24]:
print(gcv.best_params_)

{'LR__penalty': 'l2', 'NB__var_smoothing': 0.3330666666666666, 'SVM__C': 2.778222222222222, 'SVM__gamma': 'auto'}


In [None]:
bm = gcv.best_estimator_