In [1]:
import sklearn
from sklearn.datasets import get_data_home
from sklearn.datasets import load_breast_cancer
import time
import pandas as pd

In [2]:
X,y = load_breast_cancer(return_X_y=True)

We can use a threshold to select feature that increase the variance and decrease the biais. This filtering improve the risk of overfitting, we may stay aware of that possibility during the evaluation of the model.

In [3]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_sel = sel.fit_transform(X)

The feature selection kept 11 columns on 30.

In [4]:
X_sel.shape, X.shape

((569, 11), (569, 30))

In [5]:
pd.Series(y).value_counts()

1    357
0    212
dtype: int64

We can split the data in train and test samples, to evaluate models.

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
sss.get_n_splits(X_sel, y)

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

For a quick overview, now we can test many classifiers and compare time for fitting and some metrics of evaluation for each modeling.

In [7]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [8]:
models = {
    "SVC": SVC,
    "Linear": LogisticRegression,
    "SGDC": SGDClassifier,
    "KNeighborsClassifier": KNeighborsClassifier,
    "RandomForestClassifier": RandomForestClassifier,
    "GaussianProcessClassifier": GaussianProcessClassifier,
    "GaussianNB": GaussianNB,
    "MLPClassifier": MLPClassifier
         }

In [9]:
for name, model in models.items():
    start = time.time()
    clf = model().fit(X_train,y_train)
    end = time.time()
    y_pred = clf.predict(X=X_test)
    y_true = y_test
    cm = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    print(f"{name} : {end - start} seconds, accuracy : {acc}\nconfusion matrix : \n{cm}\n")

SVC : 0.013014554977416992 seconds, accuracy : 0.9122807017543859
confusion matrix : 
[[34  8]
 [ 2 70]]

Linear : 0.04766225814819336 seconds, accuracy : 0.9473684210526315
confusion matrix : 
[[39  3]
 [ 3 69]]

SGDC : 0.00325775146484375 seconds, accuracy : 0.8859649122807017
confusion matrix : 
[[38  4]
 [ 9 63]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier : 0.0003905296325683594 seconds, accuracy : 0.9122807017543859
confusion matrix : 
[[37  5]
 [ 5 67]]

RandomForestClassifier : 0.15900969505310059 seconds, accuracy : 0.956140350877193
confusion matrix : 
[[40  2]
 [ 3 69]]

GaussianProcessClassifier : 0.15772080421447754 seconds, accuracy : 0.8771929824561403
confusion matrix : 
[[37  5]
 [ 9 63]]

GaussianNB : 0.0011992454528808594 seconds, accuracy : 0.9210526315789473
confusion matrix : 
[[38  4]
 [ 5 67]]

MLPClassifier : 0.17904996871948242 seconds, accuracy : 0.9035087719298246
confusion matrix : 
[[38  4]
 [ 7 65]]



RandomForest and LogisticRegression are two classifiers that predict our data very well.