In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import classification_report

In [2]:
df = load_breast_cancer()

In [3]:
df.data.shape

(569, 30)

In [4]:
df.target.shape

(569,)

In [5]:
print(df.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [6]:
xtrain, xtest, ytrain, ytest = train_test_split(
    df.data, df.target, test_size=0.2, random_state=42, shuffle=True
)

In [7]:
xtrain.shape

(455, 30)

In [8]:
xtest.shape

(114, 30)

In [9]:
ytrain.shape

(455,)

In [10]:
ytest.shape

(114,)

In [11]:
scaler = StandardScaler()
ohe = OneHotEncoder()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [12]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [13]:
pipe = Pipeline([("classifire", None)])

In [14]:
param_grid = [
    {"classifire": [GaussianNB()]},
    {
        "classifire": [DecisionTreeClassifier()],
        "classifire__criterion": ["gini", "entropy"],
        "classifire__splitter": ["best", "random"],
        "classifire__max_depth": [2, 4, 6, 8, 10, 12, 20, 40, 50, 70, 90, None],
        "classifire__max_features": [None, "auto", "log2"],
    },
    {
        "classifire": [SVC()],
        "classifire__kernel": ["linear", "poly", "rbf", "sigmoid"],
        "classifire__gamma": ["scale", "auto"],
        "classifire__decision_function_shape": ["ovo", "ovr"],
    },
    {
        "classifire": [KNeighborsClassifier()],
        "classifire__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        "classifire__weights": ["uniform", "distance", None],
        "classifire__n_jobs": [1, 2, 3, 4, 5, 6, 7, 8, 9, None],
    },
    {
        "classifire": [RandomForestClassifier()],
        "classifire__n_estimators": [100, 120, 130, 140, 150, 200, 300, 400],
        "classifire__criterion": ["gini", "entropy", "log_loss"],
        "classifire__max_features": ["sqrt", "log2"],
    },
]
param_grid

[{'classifire': [GaussianNB()]},
 {'classifire': [DecisionTreeClassifier()],
  'classifire__criterion': ['gini', 'entropy'],
  'classifire__splitter': ['best', 'random'],
  'classifire__max_depth': [2, 4, 6, 8, 10, 12, 20, 40, 50, 70, 90, None],
  'classifire__max_features': [None, 'auto', 'log2']},
 {'classifire': [SVC()],
  'classifire__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
  'classifire__gamma': ['scale', 'auto'],
  'classifire__decision_function_shape': ['ovo', 'ovr']},
 {'classifire': [KNeighborsClassifier()],
  'classifire__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
  'classifire__weights': ['uniform', 'distance', None],
  'classifire__n_jobs': [1, 2, 3, 4, 5, 6, 7, 8, 9, None]},
 {'classifire': [RandomForestClassifier()],
  'classifire__n_estimators': [100, 120, 130, 140, 150, 200, 300, 400],
  'classifire__criterion': ['gini', 'entropy', 'log_loss'],
  'classifire__max_features': ['sqrt', 'log2']}]

In [15]:
gs = GridSearchCV(
    pipe, param_grid=param_grid, cv=10, scoring="accuracy", verbose=1, n_jobs=1
)
gs

In [None]:
gs.fit(xtrain, ytrain)

In [17]:
gs.best_estimator_

In [18]:
gs.best_params_

{'classifire': SVC(decision_function_shape='ovo'),
 'classifire__decision_function_shape': 'ovo',
 'classifire__gamma': 'scale',
 'classifire__kernel': 'rbf'}

In [20]:
pred = gs.predict(xtest)

In [21]:
print(classification_report(ytest,pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.99        71

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

