# Chapter 6: Best Practices and Hyperparameter Tuning Applications

In [15]:
# import necessary libraries
import pandas as pd
import numpy as np

In [16]:
# Load the Breast Cancer Wisconsin (Diagnostic) dataset from the UCI repository.
# The raw file is read with header=None, so pandas labels the columns 0, 1, 2, ...
from sklearn.datasets import load_breast_cancer
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)


In [17]:
# Column 0 is the sample ID and column 1 is the diagnosis (M = malignant,
# B = benign); every remaining column is a feature.
# The diagnosis letters must be converted to numeric class labels.

from sklearn.preprocessing import LabelEncoder  # encodes the categorical target

# Target variable: encode 'M'/'B' as 1/0
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, 1].values)

# Features: everything after the ID and diagnosis columns
X = df.iloc[:, 2:].values

# To double-check the mapping, inspect `le.classes_` or call
# `le.transform(['M', 'B'])`.





In [18]:
# Hold out 20% of the samples for testing and keep 80% for training.
# stratify=y keeps the class proportions the same in both subsets;
# random_state fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [19]:
# Chain the preprocessing steps and the classifier into a single estimator

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Steps run in order: standardize the features, reduce them to two
# principal components, then fit a logistic regression on the result.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1),
)

# Fit on the training split, then evaluate on the held-out test split
score = pipe_lr.fit(X_train, y_train).score(X_test, y_test)
print(f'Test Accuracy: {score:.3f}')

Test Accuracy: 0.956


# K-Fold Cross-Validation

In [22]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Manual stratified 10-fold cross-validation on the training set.
#
# Each fold trains a fresh, unfitted copy of the pipeline (via `clone`) instead
# of refitting `pipe_lr` itself. Refitting in place would leave `pipe_lr`
# trained on only the last fold's subset once the loop ends, silently replacing
# the model that was fitted on the full training set.
#
# The loop variables are named `train_idx` / `val_idx` / `fold_acc` so they do
# not shadow the held-out test split or overwrite the test-set `score`.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores = []
for k, (train_idx, val_idx) in enumerate(kfold):
    fold_pipe = clone(pipe_lr)  # same hyperparameters, no fitted state
    fold_pipe.fit(X_train[train_idx], y_train[train_idx])
    fold_acc = fold_pipe.score(X_train[val_idx], y_train[val_idx])
    scores.append(fold_acc)
    # np.bincount shows how many samples of each class the fold trained on
    print(f'Fold: {k+1}, Class dist.: {np.bincount(y_train[train_idx])}, Acc: {fold_acc:.3f}')

# Summarize the per-fold accuracies
mean_score = np.mean(scores)
std_score = np.std(scores)
print(f'\nCV accuracy: {mean_score:.3f} +/- {std_score:.3f}')

Fold: 1, Class dist.: [256 153], Acc: 0.935
Fold: 2, Class dist.: [256 153], Acc: 0.935
Fold: 3, Class dist.: [256 153], Acc: 0.957
Fold: 4, Class dist.: [256 153], Acc: 0.957
Fold: 5, Class dist.: [256 153], Acc: 0.935
Fold: 6, Class dist.: [257 153], Acc: 0.956
Fold: 7, Class dist.: [257 153], Acc: 0.978
Fold: 8, Class dist.: [257 153], Acc: 0.933
Fold: 9, Class dist.: [257 153], Acc: 0.956
Fold: 10, Class dist.: [257 153], Acc: 0.956

CV accuracy: 0.950 +/- 0.014
