In [1]:
#import all the required libraries

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
# load the dataset and assign X and Y variables
dataset = load_breast_cancer()
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.Series(dataset.target)

In [4]:
# Perform the Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
# Decision tree
dt = DecisionTreeClassifier()
params = {'criterion': ['gini', 'entropy'],                   # splitting rule
          'max_depth': [None, 3, 5, 10, 20],                  # how deep the tree can go
          'min_samples_split': [2, 5, 10, 20],                # min samples needed to split a node
          'min_samples_leaf': [1, 2, 4, 10],                  # min samples required at a leaf
          'max_features': [None, 'sqrt', 'log2'],             # features considered when splitting
          'class_weight': [None, 'balanced']}                 # balance class distribution
grid_search = GridSearchCV(estimator=dt, param_grid=params, cv=5)
grid_search.fit(X_train, y_train)
dt_pred = grid_search.predict(X_test)
dt_acc = accuracy_score(y_test, dt_pred)
dt_cm = confusion_matrix(y_test, dt_pred)
dt_cr = classification_report(y_test, dt_pred)

In [6]:
# Logistic regression
lr = LogisticRegression()

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression())                
])

params = {
    'logreg__penalty': ['l2'],                      # type of regularization
    'logreg__C': [0.01, 0.1, 1],                    # inverse of regularization strength (smaller = stronger regularization)
    'logreg__solver': ['lbfgs', 'liblinear'],       # optimization algorithm (depends on penalty type)
    'logreg__max_iter': [500, 1000]}                # max number of iterations for solver to converge

grid_search = GridSearchCV(pipe, param_grid=params, cv=5)
grid_search.fit(X_train, y_train)
lr_pred = grid_search.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)
lr_cm = confusion_matrix(y_test, lr_pred)
lr_cr = classification_report(y_test, lr_pred)

In [7]:
# KNN
knn = KNeighborsClassifier()
params = {
    'n_neighbors': [3, 5, 7, 9, 11, 15],                    # number of neighbors to consider
    'weights': ['uniform', 'distance'],                     # all neighbors equal vs closer neighbors weighted more
    'metric': ['euclidean', 'manhattan', 'minkowski'],      # distance function
    'p': [1, 2]}                                            # power parameter (1=Manhattan, 2=Euclidean, only for Minkowski)

grid_search = GridSearchCV(estimator=knn, param_grid=params, cv=5)
grid_search.fit(X_train, y_train)
knn_pred = grid_search.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)
knn_cm = confusion_matrix(y_test, knn_pred)
knn_cr = classification_report(y_test, knn_pred)

In [8]:
# Perform Model evaluation for Decision Tree
print('Decision Tree:\n')
print(f'Accuracy: {dt_acc:.4f}\n')
print('Confusion matrix:\n', dt_cm)
print('\nClassification report:\n', dt_cr)

Decision Tree:

Accuracy: 0.9580

Confusion matrix:
 [[47  3]
 [ 3 90]]

Classification report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94        50
           1       0.97      0.97      0.97        93

    accuracy                           0.96       143
   macro avg       0.95      0.95      0.95       143
weighted avg       0.96      0.96      0.96       143



In [9]:
# Perform Model evaluation for Logistic Regression
print('\nLogistic Regression:\n')
print(f'Accuracy: {lr_acc:.4f}\n')
print('Confusion matrix:\n', lr_cm)
print('\nClassification report:\n', lr_cr)


Logistic Regression:

Accuracy: 0.9860

Confusion matrix:
 [[49  1]
 [ 1 92]]

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        50
           1       0.99      0.99      0.99        93

    accuracy                           0.99       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.99      0.99      0.99       143



In [10]:
# Perform Model evaluation for KNN
print('\nKNN:\n')
print(f'Accuracy: {knn_acc:.4f}\n')
print('Confusion matrix:\n', knn_cm)
print('\nClassification report:\n', knn_cr)


KNN:

Accuracy: 0.9441

Confusion matrix:
 [[46  4]
 [ 4 89]]

Classification report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92        50
           1       0.96      0.96      0.96        93

    accuracy                           0.94       143
   macro avg       0.94      0.94      0.94       143
weighted avg       0.94      0.94      0.94       143

