### Setup

In [4]:
# !pip install -q scikit-learn numpy pandas matplotlib seaborn

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

%matplotlib inline


### Load data and preprocessing

In [5]:
df = pd.read_csv("/content/WDBC.csv")  # If working from raw again

# Drop ID
df = df.drop(columns=[col for col in df.columns if col.lower() == "id"])
df["Diagnosis"] = df["Diagnosis"].replace({"M": 1, "B": 0})

y = df["Diagnosis"]
X = df.drop(columns=["Diagnosis"])

# Normalize features
scaler = StandardScaler()
Xs = scaler.fit_transform(X)


  df["Diagnosis"] = df["Diagnosis"].replace({"M": 1, "B": 0})


### Train-Test Split

In [6]:
# Use normalised features (Xs) and labels (y)
# If using PCA-reduced df:
X = df.drop(columns=["Diagnosis"])
y = df["Diagnosis"]

Xs_train, Xs_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=1
)

print("Train shape:", Xs_train.shape)
print("Test shape:", Xs_test.shape)


Train shape: (398, 30)
Test shape: (171, 30)


### ML Models

#### Logistic regression

In [7]:
log_reg = LogisticRegression()
log_reg.fit(Xs_train, y_train)

log_acc = log_reg.score(Xs_test, y_test)
print(f"Logistic Regression Accuracy: {log_acc:.2f}")


Logistic Regression Accuracy: 0.95


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Gaussian Naive Bayes

In [8]:
gnb_clf = GaussianNB()
gnb_clf.fit(Xs_train, y_train)

gnb_acc = gnb_clf.score(Xs_test, y_test)
print(f"Naive Bayes Accuracy: {gnb_acc:.2f}")


Naive Bayes Accuracy: 0.95


#### K Nearest neighbour

In [9]:
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(Xs_train, y_train)

knn_acc = knn_clf.score(Xs_test, y_test)
print(f"KNN Accuracy (k=3): {knn_acc:.2f}")


KNN Accuracy (k=3): 0.91


#### Suport Vector Machine

In [10]:
svm_clf = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', probability=True)
svm_clf.fit(Xs_train, y_train)

svm_acc = svm_clf.score(Xs_test, y_test)
print(f"SVM Accuracy: {svm_acc:.2f}")

SVM Accuracy: 0.63


### Results

In [11]:
print("Model Accuracy Comparison:")
print(f" - Logistic Regression: {log_acc:.2f}")
print(f" - Naive Bayes       : {gnb_acc:.2f}")
print(f" - KNN (k=3)          : {knn_acc:.2f}")
print(f" - SVM (RBF kernel)   : {svm_acc:.2f}")


Model Accuracy Comparison:
 - Logistic Regression: 0.95
 - Naive Bayes       : 0.95
 - KNN (k=3)          : 0.91
 - SVM (RBF kernel)   : 0.63
