In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
# Column names, in file order, from breast-cancer-wisconsin.names
col_names = [
    "Sample_code_number", "Clump_Thickness", "Uniformity_Cell_Size",
    "Uniformity_Cell_Shape", "Marginal_Adhesion", "Single_Epi_Cell_Size",
    "Bare_Nuclei", "Bland_Chromatin", "Normal_Nucleoli", "Mitoses", "Class"
]

# Load dataset. Missing values are written as "?" in the file; declaring that
# via na_values lets pandas parse them as NaN at read time instead of leaving
# the affected column as strings that need a frame-wide replace afterwards.
data_raw = pd.read_csv("wisconsin.data", names=col_names, na_values="?")

# Original class codes -> binary target: Benign=2 → 0, Malignant=4 → 1
class_codes = {2: 0, 4: 1}

# Series.map silently turns any label missing from the mapping into NaN,
# so reject unexpected labels explicitly before mapping.
unexpected_labels = data_raw.loc[
    ~data_raw["Class"].isin(list(class_codes)), "Class"
].unique()
if len(unexpected_labels) > 0:
    raise ValueError(f"Unexpected Class labels: {unexpected_labels.tolist()}")

# to_numeric is kept as a guard: it raises if Bare_Nuclei still contains
# non-numeric text other than the "?" marker handled above.
bare_nuclei = pd.to_numeric(data_raw["Bare_Nuclei"])

# Build the cleaned frame in one pass from data_raw (no in-place mutation):
#   - fill missing Bare_Nuclei values with the column median
#   - map the class codes to 0/1
#   - drop the ID column
# NOTE(review): the median is computed over the full dataset, i.e. before the
# train/test split, so test rows influence the fill value. Consider imputing
# inside a pipeline fitted on the training split only.
data = (
    data_raw
    .assign(
        Bare_Nuclei=bare_nuclei.fillna(bare_nuclei.median()),
        Class=data_raw["Class"].map(class_codes),
    )
    .drop(columns="Sample_code_number")
)

print("Class distribution:\n", data["Class"].value_counts(normalize=True))

Class distribution:
 Class
0    0.655222
1    0.344778
Name: proportion, dtype: float64


In [4]:
# Baseline accuracy: share of the most frequent class, i.e. the score of a
# classifier that always predicts that class.
class_shares = data["Class"].value_counts(normalize=True)
baseline_acc = class_shares.max()
print("Baseline accuracy (always predicting Benign):", baseline_acc)

# Cross-check: fraction of rows labelled 0 (Benign), computed directly
is_benign = data["Class"].eq(0)
prop_benign = is_benign.mean()
print("Proportion of class Benign:", prop_benign)
print("If always predict Benign → accuracy:", round(prop_benign, 2))

Baseline accuracy (always predicting Benign): 0.6552217453505007
Proportion of class Benign: 0.6552217453505007
If always predict Benign → accuracy: 0.66


In [5]:
# Separate the target column from the predictor columns
y = data["Class"]
X = data.drop(columns=["Class"])

# Hold out 20% of the rows for testing, stratifying on the target;
# random_state is fixed so the split is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=43)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [6]:
# Train a logistic-regression classifier (liblinear solver, at most 1000
# iterations) on the training split; fit() returns the fitted estimator.
clf = LogisticRegression(solver="liblinear", max_iter=1000).fit(X_train, y_train)

# Score the fitted model on both splits
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)


Train accuracy: 0.9660107334525939
Test accuracy: 0.9428571428571428


In [7]:
# Class predictions of the fitted model on each split
y_train_pred, y_test_pred = clf.predict(X_train), clf.predict(X_test)

# Confusion matrices of true labels against predicted labels
cm_train = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
cm_test = confusion_matrix(y_true=y_test, y_pred=y_test_pred)

# Report both matrices
print("Confusion matrix (Train):\n", cm_train)
print("Confusion matrix (Test):\n", cm_test)


Confusion matrix (Train):
 [[357   9]
 [ 10 183]]
Confusion matrix (Test):
 [[90  2]
 [ 6 42]]
