In [2]:
# Breast Cancer Prediction 
# Using Logistic Regression with Scikit-learn
# Written by: Prakash R. Kota
# Written on: 06 Feb 2025
# Last update: 11 Feb 2025

# Data Set from
# Original: 
#     https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic
# With Header:
#     https://www.kaggle.com/code/nancyalaswad90/analysis-breast-cancer-prediction-dataset

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [4]:
# Load the raw data and build the feature matrix X
# and the label vector Y

import numpy as np
import pandas as pd

# Load the CSV file; pandas takes the first row as the header by default
df = pd.read_csv("./data/breastcancer.csv")

# Keep the column names (header row) in a list for reference
headers = df.columns.tolist()

# Features X: every column from position 2 onward, as a NumPy array.
# Columns 0 and 1 are left out (column 0 is presumably a record ID -
# TODO confirm against the CSV; column 1 is the label read below).
X = df.iloc[:, 2:].values

# Labels: column 1 only, as a NumPy array
Y_tmp = df.iloc[:, 1].values

# The encoding below maps "M" to 1.0 and *everything else* to 0.0, so a
# missing or misspelled label would silently be counted as "B".
# Fail loudly instead if anything other than "M" / "B" is present.
unexpected_labels = set(pd.unique(Y_tmp)) - {"M", "B"}
if unexpected_labels:
    raise ValueError(
        f"Unexpected label(s) in column 1: {sorted(map(repr, unexpected_labels))}"
    )

# Replace 'M' with 1.0 and 'B' with 0.0
Y = np.where(Y_tmp == "M", 1.0, 0.0).astype(np.float64)

In [5]:
# Hold out the first N_TEST_ROWS rows as the Test set and use every
# remaining row as the Training set.
# NOTE: this is a fixed, order-based split (no shuffling), so the Test
# set is only as representative as the first rows of the file happen to be.
N_TEST_ROWS = 26

# Test set: rows 1 to 26, that is index rows 0 to 25
X_test = X[:N_TEST_ROWS, :]
y_test = Y[:N_TEST_ROWS]

# Training set: row 27 onward, that is index row 26 up to the last row.
# The slice is open-ended so the total row count is not hard-coded.
X_train = X[N_TEST_ROWS:, :]
y_train = Y[N_TEST_ROWS:]

# Sanity check: together the two sets cover every row exactly once
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) == len(X_train) and len(y_test) == len(X_test)

In [6]:
# Standardize the features before fitting the model:
# scaling is very important for convergence of the
# algorithm and for a good fit

# Learn the scaling parameters from the Training data only
scaler = StandardScaler()
scaler.fit(X_train)

# Apply those same Training-set parameters to both sets,
# so the Test data is scaled exactly like the Training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Build a Logistic Regression classifier with its default settings
# and train it on the scaled Training set and its labels
lr_model = LogisticRegression()
lr_model.fit(X=X_train_scaled, y=y_train)

In [8]:
# Make Predictions with the known Training Set

# From the above Logistic Regression Model
# Predict the outputs for the whole Training set and show the first 25
print("""Print the Prediction for the
first 25 rows of the Training set""")
# predict() already returns class labels (the 0.0 / 1.0 values the model
# was fitted on), not probabilities, so no 0.5 threshold is needed
y_pred_train_raw = lr_model.predict(X_train_scaled)
# Cast the float labels to int for printing and comparison
y_pred_train = y_pred_train_raw.astype(int)
y_pred_train_list = y_pred_train.tolist()
print(y_pred_train_list[:25])

# Compare with the Actual output values from the Training set
print("""Print the Actual known outputs for the
first 25 rows of the Training set""")
y_train_list = y_train.astype(int).tolist()
print(y_train_list[:25])

# Find the Accuracy - Real vs Prediction, if real is known
# as in the case of the Training set
# Accuracy = fraction of rows where the prediction equals the actual label
accuracy_train = np.mean(y_pred_train == y_train)

print(f"Training set Prediction Accuracy: {accuracy_train*100:.2f}%")
print("")

# Alternate syntax for Accuracy calculation
# (score() gives a fraction between 0 and 1, not a percentage)
# print("Training set Prediction Accuracy:", lr_model.score(X_train_scaled, y_train))

Print the Prediction for the
first 25 rows of the Training set
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0]
Print the Actual known outputs for the
first 25 rows of the Training set
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0]
Training set Prediction Accuracy: 98.71%



In [11]:
# Make Predictions with the known Test Set

# From the above Logistic Regression Model
# Predict the outputs based on the Test set inputs
print("""Print the Prediction for all the
26 rows of the Test set""")
# predict() already returns class labels (the 0.0 / 1.0 values the model
# was fitted on), not probabilities, so no 0.5 threshold is needed
y_pred_test_raw = lr_model.predict(X_test_scaled)
# Cast the float labels to int for printing and comparison
y_pred_test = y_pred_test_raw.astype(int)
y_pred_test_list = y_pred_test.tolist()
print(y_pred_test_list)

# Compare with the Actual output values from the Test set
print("""Print the Actual known outputs for the
26 rows of the Test set""")
y_test_list = y_test.astype(int).tolist()
print(y_test_list)


# Find the Accuracy - Real vs Prediction, if real is known
# as in the case of the Test set
# Accuracy = fraction of rows where the prediction equals the actual label
accuracy_test = np.mean(y_pred_test == y_test)

print(f"Test set Prediction Accuracy: {accuracy_test*100:.2f}%")
print("")

# Alternate syntax for Accuracy calculation
# (score() gives a fraction between 0 and 1, not a percentage)
# print("Test set Prediction Accuracy:", lr_model.score(X_test_scaled, y_test))

Print the Prediction for all the
26 rows of the Test set
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1]
Print the Actual known outputs for the
26 rows of the Test set
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1]
Test set Prediction Accuracy: 100.00%

