1. Loading and Prepoessing

In [3]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# Load the dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [5]:
# Check for missing values
missing_values = X.isnull().sum().sum()
print(f" missing values: {missing_values}")

 missing values: 0


In [6]:
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

2. Classification Algorithm Implementation

In [7]:
# Logistic Regression
# Logistic Regression is a supervised learning algorithm used for binary classification problems.It calculates the probability that a given input
# belongs to a class using the sigmoid (logistic) function.

#How it works:
# Logistic Regression models the probability that a data point belongs to a particular class using the logistic (sigmoid) function.
# It's a linear classifier that estimates coefficients to fit a hyperplane separating the classes.

# Why suitable for this dataset?
#The Breast Cancer dataset is a binary classification problem (Malignant = 0, Benign = 1). Logistic Regression is ideal when:
# - The relationship between features and the target is linear.
# - You want a simple and interpretable model.

In [8]:
from sklearn.linear_model import LogisticRegression
lr_model=LogisticRegression()
lr_model.fit(X_train,y_train)

In [9]:
lr_model = lr_model.predict(X_test)
lr_model

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0])

In [10]:
## Classification report
print("Classification Report:\n", classification_report(y_test, lr_model))

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [11]:
# Confusion matrix
model = LogisticRegression()
model.fit(X_train, y_train)
result = model.predict(X_test)
print(confusion_matrix(y_test,result))

[[41  2]
 [ 1 70]]


In [15]:
# Desicion Tree Classifier
# A Decision Tree is a flowchart-like structure. It splits the dataset based on feature values using rules (like Gini Index or Entropy) and assigns
# labels at the leaf nodes.

# How it works:
# Logistic Regression models the probability that a data point belongs to a particular class using the logistic (sigmoid) function. It's a linear
# classifier that estimates coefficients to fit a hyperplane separating the classes.

# Why suitable for this dataset?
# - Handles both numerical and categorical data.
# - Easy to interpret.
# - Works well even if data is not scaled (though we already scaled it).
# - Can capture non-linear patterns between features and output.

In [16]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

In [17]:
dt_result = dtc.predict(X_test)
dt_result

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0])

In [18]:
# Classification report
print("Classification Report:\n", classification_report(y_test, dt_result))

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [19]:
# Confusion report
print(confusion_matrix(y_test, dt_result))

[[40  3]
 [ 3 68]]


In [20]:
# Random Forest Classifier(RFC)
# A Random Forest is an ensemble learning method that builds multiple decision trees and combines their outputs. Each tree is trained on a random subset
# of the data and features. The final prediction is usually made by majority voting.

# How it works:
# An ensemble of decision trees trained on different random subsets of the data and features. Predictions are made by majority voting across trees.

# Why suitable for this dataset?
# - Handles non-linear relationships well.
# - Less prone to overfitting than a single decision tree.
# - Performs well on medium-sized datasets like breast cancer.
# - Works well with imbalanced or noisy data.

In [21]:
from sklearn.ensemble import RandomForestClassifier
rfr_model = RandomForestClassifier()
rfr_model.fit(X_train, y_train)

In [22]:
rfr_result=rfr_model.predict(X_test)
rfr_result

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0])

In [23]:
# Classification report
print("Classification Report:\n", classification_report(y_test, rfr_result))

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [25]:
# Confusion matrix
confusion_matrix(y_test,rfr_model.predict(X_test))

array([[40,  3],
       [ 1, 70]])

In [None]:
# Support Vector Machine (SVM)
#A Support Vector Machine (SVM) is a supervised machine learning algorithm that tries to find the best boundary (hyperplane) that separates the classes with the maximum margin.
# In case the data is not linearly separable, SVM can use kernel tricks to transform the data into higher dimensions.

# How it works:
# SVM finds the optimal hyperplane that maximally separates the two classes in the feature space.
# It can also handle non-linear data using kernel functions.

# Why suitable for this dataset?
# - Works well for binary classification (like benign vs malignant).
# - Effective in high-dimensional spaces.
# - Performs best when data is properly scaled (which we've done).
# - Resistant to overfitting, especially when using regularization.

In [27]:
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_train, y_train)

In [28]:
svc_result=svc_model.predict(X_test)
svc_result

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0])

In [29]:
# Classification report
print(classification_report(y_test,svc_result))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [30]:
# Confusion Matrix
confusion_matrix(y_test,svc_result)

array([[41,  2],
       [ 1, 70]])

In [None]:
# k-Nearest Neighbors (k-NN)
# k-Nearest Neighbors (k-NN) is a simple instance-based learning algorithm. It classifies a new data point based on the majority label of its k closest neighbors in the training dataset (using distance measures like Euclidean
# distance).

# How it works:
# For a new data point, k-NN looks at the k closest training examples (based on distance) and predicts the majority class among them.

# Why suitable for this dataset?
# - Effective on small to medium-sized datasets like Breast Cancer.
# - Makes no assumption about the data distribution.
# - Works well if features are scaled (which we’ve done using StandardScaler).
# - Easy to implement and understand.

In [31]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [33]:
knn_result=knn_model.predict(X_test)
knn_result

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0])

In [34]:
# Classification Report
print(classification_report(y_test,knn_result))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [35]:
# Confusion Matrix
confusion_matrix(y_test,knn_result)

array([[40,  3],
       [ 3, 68]])

3. Model Comparison

In [36]:
from sklearn.metrics import accuracy_score

# Define model names and instances in a list
model_list = [
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("SVM", SVC()),
    ("k-NN", KNeighborsClassifier())
]

# Train, predict, and store results
results = []

for name, model in model_list:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append((name, acc))

# Create a DataFrame to display results
import pandas as pd

df_results = pd.DataFrame(results, columns=["Model", "Accuracy"])
df_results = df_results.sort_values("Accuracy", ascending=False)

print(df_results)

                 Model  Accuracy
0  Logistic Regression  0.973684
3                  SVM  0.973684
2        Random Forest  0.964912
4                 k-NN  0.947368
1        Decision Tree  0.938596


In [40]:
best_performance = df_results.iloc[0]
worst_performance = df_results.iloc[-1]
print(f" Best_performance: {best_performance['Model']} ({best_performance['Accuracy']:.4f})")
print(f" Worst_performance: {worst_performance['Model']} ({worst_performance['Accuracy']:.4f})")

 Best_performance: Logistic Regression (0.9737)
 Worst_performance: Decision Tree (0.9386)
