# **1. Loading and Preprocessing (2 marks)**

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset object
data = load_breast_cancer()

# Feature matrix and target vector (0: malignant, 1: benign)
X, y = data.data, data.target

# Names of the features, as provided by the dataset object
feature_names = data.feature_names

# Preprocessing Steps

In [None]:
import numpy as np

# Count NaN entries in the feature matrix and in the target vector
missing_in_features = np.isnan(X).sum()
missing_in_target = np.isnan(y).sum()

# Report both counts
print(f"Number of missing values in features: {missing_in_features}")
print(f"Number of missing values in target: {missing_in_target}")

Number of missing values in features: 0
Number of missing values in target: 0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling so the held-out test set cannot influence the scaler's
# mean/variance. Fitting the scaler on all of X leaks test-set statistics into
# the training features and can make the reported test accuracies optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features: learn the scaling parameters from the training
# split only, then apply that same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any cell that references X_scaled:
# the full feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)

# **2. Classification Algorithm Implementation (5 marks)**

# 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the classifier and fit it on the training split in one step
lr = LogisticRegression(
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Accuracy on the held-out test split
lr_score = lr.score(X_test, y_test)
print(f"Logistic Regression Accuracy: {lr_score:.4f}")

Logistic Regression Accuracy: 0.9825


# 2. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Build a depth-limited tree and fit it on the training split in one step
dt = DecisionTreeClassifier(
    random_state=42,
    max_depth=3,
).fit(X_train, y_train)

# Accuracy on the held-out test split
dt_score = dt.score(X_test, y_test)
print(f"Decision Tree Accuracy: {dt_score:.4f}")

Decision Tree Accuracy: 0.9386


# 3. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree forest and fit it on the training split in one step
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Accuracy on the held-out test split
rf_score = rf.score(X_test, y_test)
print(f"Random Forest Accuracy: {rf_score:.4f}")

Random Forest Accuracy: 0.9561


# 4. Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

# Build an RBF-kernel SVM and fit it on the training split in one step
svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    random_state=42,
).fit(X_train, y_train)

# Accuracy on the held-out test split
svm_score = svm.score(X_test, y_test)
print(f"SVM Accuracy: {svm_score:.4f}")

SVM Accuracy: 0.9825


# 5. k-Nearest Neighbors (k-NN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build a 5-neighbor classifier and fit it on the training split in one step
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Accuracy on the held-out test split
knn_score = knn.score(X_test, y_test)
print(f"k-NN Accuracy: {knn_score:.4f}")

k-NN Accuracy: 0.9649


# **3. Model Comparison (2 marks)**


Algorithm	Accuracy	Rank	Performance Notes
Logistic Regression	98.25%	1 (tie)	Tied for best accuracy; simple yet effective
SVM	98.25%	1 (tie)	Tied with Logistic Regression for best accuracy
k-NN	96.49%	3	Moderate performance
Random Forest	95.61%	4	Good balance of accuracy and robustness
Decision Tree	93.86%	5	Worst performing, prone to overfitting