"""
================================================================================
STUDENT INFORMATION
================================================================================

BITS ID: 2025AA05840
Name: SHUBAM KUMAR AWASTHI
Email: 2025aa05840@wilp.bits-pilani.ac.in
Date: 15/02/2026

================================================================================
"""

# ML Assignment 2 – Breast Cancer Classification

## Problem Statement
The aim of this assignment is to design, implement, and evaluate multiple machine learning classification models on a biomedical dataset. The task involves building an interactive Streamlit web application that allows users to upload test data, select models, and view evaluation metrics. The project demonstrates an end-to-end ML workflow: data preprocessing, model training, evaluation, UI development, and deployment on Streamlit Community Cloud.

## Dataset Description
We selected the **Breast Cancer Wisconsin (Diagnostic) dataset** from the UCI Machine Learning Repository.  
- **Instances:** 569 patient records  
- **Features:** 30 numeric attributes describing cell nuclei characteristics (e.g., radius, texture, concavity, symmetry)  
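- **Target Variable:** Binary classification — Malignant (0) vs. Benign (1), following the label encoding of scikit-learn's `load_breast_cancer` (`target_names = ['malignant', 'benign']`)  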
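- **Rationale:** The dataset is widely used for benchmarking classification algorithms, contains sufficient features (>12), and meets the minimum instance requirement (>500). Its classes are moderately imbalanced (roughly 63% benign vs. 37% malignant), so a stratified train/test split and imbalance-aware metrics such as MCC and AUC are used alongside accuracy to keep the evaluation reliable across models.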

In [1]:
# Step 1: Load Dataset
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the dataset bundled with scikit-learn and wrap it in pandas objects
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
# Name the Series so that value_counts()/concat output carries a meaningful
# label instead of an anonymous column
y = pd.Series(data.target, name="target")

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
# target_names is ordered by label value: entry 0 names label 0 and entry 1
# names label 1
print("Target classes:", data.target_names)

# Quick check: display the first rows of the feature table
X.head()

Shape of X: (569, 30)
Shape of y: (569,)
Target classes: ['malignant' 'benign']


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Step 2: Train & Evaluate Models

In [2]:
# Step 2: Train & Evaluate Models (Clean Version)
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Load the dataset again and wrap features and labels in pandas objects
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name="target")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report the split sizes and the class balance of the held-out rows
print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Test set class counts:\n", y_test.value_counts())


# Standardise the features (important for Logistic Regression & kNN).
# The scaler learns its statistics from the training split only and is then
# applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers, keyed by the display name used in the results table.
# Seeds are fixed wherever the estimator accepts one so runs are repeatable.
models = {
    "Logistic Regression": LogisticRegression(
        solver="lbfgs",
        max_iter=2000,
        random_state=42,
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42, eval_metric="logloss"),
}

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place, so the object passed in is trained when
    this function returns.

    Precision, recall and F1 are computed with scikit-learn's default
    positive label (``1``), so the labels are expected to be binary.

    Args:
        model: Classifier exposing ``fit`` and ``predict``. If it also exposes
            ``predict_proba`` or ``decision_function``, that output is used
            as the ranking score for AUC.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: True labels for ``X_test``.

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall", "F1", "MCC"
        and "AUC". "AUC" is NaN when the model offers no continuous score.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # AUC needs a continuous ranking score rather than hard predictions.
    # Prefer the probability of the second class (column 1); otherwise fall
    # back to the signed decision values, which roc_auc_score also accepts
    # for binary problems.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_score) if y_score is not None else np.nan,
    }
    return metrics

# Fit and score every candidate on the internal (held-out) test split
results = {}
for name, model in models.items():
    results[name] = evaluate_model(
        model=model,
        X_train=X_train_scaled,
        X_test=X_test_scaled,
        y_train=y_train,
        y_test=y_test,
    )

# Tabulate the scores: one row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print("\nEvaluation Metrics:\n", results_df)


Train set shape: (455, 30)
Test set shape: (114, 30)
Test set class counts:
 target
1    72
0    42
Name: count, dtype: int64

Evaluation Metrics:
                      Accuracy  Precision    Recall        F1       MCC  \
Logistic Regression  0.982456   0.986111  0.986111  0.986111  0.962302   
Decision Tree        0.912281   0.955882  0.902778  0.928571  0.817412   
kNN                  0.956140   0.958904  0.972222  0.965517  0.905447   
Naive Bayes          0.929825   0.944444  0.944444  0.944444  0.849206   
Random Forest        0.956140   0.958904  0.972222  0.965517  0.905447   
XGBoost              0.956140   0.946667  0.986111  0.965986  0.905824   

                          AUC  
Logistic Regression  0.995370  
Decision Tree        0.915675  
kNN                  0.978836  
Naive Bayes          0.986772  
Random Forest        0.993882  
XGBoost              0.990079  
