# Machine Learning - ASSIGNMENT 2: CLASSIFICATION

## Implement multiple classification models - 
## Build an interactive Streamlit web application to demonstrate your models

STUDENT INFORMATION (REQUIRED - DO NOT DELETE)

BITS ID : [2025ab05083]

Name    : [SUJOY SAHA]

Email   : [2025ab05083@wilp.bits-pilani.ac.in]

Date    : [15-02-2026]

In [34]:
# Import Required Libraries

import os
import pickle

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### 1.1 Dataset Selection and Loading

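Dataset: Breast Cancer Wisconsin (Diagnostic), loaded from a local CSV copy (`breast_cancer_data.csv`) of the public Kaggle dataset linked in the cell below.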

In [35]:
# Provenance of the CSV that is read below.
data_source_public = "https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data"

# Load CSV (path is relative to the notebook's working directory)
data = pd.read_csv("breast_cancer_data.csv")

# Remove unwanted columns: the row identifier (if present) and any column
# whose name starts with "Unnamed".
data = data.drop(columns=["id"], errors="ignore")
data = data.loc[:, ~data.columns.str.contains("^Unnamed")]

# Encode the diagnosis labels.
# NOTE: malignant ("M") becomes 0 and benign ("B") becomes 1, so the class
# labelled 1 -- the one scikit-learn's precision/recall/F1 treat as positive
# by default -- is *benign*.
diagnosis_codes = {"M": 0, "B": 1}
diagnosis_clean = data["diagnosis"].str.strip().str.upper()

# Series.map silently produces NaN for any value missing from the mapping,
# which would only surface later as an obscure error during model fitting.
# Fail early with a readable message instead.
unknown_labels = set(diagnosis_clean.dropna().unique()) - set(diagnosis_codes)
if unknown_labels or diagnosis_clean.isna().any():
    raise ValueError(
        "Column 'diagnosis' must contain only 'M' or 'B'; "
        f"found unexpected labels {sorted(unknown_labels)} and "
        f"{int(diagnosis_clean.isna().sum())} missing value(s)."
    )
data["diagnosis"] = diagnosis_clean.map(diagnosis_codes)

# Define X (features) and y (encoded target)
X = data.drop("diagnosis", axis=1)
y = data["diagnosis"]

In [36]:
#data.head()
#data.isnull().sum()

### 1.2 Train-Test Split and Feature Scaling

In [37]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Standardise the features. The scaler learns its statistics from the
# training rows only and then applies them unchanged to both partitions,
# so nothing about the test rows influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 1.3 Define the Models

In [38]:
# ----------------------------
# Models
# ----------------------------
# Fixed seed for every estimator that accepts one, so that the metrics table
# is reproducible under "Restart Kernel -> Run All". Without it the decision
# tree and the random forest can produce different scores on each run.
# (42 is the same value the train/test split uses.)
SEED = 42

# Display name -> unfitted estimator. The names become the row labels of the
# results table and the keys of the pickled model dictionary.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=SEED),
    "Decision Tree"      : DecisionTreeClassifier(random_state=SEED),
    "KNN"                : KNeighborsClassifier(),   # no random component
    "Naive Bayes"        : GaussianNB(),             # no random component
    "Random Forest"      : RandomForestClassifier(random_state=SEED),
    "XGBoost"            : XGBClassifier(eval_metric="logloss", random_state=SEED)
}

# Filled by the training cell: display name -> {metric name: score}.
results = {}

### 2.1 Train and Evaluate the Models

In [39]:
# ----------------------------
# Train & Evaluate
# ----------------------------
# Fit every model on the scaled training data and score it on the scaled
# test data. One row of metrics per model is stored in `results`.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # Hard class predictions feed the threshold-based metrics ...
    y_pred = model.predict(X_test_scaled)
    # ... while AUC needs a continuous score: the predicted probability of
    # the class in column 1 of predict_proba.
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # Insertion order here is the column order of the final table.
    scores = {}
    scores["Accuracy"] = accuracy_score(y_test, y_pred)
    scores["AUC"] = roc_auc_score(y_test, y_prob)
    scores["Precision"] = precision_score(y_test, y_pred)
    scores["Recall"] = recall_score(y_test, y_pred)
    scores["F1"] = f1_score(y_test, y_pred)
    scores["MCC"] = matthews_corrcoef(y_test, y_pred)

    results[name] = scores

## Save the trained models and the scaler

In [40]:
# ----------------------------
# Save models & scaler
# ----------------------------
# open(..., "wb") does not create missing parent directories, so on a fresh
# checkout without a "model" folder the writes below would raise
# FileNotFoundError. Create the folder first (no-op if it already exists).
os.makedirs("model", exist_ok=True)

# The whole dictionary of fitted estimators goes into a single pickle file.
with open("model/saved_models.pkl", "wb") as f:
    pickle.dump(models, f)

# The fitted scaler is stored separately from the models.
with open("model/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# One row per model, one column per metric.
print(pd.DataFrame(results).T)

                     Accuracy       AUC  Precision    Recall        F1  \
Logistic Regression  0.973684  0.997380   0.972222  0.985915  0.979021   
Decision Tree        0.947368  0.943990   0.957746  0.957746  0.957746   
KNN                  0.947368  0.981985   0.957746  0.957746  0.957746   
Naive Bayes          0.964912  0.997380   0.958904  0.985915  0.972222   
Random Forest        0.964912  0.995087   0.958904  0.985915  0.972222   
XGBoost              0.956140  0.990829   0.958333  0.971831  0.965035   

                          MCC  
Logistic Regression  0.943898  
Decision Tree        0.887979  
KNN                  0.887979  
Naive Bayes          0.925285  
Random Forest        0.925285  
XGBoost              0.906379  


In [41]:
"""
Observation & Analysis:

Because the dataset is small and nearly linearly separable, complex ensemble models do not gain a significant advantage
over Logistic Regression in such cases.
"""

'\nObservation & Analysis:\n\nBecause the dataset is small and nearly linearly separable, complex ensemble models do not gain a significant advantage\nover Logistic Regression in such cases.\n'

In [42]:
# Round-trip check: read the pickled model dictionary back from disk and
# list its keys to confirm every model was saved.
# NOTE: pickle.load executes arbitrary code from the file; only load files
# that this notebook itself produced.
with open("model/saved_models.pkl", mode="rb") as model_file:
    loaded_models = pickle.load(model_file)

saved_model_names = loaded_models.keys()
print(saved_model_names)

dict_keys(['Logistic Regression', 'Decision Tree', 'KNN', 'Naive Bayes', 'Random Forest', 'XGBoost'])
