# Machine Learning - ASSIGNMENT 2: CLASSIFICATION

## Implement multiple classification models - 
## Build an interactive Streamlit web application to demonstrate your models

STUDENT INFORMATION (REQUIRED - DO NOT DELETE)

BITS ID : [2025ab05083]

Name    : [SUJOY SAHA]

Email   : [2025ab05083@wilp.bits-pilani.ac.in]

Date    : [15-02-2026]

In [2]:
# Import Required Libraries

import os
import pickle

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### 1.1 Dataset Selection and Loading

Load the Breast Cancer Wisconsin dataset using scikit-learn's `load_breast_cancer`.

In [3]:
# Reference URL for the dataset. It is not read by the code in this cell;
# the data is obtained from scikit-learn's loader below.
data_source_public = "https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data"

# Loader result: gives access to the feature matrix, the feature names,
# the target vector and the class names used below.
data_bc = load_breast_cancer()

# Features as a DataFrame with named columns; targets exactly as returned
# by the loader.
X = pd.DataFrame(data=data_bc["data"], columns=data_bc["feature_names"])
y = data_bc["target"]

# Show the class names that accompany the target vector.
print(data_bc["target_names"])

['malignant' 'benign']


### 1.2 Train-Test Split and Feature Scaling

In [4]:
# ----------------------------
# Train / test split
# ----------------------------
# 20% of the rows are held out for testing. Stratifying on y and fixing the
# seed are requested explicitly so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ----------------------------
# Feature scaling
# ----------------------------
# The scaler is fitted on the training partition only; the same fitted
# scaler is then applied to both partitions, so no test-set statistics
# are used when fitting it.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)


### 1.3 Define the Models

In [5]:
# ----------------------------
# Models
# ----------------------------
# Seed passed to every estimator whose training involves randomness, so that
# a fresh "Restart & Run All" reproduces the same metrics. The value matches
# the seed used for the train/test split.
RANDOM_STATE = 42

# Display name -> unfitted estimator. The training cell fits each one in
# place and evaluates it on the held-out test set.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree"      : DecisionTreeClassifier(random_state=RANDOM_STATE),
    "KNN"                : KNeighborsClassifier(),
    "Naive Bayes"        : GaussianNB(),
    "Random Forest"      : RandomForestClassifier(random_state=RANDOM_STATE),
    "XGBoost"            : XGBClassifier(eval_metric="logloss", random_state=RANDOM_STATE)
}

# Display name -> dict of metric name -> score; filled in by the training cell.
results = {}

### 2.1 Train and evaluate the models

In [6]:
# ----------------------------
# Train & Evaluate
# ----------------------------
def _compute_metrics(y_true, y_hat, y_score):
    """Return the six evaluation metrics for one model as a dict.

    Parameters
    ----------
    y_true  : true labels of the evaluation set.
    y_hat   : predicted class labels (output of ``predict``).
    y_score : scores used for AUC only (the caller passes column 1 of
              ``predict_proba``).

    Returns
    -------
    dict mapping metric name to its score, in display order.
    """
    metrics = {}
    metrics["Accuracy"] = accuracy_score(y_true, y_hat)
    metrics["AUC"] = roc_auc_score(y_true, y_score)
    metrics["Precision"] = precision_score(y_true, y_hat)
    metrics["Recall"] = recall_score(y_true, y_hat)
    metrics["F1"] = f1_score(y_true, y_hat)
    metrics["MCC"] = matthews_corrcoef(y_true, y_hat)
    return metrics


# Fit every model on the training partition, then score it on the test
# partition. Hard predictions feed all metrics except AUC, which uses the
# second probability column.
for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    results[name] = _compute_metrics(y_test, y_pred, y_prob)

## Save the trained models and scaler

In [9]:
# ----------------------------
# Save models & scaler
# ----------------------------
# Create the output directory first: on a fresh checkout "model/" may not
# exist yet, and open(..., "wb") would then raise FileNotFoundError.
os.makedirs("model", exist_ok=True)

# All fitted estimators are pickled together as one dict keyed by display name.
with open("model/saved_models.pkl", "wb") as f:
    pickle.dump(models, f)

# The fitted scaler is pickled separately so it can be loaded on its own.
with open("model/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Metrics table: one row per model, one column per metric.
print(pd.DataFrame(results).T)

                     Accuracy       AUC  Precision    Recall        F1  \
Logistic Regression  0.982456  0.995370   0.986111  0.986111  0.986111   
Decision Tree        0.912281  0.915675   0.955882  0.902778  0.928571   
KNN                  0.956140  0.978836   0.958904  0.972222  0.965517   
Naive Bayes          0.929825  0.986772   0.944444  0.944444  0.944444   
Random Forest        0.956140  0.992063   0.958904  0.972222  0.965517   
XGBoost              0.956140  0.990079   0.946667  0.986111  0.965986   

                          MCC  
Logistic Regression  0.962302  
Decision Tree        0.817412  
KNN                  0.905447  
Naive Bayes          0.849206  
Random Forest        0.905447  
XGBoost              0.905824  


In [10]:
"""
Observation & Analysis:

Because the dataset is small and nearly linearly separable, complex ensemble models do not gain a significant advantage
over Logistic Regression in such cases.
"""

'\nObservation & Analysis:\n\nBecause the dataset is small and nearly linearly separable, complex ensemble models do not gain a significant advantage\nover Logistic Regression in such cases.\n'