In [None]:
# 1) Install Kaggle API
!pip install -q kaggle

# 2) Upload your Kaggle API key file (kaggle.json)
from google.colab import files
print("Upload your kaggle.json file below:")
files.upload()  # Choose the kaggle.json file when prompted

# 3) Create Kaggle folder & move your API key there
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 4) Download the Breast Cancer Wisconsin dataset
!kaggle datasets download -d uciml/breast-cancer-wisconsin-data

# 5) Unzip the downloaded dataset
!unzip -q breast-cancer-wisconsin-data.zip -d breast_cancer_data

# 6) List the contents to verify
!ls breast_cancer_data


Upload your kaggle.json file below:


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
License(s): CC-BY-NC-SA-4.0
Downloading breast-cancer-wisconsin-data.zip to /content
  0% 0.00/48.6k [00:00<?, ?B/s]
100% 48.6k/48.6k [00:00<00:00, 118MB/s]
data.csv


In [None]:
import pandas as pd

# Read the extracted Kaggle CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/content/breast_cancer_data/data.csv')
df.head(n=5)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    log_loss,
    matthews_corrcoef,
    cohen_kappa_score,
    classification_report
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [None]:
# Build the modelling frame from the raw CSV in one chained, non-mutating
# pipeline. Re-reading the file here keeps the cell idempotent on re-run.
df = (
    pd.read_csv('/content/breast_cancer_data/data.csv')
    # Neither 'id' nor 'Unnamed: 32' is used as a feature
    # ('Unnamed: 32' shows no values in the preview rows above).
    .drop(columns=['id', 'Unnamed: 32'])
    # Encode the target: 'M' -> 1, 'B' -> 0.
    .assign(diagnosis=lambda frame: frame['diagnosis'].map({'M': 1, 'B': 0}))
)

# Series.map turns any label missing from the mapping into NaN; fail fast with
# a clear message instead of letting NaN targets reach the split/fit steps.
if df['diagnosis'].isna().any():
    raise ValueError("Unexpected value in 'diagnosis'; expected only 'M' or 'B'.")

# Feature matrix (every column except the target) and target vector.
X = df.drop(columns='diagnosis')
y = df['diagnosis']


In [None]:
# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Candidate classifiers keyed by the display name used in the reports below.
# Insertion order is the order in which they are trained and printed.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=200, random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
])

In [None]:
# Train every model in `models` on the training split, score it on the test
# split, print a per-model report, and collect the metrics in
# `comparison_results` (model name -> {metric name -> value}).
comparison_results = {}

for name, model in models.items():
    model.fit(X_train, y_train)

    # Hard class predictions, plus the predicted probability of class 1
    # (second column of predict_proba) for the threshold-free metrics.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Pin the label order so the matrix is always 2x2 ([[tn, fp], [fn, tp]]),
    # even if one class is absent from y_test/y_pred; otherwise the 4-way
    # unpack below would fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Metrics computed from hard predictions.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    specificity = tn / (tn + fp)  # true-negative rate, from the confusion matrix
    f1 = f1_score(y_test, y_pred)

    # Metrics computed from predicted probabilities.
    roc_auc = roc_auc_score(y_test, y_prob)
    pr_auc = average_precision_score(y_test, y_prob)
    logloss = log_loss(y_test, y_prob)

    # Agreement-style metrics computed from hard predictions.
    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)

    comparison_results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "Specificity": specificity,
        "F1 Score": f1,
        "ROC-AUC": roc_auc,
        "PR-AUC": pr_auc,
        "Log Loss": logloss,
        "MCC": mcc,
        "Cohen's Kappa": kappa
    }

    print("=" * 80)
    print(f"MODEL : {name}")
    print("=" * 80)

    print(f"Accuracy               : {accuracy:.4f}")
    print(f"Precision (Malignant)  : {precision:.4f}")
    print(f"Recall / Sensitivity   : {recall:.4f}")
    print(f"Specificity            : {specificity:.4f}")
    print(f"F1 Score               : {f1:.4f}")

    print(f"ROC-AUC                : {roc_auc:.4f}")
    print(f"PR-AUC                 : {pr_auc:.4f}")
    print(f"Log Loss               : {logloss:.4f}")

    print(f"MCC                    : {mcc:.4f}")
    print(f"Cohen’s Kappa          : {kappa:.4f}")

    print("\nConfusion Matrix:")
    print(cm)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\n\n")

MODEL : Logistic Regression
Accuracy               : 0.9649
Precision (Malignant)  : 0.9750
Recall / Sensitivity   : 0.9286
Specificity            : 0.9861
F1 Score               : 0.9512
ROC-AUC                : 0.9960
PR-AUC                 : 0.9943
Log Loss               : 0.0773
MCC                    : 0.9245
Cohen’s Kappa          : 0.9238

Confusion Matrix:
[[71  1]
 [ 3 39]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        72
           1       0.97      0.93      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114




MODEL : Decision Tree
Accuracy               : 0.9298
Precision (Malignant)  : 0.9048
Recall / Sensitivity   : 0.9048
Specificity            : 0.9444
F1 Score               : 0.9048
ROC-AUC                : 0.9246
PR-AUC                 : 0.8537
Log Loss      

In [None]:
# One row per model, one column per metric, rounded to four decimals for display.
comparison_df = pd.DataFrame(comparison_results).transpose().round(4)

comparison_df

Unnamed: 0,Accuracy,Precision,Recall,Specificity,F1 Score,ROC-AUC,PR-AUC,Log Loss,MCC,Cohen's Kappa
Logistic Regression,0.9649,0.975,0.9286,0.9861,0.9512,0.996,0.9943,0.0773,0.9245,0.9238
Decision Tree,0.9298,0.9048,0.9048,0.9444,0.9048,0.9246,0.8537,2.5294,0.8492,0.8492
Random Forest,0.9649,1.0,0.9048,1.0,0.95,0.9942,0.9911,0.1154,0.9258,0.9231
Gradient Boosting,0.9649,1.0,0.9048,1.0,0.95,0.9947,0.992,0.1279,0.9258,0.9231
