In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import joblib


Load Dataset

In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()

df = pd.DataFrame(data.data, columns=data.feature_names)

# take only 10 rows as test data
df.head(10).to_csv("test_data.csv", index=False)


X = data.data
y = data.target

Train Test Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Evaluation Function

In [4]:
def evaluate(model):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }
    return metrics


Train Models

Logistic Regression

In [5]:
lr = LogisticRegression(max_iter=4000)
lr.fit(X_train, y_train)
lr_metrics = evaluate(lr)

Decision Tree

In [6]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_metrics = evaluate(dt)

KNN

In [7]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_metrics = evaluate(knn)

Naive Bayes

In [8]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_metrics = evaluate(nb)

Random Forest

In [9]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_metrics = evaluate(rf)

XGBoost

In [10]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_metrics = evaluate(xgb)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Save Models

In [11]:
import os

os.makedirs("model", exist_ok=True)

In [12]:
joblib.dump(lr, "model/logistic.pkl")
joblib.dump(dt, "model/dt.pkl")
joblib.dump(knn, "model/knn.pkl")
joblib.dump(nb, "model/nb.pkl")
joblib.dump(rf, "model/rf.pkl")
joblib.dump(xgb, "model/xgb.pkl")

['model/xgb.pkl']

Comparison Table

In [13]:
results = pd.DataFrame([
    ["Logistic", *lr_metrics.values()],
    ["DecisionTree", *dt_metrics.values()],
    ["KNN", *knn_metrics.values()],
    ["NaiveBayes", *nb_metrics.values()],
    ["RandomForest", *rf_metrics.values()],
    ["XGBoost", *xgb_metrics.values()],
],
columns=["Model","Accuracy","Precision","Recall","F1","AUC","MCC"])

results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,AUC,MCC
0,Logistic,0.95614,0.945946,0.985915,0.965517,0.997707,0.906811
1,DecisionTree,0.938596,0.944444,0.957746,0.951049,0.932362,0.86886
2,KNN,0.95614,0.934211,1.0,0.965986,0.995906,0.908615
3,NaiveBayes,0.973684,0.959459,1.0,0.97931,0.998362,0.944733
4,RandomForest,0.964912,0.958904,0.985915,0.972222,0.994432,0.925285
5,XGBoost,0.95614,0.958333,0.971831,0.965035,0.990829,0.906379


In [14]:
import json

all_metrics = {
    "Logistic Regression": lr_metrics,
    "Decision Tree": dt_metrics,
    "KNN": knn_metrics,
    "Naive Bayes": nb_metrics,
    "Random Forest": rf_metrics,
    "XGBoost": xgb_metrics
}

with open("metrics.json", "w") as f:
    json.dump(all_metrics, f)
