In [1]:
# !pip install xgboost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import  train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve,roc_auc_score, auc, precision_score, recall_score, f1_score,matthews_corrcoef
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import pickle


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dataset : Breast Cancer Wisconsin (Diagnostic) Data Set
# Dataset URL : https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
ds = pd.read_csv("data.csv")
print(ds.info())

# Rows, cols
print("Records ",ds.shape[0], " Cols ",ds.shape[1])

# remove empty contents 
ds = ds.drop(['id','Unnamed: 32'],axis=1)
ds.isna().sum()
ds.dropna()

# Features and target
X = ds.drop(columns=["diagnosis"])

y = ds["diagnosis"]

# Encode labels
le = LabelEncoder()
y = pd.Series(le.fit_transform(y), name="diagnosis") 

# Split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y )

# Combine test features and labels
test_df = pd.concat([X_test, y_test], axis=1)

# Save test data to file
test_df.to_csv("test_data.csv", index=False)

# Dictionary result will be stored. This result shall be used in streamlit
result = { }

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)   

def do_evaluation_metrics(y_test, y_pred):
    # Dictionary 
    metrics = {}
   
    # Accuracy
    metrics["accuracy"] = accuracy_score(y_test,y_pred)
    # AUC Score
    metrics["auc_score"] =  roc_auc_score(y_test,y_pred)
    # Precision
    metrics["precision"] = precision_score(y_test,y_pred, average='binary')
    # Recall
    metrics["recall"] = recall_score(y_test,y_pred, average='binary')
    #f1 score
    metrics["f1_score_result"] = f1_score(y_test,y_pred)
    # MCC Score
    metrics["mcc_score"] = matthews_corrcoef(y_test, y_pred)
    
    return metrics

def do_print_metrics(dict):
    print(f"    Accuracy  |     AUC     |    Precision  |    Recall  |    F1    |    MCC   | ")
    print(f"{dict["accuracy"]:.4f}    |   {dict["auc_score"]:.4f} | {dict["precision"]:.4f}  | {dict["recall"]:.4f} | {dict["f1_score_result"]:.4f} |  {dict["mcc_score"]:.4f}   |")
    pass  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [3]:
def build_logistics_regression_model(X_train_scaled,y_train,X_test,y_test):
    # Input params : 
    #  X_train and y_train - Train Data
    #  X_test and y_test - Test data
    
    # Model
    lg = LogisticRegression(max_iter=10000, class_weight='balanced')
    lg.fit(X_train_scaled, y_train)
    
    # Predict
    y_pred = lg.predict(X_test_scaled)

    result = do_evaluation_metrics(y_test, y_pred) 
    result["model"] = lg
    result['name'] = "Logistic Regression"
    result['y_test'] = y_test
    result['y_pred'] = y_pred
    
    pickle.dump(result, open("logistic_model.pkl", "wb"))
    # Evaluation metrics
    return result

def build_decision_tree_classifier_model(X_train_scaled,y_train,X_test,y_test):
    # Input params : 
    #  X_train and y_train - Train Data
    #  X_test and y_test - Test data

    # Define parameter grid
    param_grid = {
        'max_depth': [3,5,7,10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1,2, 4,8]
    }

    model = DecisionTreeClassifier(  max_depth=None, random_state=42 )
    # GridSearch
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='f1',
        n_jobs=-1
    )

    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    accuracy = best_model.score(X_test, y_test)
    # Predict    
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    # Evaluation metrics
    result  = do_evaluation_metrics(y_test, y_pred)
    result['accuracy'] = accuracy
    result["auc_score"] =  roc_auc_score(y_test, y_prob)
    result['model'] = best_model
    result['name'] = "Decision Tree"

    result['y_test'] = y_test
    result['y_pred'] = y_pred
    
    # print(result)
    pickle.dump(result, open("decision_tree_model.pkl", "wb"))
    return result
    
def build_KNN_classifier_model(X_train_scaled,y_train,X_test,y_test):
    # Input params : 
    #  X_train and y_train - Train Data
    #  X_test and y_test - Test data

    # Train model
    model = KNeighborsClassifier(n_neighbors=3)    
    model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = model.predict(X_test_scaled)
  
    # Evaluation metrics
    dict =  do_evaluation_metrics(y_test, y_pred)
    dict['name'] ="KNN"
    dict['model'] = model
    dict['y_test'] = y_test
    dict['y_pred'] = y_pred
    

    pickle.dump(dict, open("KNN_classifier.pkl", "wb"))
    
    return dict;


In [4]:

def build_NBGaussion_model(X_train_scaled,y_train,X_test,y_test):
    # Input params : 
    #  X_train and y_train - Train Data
    #  X_test and y_test - Test data

    # Train model
    model = GaussianNB()    
    model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = model.predict(X_test_scaled)
    
    # Evaluation metrics
    dict = do_evaluation_metrics(y_test, y_pred)
    dict['name'] = "Naive Bayes"
    dict['model'] = model
    dict['y_test'] = y_test
    dict['y_pred'] = y_pred
    
    pickle.dump(dict, open("Naive_bayes_Gaussian_model.pkl", "wb"))
    
    return dict

def build_random_forest_classifier_model(X_train_scaled,y_train,X_test,y_test):
    # Input params : 
    #  X_train and y_train - Train Data
    #  X_test and y_test - Test data

    # Define parameter grid
    params_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5]
    }
    
    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42,class_weight='balanced')
    # GridSearch
    grid = GridSearchCV(
        estimator=model,
        param_grid=params_grid,
        cv=5,
        scoring='recall',
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    
    best_model = grid.best_estimator_
    accuracy = best_model.score(X_test, y_test)
    
    # Predict    
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    # Evaluation metrics
    dict  = do_evaluation_metrics(y_test, y_pred)
    dict['accuracy'] = accuracy
    dict['model'] = best_model
    dict["auc_score"] =  roc_auc_score(y_test, y_prob)
    dict['name'] = "Random Forest"
    dict['y_test'] = y_test
    dict['y_pred'] = y_pred
    
    # For streamlit app, export this model 
    pickle.dump(dict, open("random_forest_model.pkl", "wb"))
    
    return dict

def build_XGBoost_model(X_train_scaled,y_train,X_test,y_test):
    # Input params : 
    #  X_train and y_train - Train Data
    #  X_test and y_test - Test data

    # Train model
    model = XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        random_state=42,        
        eval_metric='logloss'
    )
    
    model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = model.predict(X_test_scaled)

    
    # Evaluation metrics
    dict = do_evaluation_metrics(y_test, y_pred)   
    dict['name'] = "XGBoost Classifier"
    dict['model'] = model

    dict['y_test'] = y_test
    dict['y_pred'] = y_pred
    
    # For streamlit app, export this model 
    pickle.dump(dict, open("XGB_classifier_model.pkl", "wb"))
  
    return dict


In [5]:
# Logistic Regression
dict_lg_score = build_logistics_regression_model(X_train_scaled, y_train, X_test, y_test)
result['LG'] = dict_lg_score

# Decision Tree
dict_dt_score = build_decision_tree_classifier_model(X_train_scaled, y_train, X_test, y_test)
result['DT'] = dict_dt_score


# print("KNN Classifier Model ")
# KNN Classifier
dict_knn_score = build_KNN_classifier_model(X_train_scaled, y_train, X_test, y_test)
result['KNN'] = dict_knn_score

# Naive Bayes Gaussian Classifier
dict_nbg_score = build_NBGaussion_model(X_train_scaled, y_train, X_test, y_test)
result['NBG'] = dict_nbg_score

# Random Forest
dict_rf_score = build_random_forest_classifier_model(X_train_scaled, y_train, X_test, y_test)
result['RF'] = dict_rf_score

# XGBoost
dict_xgb_score = build_XGBoost_model(X_train_scaled, y_train, X_test, y_test)
result['XGB'] = dict_xgb_score

print ("done")

done


In [6]:
from tabulate import tabulate
print(tabulate(
    [
        ['Logistics Regression',dict_lg_score["accuracy"],dict_lg_score["auc_score"],dict_lg_score["precision"], dict_lg_score["recall"],dict_lg_score["f1_score_result"],dict_lg_score["mcc_score"]], 
        ['Decision Tree',dict_dt_score["accuracy"],dict_dt_score["auc_score"],dict_dt_score["precision"], dict_dt_score["recall"],dict_dt_score["f1_score_result"],dict_dt_score["mcc_score"]],
        ['KNN',dict_knn_score["accuracy"],dict_knn_score["auc_score"],dict_knn_score["precision"], dict_knn_score["recall"],dict_knn_score["f1_score_result"],dict_knn_score["mcc_score"]],
        ['Naive Bayes',dict_nbg_score["accuracy"],dict_nbg_score["auc_score"],dict_nbg_score["precision"], dict_nbg_score["recall"],dict_nbg_score["f1_score_result"],dict_nbg_score["mcc_score"]],
        ['Random Forest',dict_rf_score["accuracy"],dict_rf_score["auc_score"],dict_rf_score["precision"], dict_rf_score["recall"],dict_rf_score["f1_score_result"],dict_rf_score["mcc_score"]],
        ['XGBoost',dict_xgb_score["accuracy"],dict_xgb_score["auc_score"],dict_xgb_score["precision"], dict_xgb_score["recall"],dict_xgb_score["f1_score_result"],dict_xgb_score["mcc_score"]]
    ],
    headers=['ML Model Name', 'Accuracy','AUC','Precision','Recall','F1','MCC']
))

ML Model Name           Accuracy       AUC    Precision    Recall        F1       MCC
--------------------  ----------  --------  -----------  --------  --------  --------
Logistics Regression    0.973684  0.969246     0.97561   0.952381  0.963855  0.94334
Decision Tree           0.938596  0.922454     0.948718  0.880952  0.91358   0.867493
KNN                     0.938596  0.921627     0.972973  0.857143  0.911392  0.868766
Naive Bayes             0.921053  0.907738     0.923077  0.857143  0.888889  0.829162
Random Forest           0.973684  0.997354     1         0.928571  0.962963  0.944155
XGBoost                 0.964912  0.952381     1         0.904762  0.95      0.92582
