In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss, roc_auc_score
from sklearn.metrics import recall_score, f1_score, precision_score, matthews_corrcoef, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV, Perceptron, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
import category_encoders as ce
import warnings
warnings.filterwarnings("ignore")

In [7]:
# Load and preprocess the dataset
titanic = pd.read_csv("./sample_data/train.csv")
titanic = titanic.drop(["PassengerId", "Name", "Cabin", "Ticket"], axis=1)
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].mean())
titanic["Embarked"] = titanic["Embarked"].fillna(titanic["Embarked"].mode()[0])
X = titanic.drop(["Survived"], axis=1)
y = titanic["Survived"]
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=21)
# Encode categorical variables
encoder = ce.OrdinalEncoder(["Sex", "Embarked"])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
# Define models
models = {
"Logistic Regression": LogisticRegression(),
"Random Forest": RandomForestClassifier(criterion="entropy", n_estimators=100),
"LightGBM": lgb.LGBMClassifier(),
"Ridge Classifier CV": RidgeClassifierCV(),
"XGBoost": XGBClassifier(),
"Nearest Centroid": NearestCentroid(),
"Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
"Calibrated Classifier CV": CalibratedClassifierCV(),
"Bernoulli NB": BernoulliNB(),
"Bagging Classifier": BaggingClassifier(),
"SVC": SVC(),
"Linear SVC": LinearSVC(),
"KNeighbors Classifier": KNeighborsClassifier(),
"Gaussian NB": GaussianNB(),
"Perceptron": Perceptron(),
"SGD Classifier": SGDClassifier(),
"Decision Tree": DecisionTreeClassifier(),
"MLP Classifier": MLPClassifier(),
"Extra Trees": ExtraTreesClassifier(),
"AdaBoost": AdaBoostClassifier(),
"Nu SVC": NuSVC(),
"Gaussian Process": GaussianProcessClassifier(kernel=RBF()),
"Ridge Classifier": RidgeClassifier(),
"Passive Aggressive": PassiveAggressiveClassifier(),
"Hist Gradient Boosting": HistGradientBoostingClassifier()
}

In [11]:
# Train and predict
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Calculate metrics
    CM = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = CM.ravel()
    specificity = TN / (TN + FP)
    loss_log = log_loss(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mathew = matthews_corrcoef(y_test, y_pred)
    results.append([name, acc, balanced_acc, prec, rec, specificity, f1, roc, loss_log, mathew])
    model_results = pd.DataFrame(results, columns=["Model", "Accuracy", "Balanced Accuracy", "Precision", "Recall", "Sensitivity", "F1", "ROC", "Log Loss", "Mathew"])
    model_results = model_results.sort_values("F1", ascending=False).reset_index(drop=True)

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


In [12]:
model_results

Unnamed: 0,Model,Accuracy,Balanced Accuracy,Precision,Recall,Sensitivity,F1,ROC,Log Loss,Mathew
0,Random Forest,0.837989,0.825997,0.835821,0.756757,0.895238,0.794326,0.825997,5.839475,0.66345
1,XGBoost,0.832402,0.821236,0.823529,0.756757,0.885714,0.788732,0.821236,6.040836,0.651851
2,Extra Trees,0.826816,0.818468,0.802817,0.77027,0.866667,0.786207,0.818468,6.242197,0.641159
3,MLP Classifier,0.821229,0.809717,0.808824,0.743243,0.87619,0.774648,0.809717,6.443558,0.628477
4,LightGBM,0.821229,0.807722,0.818182,0.72973,0.885714,0.771429,0.807722,6.443558,0.628185
5,Gaussian NB,0.815642,0.80296,0.80597,0.72973,0.87619,0.765957,0.80296,6.644919,0.616566
6,Linear SVC,0.815642,0.800965,0.815385,0.716216,0.885714,0.76259,0.800965,6.644919,0.616379
7,Calibrated Classifier CV,0.815642,0.800965,0.815385,0.716216,0.885714,0.76259,0.800965,6.644919,0.616379
8,Hist Gradient Boosting,0.810056,0.798198,0.794118,0.72973,0.866667,0.760563,0.798198,6.846281,0.605103
9,Logistic Regression,0.810056,0.794208,0.8125,0.702703,0.885714,0.753623,0.794208,6.846281,0.604584
