In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv")
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=5),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
}

# Train and evaluate each classifier
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    results[name] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "conf_matrix": conf_matrix,
        "class_report": class_report
    }

# Print results
for name, metrics in results.items():
    print(f"Classifier: {name}")
    print(f"Accuracy: {metrics['accuracy']:.2f}")
    print(f"Precision: {metrics['precision']:.2f}")
    print(f"Recall: {metrics['recall']:.2f}")
    print(f"F1 Score: {metrics['f1']:.2f}")
    print("Confusion Matrix:")
    print(metrics['conf_matrix'])
    print("Classification Report:")
    print(metrics['class_report'])
    print("\n")

from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import pandas as pd

# Add new classifiers to the list
classifiers.update({
    "Gradient Boosting Machine": GradientBoostingClassifier(),
    "LightGBM": lgb.LGBMClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(verbose=0)
})

# Train and evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    results[name] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "conf_matrix": conf_matrix,
        "class_report": class_report
    }

# Create a DataFrame to display the results
metrics_df = pd.DataFrame(results).T[['accuracy', 'precision', 'recall', 'f1']]
print(metrics_df)

Classifier: Logistic Regression
Accuracy: 0.81
Precision: 0.79
Recall: 0.67
F1 Score: 0.72
Confusion Matrix:
[[87 10]
 [19 38]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86        97
           1       0.79      0.67      0.72        57

    accuracy                           0.81       154
   macro avg       0.81      0.78      0.79       154
weighted avg       0.81      0.81      0.81       154



Classifier: Support Vector Machine
Accuracy: 0.77
Precision: 0.76
Recall: 0.56
F1 Score: 0.65
Confusion Matrix:
[[87 10]
 [25 32]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.90      0.83        97
           1       0.76      0.56      0.65        57

    accuracy                           0.77       154
   macro avg       0.77      0.73      0.74       154
weighted avg       0.77      0.77      0.76       154



Classifier: K-Nearest Neighbors
Accur

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Number of positive: 210, number of negative: 403
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 613, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.342577 -> initscore=-0.651829
[LightGBM] [Info] Start training from score -0.651829


Parameters: { "use_label_encoder" } are not used.



                           accuracy precision    recall        f1
Logistic Regression        0.811688  0.791667  0.666667   0.72381
Support Vector Machine     0.772727  0.761905  0.561404  0.646465
K-Nearest Neighbors        0.746753  0.655172  0.666667   0.66087
Random Forest              0.785714   0.76087  0.614035  0.679612
Neural Network              0.62987       0.0       0.0       0.0
Gradient Boosting Machine  0.785714  0.722222  0.684211  0.702703
LightGBM                   0.792208  0.698413   0.77193  0.733333
XGBoost                    0.792208  0.692308  0.789474  0.737705
CatBoost                   0.772727  0.689655  0.701754  0.695652


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv")
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=5),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000),
    "Gradient Boosting Machine": GradientBoostingClassifier(),
    "LightGBM": lgb.LGBMClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(verbose=0)
}

# Train and evaluate each classifier
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    results[name] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "conf_matrix": conf_matrix,
        "class_report": class_report
    }

# Create a DataFrame to display the results
metrics_df = pd.DataFrame(results).T[['accuracy', 'precision', 'recall', 'f1']]
print(metrics_df)

[LightGBM] [Info] Number of positive: 210, number of negative: 403
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 613, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.342577 -> initscore=-0.651829
[LightGBM] [Info] Start training from score -0.651829


Parameters: { "use_label_encoder" } are not used.



                           accuracy precision    recall        f1
Logistic Regression        0.811688  0.791667  0.666667   0.72381
Support Vector Machine     0.772727  0.761905  0.561404  0.646465
K-Nearest Neighbors        0.746753  0.655172  0.666667   0.66087
Random Forest              0.798701      0.76  0.666667   0.71028
Neural Network             0.746753  0.640625  0.719298  0.677686
Gradient Boosting Machine  0.779221  0.709091  0.684211  0.696429
LightGBM                   0.792208  0.698413   0.77193  0.733333
XGBoost                    0.792208  0.692308  0.789474  0.737705
CatBoost                   0.772727  0.689655  0.701754  0.695652


In [6]:
import optuna
from sklearn.model_selection import cross_val_score

# Define the objective function for Gradient Boosting Machine
def objective_gbm(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0)
    }
    gbm = GradientBoostingClassifier(**param)
    return cross_val_score(gbm, X_train, y_train, n_jobs=-1, cv=3).mean()

# Define the objective function for CatBoost
def objective_catboost(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-4, 1e-1)
    }
    catboost = CatBoostClassifier(**param, verbose=0)
    return cross_val_score(catboost, X_train, y_train, n_jobs=-1, cv=3).mean()

# Optimize GBM
study_gbm = optuna.create_study(direction='maximize')
study_gbm.optimize(objective_gbm, n_trials=50)
best_gbm_params = study_gbm.best_params

# Optimize CatBoost
study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(objective_catboost, n_trials=50)
best_catboost_params = study_catboost.best_params

# Update classifiers with the best parameters
classifiers["Gradient Boosting Machine"] = GradientBoostingClassifier(**best_gbm_params)
classifiers["CatBoost"] = CatBoostClassifier(**best_catboost_params, verbose=0)

# Train and evaluate each classifier again
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    results[name] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "conf_matrix": conf_matrix,
        "class_report": class_report
    }

# Create a DataFrame to display the results
metrics_df = pd.DataFrame(results).T[['accuracy', 'precision', 'recall', 'f1']]
print(metrics_df)

[I 2025-03-15 16:45:29,772] A new study created in memory with name: no-name-55f9c355-f0a9-47fb-8395-da08999acaa4
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0)
[I 2025-03-15 16:45:34,595] Trial 0 finished with value: 0.717758648174717 and parameters: {'n_estimators': 127, 'max_depth': 8, 'learning_rate': 0.0039234912081604735, 'subsample': 0.5638925633635502}. Best is trial 0 with value: 0.717758648174717.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0)
[I 2025-03-15 16:45:36,404] Trial 1 finished with value: 0.6574206918539773 and parameters: {'n_estimators': 194, 'max_depth': 3, 'learning_rate': 0.0002611243458513176, 'subsample': 0.5569267931882576}. Best is trial 0 with value: 0.717758648174717.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('subsa

[LightGBM] [Info] Number of positive: 210, number of negative: 403
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 613, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.342577 -> initscore=-0.651829
[LightGBM] [Info] Start training from score -0.651829


Parameters: { "use_label_encoder" } are not used.



                           accuracy precision    recall        f1
Logistic Regression        0.811688  0.791667  0.666667   0.72381
Support Vector Machine     0.772727  0.761905  0.561404  0.646465
K-Nearest Neighbors        0.746753  0.655172  0.666667   0.66087
Random Forest              0.792208  0.745098  0.666667  0.703704
Neural Network             0.772727      0.75  0.578947  0.653465
Gradient Boosting Machine  0.785714   0.76087  0.614035  0.679612
LightGBM                   0.792208  0.698413   0.77193  0.733333
XGBoost                    0.792208  0.692308  0.789474  0.737705
CatBoost                   0.792208  0.755102  0.649123  0.698113
