In [1]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 8.4 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# day_3.py: LightGBM Training and Comparison with XGBoost on Titanic Dataset

# Install required packages (run once in terminal or environment)
# pip install lightgbm optuna
# pip install --upgrade pandas scikit-learn matplotlib seaborn

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import lightgbm as lgb
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Verify LightGBM installation
import lightgbm

# === 1. Preprocess Data for LightGBM ===

# Load data
data = pd.read_csv(r'C:\Users\owner\OneDrive\Desktop\train (2).csv')

# Handle missing values
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data = data.drop('Cabin', axis=1)

# Create features
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Dr', 'Rev', 'Col', 'Major', 'Lady', 'Sir', 'Don', 'Capt', 'Countess', 'Jonkheer'], 'Rare')
data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')
data['AgeBin'] = pd.cut(data['Age'], bins=[0, 12, 18, 30, 50, 100], labels=[0, 1, 2, 3, 4]).astype(int)

# Encode Sex as binary
data['Sex'] = data['Sex'].map({'male': 1, 'female': 0})

# Drop irrelevant columns
data = data.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

# Split features and target
X = data.drop('Survived', axis=1)
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Specify categorical columns for LightGBM
categorical_features = ['Embarked', 'Title']
for col in categorical_features:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# # Visualize categorical features
# plt.figure(figsize=(10, 5))
# plt.subplot(1, 2, 1)
# sns.countplot(x='Title', hue='Survived', data=data)
# plt.title('Survival by Title')
# plt.subplot(1, 2, 2)
# sns.countplot(x='Embarked', hue='Survived', data=data)
# plt.title('Survival by Embarked')
# plt.tight_layout()
# plt.show()


# Define Optuna objective function
def objective(trial):
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'random_state': 42
    }
    model = lgb.LGBMClassifier(**param)
    model.fit(X_train, y_train, categorical_feature=categorical_features)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Run Optuna tuning
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)
print('Best Parameters:', study.best_params)
print('Best Accuracy:', study.best_value)

# Train final LightGBM model
lgb_model = lgb.LGBMClassifier(**study.best_params)
lgb_model.fit(X_train, y_train, categorical_feature=categorical_features)
y_pred_lgb = lgb_model.predict(X_test)
lgb_accuracy = accuracy_score(y_test, y_pred_lgb)
print(f'LightGBM Accuracy:', lgb_accuracy)
print('LightGBM Classification Report:\n', classification_report(y_test, y_pred_lgb))


  data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
[I 2025-05-09 15:43:48,055] A new study created in memory with name: no-name-66e05e3a-cd21-4501-a967-67cba0f76993


[I 2025-05-09 15:43:48,131] Trial 0 finished with value: 0.8044692737430168 and parameters: {'learning_rate': 0.27790230411243977, 'max_depth': 5, 'n_estimators': 479, 'subsample': 0.993012907907125, 'colsample_bytree': 0.6073670446101467}. Best is trial 0 with value: 0.8044692737430168.


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info

[I 2025-05-09 15:43:48,262] Trial 1 finished with value: 0.8156424581005587 and parameters: {'learning_rate': 0.10001518549718418, 'max_depth': 5, 'n_estimators': 936, 'subsample': 0.6587529985020569, 'colsample_bytree': 0.7572385420145493}. Best is trial 1 with value: 0.8156424581005587.
[I 2025-05-09 15:43:48,430] Trial 2 finished with value: 0.8324022346368715 and parameters: {'learning_rate': 0.21272541478082735, 'max_depth': 7, 'n_estimators': 789, 'subsample': 0.7955318817781503, 'colsample_bytree': 0.7035789584896084}. Best is trial 2 with value: 0.8324022346368715.


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostF

[I 2025-05-09 15:43:48,525] Trial 3 finished with value: 0.8156424581005587 and parameters: {'learning_rate': 0.15306497680474515, 'max_depth': 10, 'n_estimators': 377, 'subsample': 0.6676292748923616, 'colsample_bytree': 0.6846334052275247}. Best is trial 2 with value: 0.8324022346368715.
[I 2025-05-09 15:43:48,634] Trial 4 finished with value: 0.7988826815642458 and parameters: {'learning_rate': 0.2709033769586697, 'max_depth': 9, 'n_estimators': 458, 'subsample': 0.9906098188285378, 'colsample_bytree': 0.531408117972216}. Best is trial 2 with value: 0.8324022346368715.


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostF

[I 2025-05-09 15:43:48,704] Trial 5 finished with value: 0.8379888268156425 and parameters: {'learning_rate': 0.06629398610905121, 'max_depth': 9, 'n_estimators': 275, 'subsample': 0.8854708715227262, 'colsample_bytree': 0.6104075380125478}. Best is trial 5 with value: 0.8379888268156425.
[I 2025-05-09 15:43:48,777] Trial 6 finished with value: 0.8156424581005587 and parameters: {'learning_rate': 0.109425110772709, 'max_depth': 3, 'n_estimators': 764, 'subsample': 0.5943656824438721, 'colsample_bytree': 0.9462489324692478}. Best is trial 5 with value: 0.8379888268156425.
[I 2025-05-09 15:43:48,805] Trial 7 finished with value: 0.8324022346368715 and parameters: {'learning_rate': 0.15252030726100338, 'max_depth': 8, 'n_estimators': 104, 'subsample': 0.7725886391242263, 'colsample_bytree': 0.8846640395319735}. Best is trial 5 with value: 0.8379888268156425.


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostF

[I 2025-05-09 15:43:48,952] Trial 8 finished with value: 0.8100558659217877 and parameters: {'learning_rate': 0.12423856223525562, 'max_depth': 8, 'n_estimators': 679, 'subsample': 0.7323400182338369, 'colsample_bytree': 0.7017726345819593}. Best is trial 5 with value: 0.8379888268156425.


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


[I 2025-05-09 15:43:49,110] Trial 9 finished with value: 0.8324022346368715 and parameters: {'learning_rate': 0.039192121943896756, 'max_depth': 7, 'n_estimators': 809, 'subsample': 0.9019628262751643, 'colsample_bytree': 0.8117671134444161}. Best is trial 5 with value: 0.8379888268156425.
[I 2025-05-09 15:43:49,196] Trial 10 finished with value: 0.8324022346368715 and parameters: {'learning_rate': 0.03865895491880185, 'max_depth': 10, 'n_estimators': 206, 'subsample': 0.5001932798125475, 'colsample_bytree': 0.516809123746055}. Best is trial 5 with value: 0.8379888268156425.
[I 2025-05-09 15:43:49,270] Trial 11 finished with value: 0.8268156424581006 and parameters: {'learning_rate': 0.22825399455568593, 'max_depth': 6, 'n_estimators': 294, 'subsample': 0.8402699258030171, 'colsample_bytree': 0.6259655635784646}. Best is trial 5 with value: 0.8379888268156425.


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostF

[I 2025-05-09 15:43:49,462] Trial 12 finished with value: 0.8212290502793296 and parameters: {'learning_rate': 0.2193929261497297, 'max_depth': 8, 'n_estimators': 616, 'subsample': 0.8588604049376785, 'colsample_bytree': 0.6200003170476013}. Best is trial 5 with value: 0.8379888268156425.


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


[I 2025-05-09 15:43:49,668] Trial 13 finished with value: 0.8100558659217877 and parameters: {'learning_rate': 0.197216865714244, 'max_depth': 7, 'n_estimators': 989, 'subsample': 0.9079404260007852, 'colsample_bytree': 0.8067250893931078}. Best is trial 5 with value: 0.8379888268156425.


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


[I 2025-05-09 15:43:49,822] Trial 14 finished with value: 0.8324022346368715 and parameters: {'learning_rate': 0.01971005089324212, 'max_depth': 9, 'n_estimators': 602, 'subsample': 0.7857871262396403, 'colsample_bytree': 0.6975183558515934}. Best is trial 5 with value: 0.8379888268156425.


Best Parameters: {'learning_rate': 0.06629398610905121, 'max_depth': 9, 'n_estimators': 275, 'subsample': 0.8854708715227262, 'colsample_bytree': 0.6104075380125478}
Best Accuracy: 0.8379888268156425
[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
LightGBM Accuracy: 0.8324022346368715
LightGBM Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86       105
           1       0.80      0.80      0.80        74

    accuracy  