In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load Titanic dataset from seaborn
import seaborn as sns
df = sns.load_dataset("titanic")

# Drop rows with missing target
df = df.dropna(subset=['survived'])

# Select features and target
features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
X = df[features]
y = df['survived']

# Handle missing values simply
X['age'].fillna(X['age'].median(), inplace=True)
X['embarked'].fillna('S', inplace=True)  # most common

# Identify categorical columns
cat_features = ['pclass', 'sex', 'embarked']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['age'].fillna(X['age'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age'].fillna(X['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(val

In [25]:
#X['sex'] = X['sex'].replace({'male':1, 'female':0})
X = pd.get_dummies(X, columns=['embarked'], dtype='int')
X = pd.get_dummies(X, columns=['sex'], dtype='int')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X

Unnamed: 0,pclass,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,sex_female,sex_male
0,3,22.0,1,0,7.2500,0,0,1,0,1
1,1,38.0,1,0,71.2833,1,0,0,1,0
2,3,26.0,0,0,7.9250,0,0,1,1,0
3,1,35.0,1,0,53.1000,0,0,1,1,0
4,3,35.0,0,0,8.0500,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,0,1,0,1
887,1,19.0,0,0,30.0000,0,0,1,1,0
888,3,28.0,1,2,23.4500,0,0,1,1,0
889,1,26.0,0,0,30.0000,1,0,0,0,1


In [27]:
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score

In [28]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=98,
    max_depth=12,
    learning_rate=0.1,
    reg_lambda=1.15,
    subsample=0.95,
    colsample_bytree=1,
    random_state=43
)

lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)
print("LightGBM Accuracy:", accuracy_score(y_test, lgb_pred))

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 199
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
LightGBM Accuracy: 0.8491620111731844


In [29]:
print("\nClassification Report (LightGBM):")
print(classification_report(y_test, lgb_pred))


Classification Report (LightGBM):
              precision    recall  f1-score   support

           0       0.86      0.89      0.87       105
           1       0.83      0.80      0.81        74

    accuracy                           0.85       179
   macro avg       0.85      0.84      0.84       179
weighted avg       0.85      0.85      0.85       179



# Saving the Model

## 1. Using Joblib

In [30]:
import joblib

In [31]:
# Save the model
joblib.dump(lgb_model, "lightgbm_model.pkl")

['lightgbm_model.pkl']

## 2. Using LGBM Default

In [32]:
# Save only the booster (not sklearn wrapper)
lgb_model.booster_.save_model("lightgbm_booster.txt")

# Load it back (only booster)
booster = lgb.Booster(model_file="lightgbm_booster.txt")

In [38]:
y_pred=booster.predict(X_test)
y_pred = (y_pred >=0.5).astype(int)

In [39]:
print(accuracy_score(y_pred, y_test))

0.8491620111731844


In [None]:
# !pip3 install onnxmltools skl2onnx onnxruntime

In [None]:
# !pip uninstall onnxmltools skl2onnx onnx -y
# !pip install onnxmltools skl2onnx onnx

In [None]:
import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType 

# Fit your model
lgb_model.fit(X_train, y_train)

# Define the input type using onnxmltools' FloatTensorType
initial_type = [('input', FloatTensorType([None, X_train.shape[1]]))]

# Convert the model
onnx_model = onnxmltools.convert_lightgbm(lgb_model, initial_types=initial_type)

# Save the ONNX model
with open("lightgbm_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 199
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
