In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Load Titanic dataset directly from GitHub
train_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
train = pd.read_csv(train_url)

# Feature Engineering
def feature_engineering(df):
    df = df.copy()

    # Extract title from Name
    df["Title"] = df["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip())
    title_map = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4}
    df["Title"] = df["Title"].map(lambda x: title_map.get(x, 4))  # Map rare titles together

    # Family size
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    # Cabin feature (Extract first letter)
    df["Cabin"] = df["Cabin"].fillna("X").apply(lambda x: x[0])

    # Ticket feature (First character)
    df["TicketPrefix"] = df["Ticket"].apply(lambda x: x[0] if x[0].isalpha() else "X")

    # Fill missing values
    df["Age"].fillna(df["Age"].median(), inplace=True)
    df["Fare"].fillna(df["Fare"].median(), inplace=True)
    df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)

    # Encode categorical variables
    label_enc = LabelEncoder()
    for col in ["Sex", "Embarked", "Cabin", "TicketPrefix"]:
        df[col] = label_enc.fit_transform(df[col])

    # Drop unnecessary columns
    df.drop(["Name", "Ticket", "PassengerId"], axis=1, inplace=True, errors="ignore")
    
    return df

# Apply feature engineering
train = feature_engineering(train)

# Split into features and target
X = train.drop("Survived", axis=1)
y = train["Survived"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
xgb_model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, use_label_encoder=False, eval_metric='logloss')
lgb_model = lgb.LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=4)
cat_model = CatBoostClassifier(n_estimators=200, learning_rate=0.05, depth=4, verbose=0)

# Define stacking ensemble with Logistic Regression as meta-model
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('cat', cat_model)
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

# Train the ensemble model
print("🚀 Training Stacking Model...")
stacking_model.fit(X_train, y_train)
stack_preds = stacking_model.predict(X_valid)

# Evaluate performance
accuracy = accuracy_score(y_valid, stack_preds)
print(f"✅ Stacking Model Accuracy on Validation Set: {accuracy:.4f}")


🚀 Training Stacking Model...


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 225
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 209
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] Start training from score -0.506142
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 209
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] Start training from score -0.506142
[LightGBM] [Info] Number o

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Load Titanic dataset from GitHub
train_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
train = pd.read_csv(train_url)

# Improved Feature Engineering
def feature_engineering(df):
    df = df.copy()

    # Extract title from Name
    df["Title"] = df["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip())
    title_map = {
        'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4
    }
    df["Title"] = df["Title"].map(lambda x: title_map.get(x, 4))  # Map rare titles together

    # Family size
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    # Create Age Groups
    df["AgeGroup"] = pd.cut(df["Age"], bins=[0, 12, 18, 35, 50, 80], labels=[0, 1, 2, 3, 4])
    df["AgeGroup"] = df["AgeGroup"].cat.codes  # Convert to numeric

    # Cabin feature (Extract first letter)
    df["Cabin"] = df["Cabin"].fillna("X").apply(lambda x: x[0])

    # Ticket feature (First character)
    df["TicketPrefix"] = df["Ticket"].apply(lambda x: x[0] if x[0].isalpha() else "X")

    # Fare binning
    df["FareGroup"] = pd.qcut(df["Fare"], 4, labels=False)

    # Fill missing values
    df["Age"].fillna(df["Age"].median(), inplace=True)
    df["Fare"].fillna(df["Fare"].median(), inplace=True)
    df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)

    # Encode categorical variables
    label_enc = LabelEncoder()
    for col in ["Sex", "Embarked", "Cabin", "TicketPrefix"]:
        df[col] = label_enc.fit_transform(df[col])

    # Drop unnecessary columns
    df.drop(["Name", "Ticket", "PassengerId", "Age", "Fare"], axis=1, inplace=True, errors="ignore")
    
    return df

# Apply feature engineering
train = feature_engineering(train)

# Split into features and target
X = train.drop("Survived", axis=1)
y = train["Survived"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models with better hyperparameters
xgb_model = xgb.XGBClassifier(n_estimators=300, learning_rate=0.03, max_depth=5, use_label_encoder=False, eval_metric='logloss')
lgb_model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.03, max_depth=5)
cat_model = CatBoostClassifier(n_estimators=300, learning_rate=0.03, depth=5, verbose=0)

# Define stacking ensemble with **RandomForest** as meta-model (better than LogisticRegression)
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('cat', cat_model)
    ],
    final_estimator=RandomForestClassifier(n_estimators=200, max_depth=5),
    cv=5
)

# Train the ensemble model
print("🚀 Training Stacking Model...")
stacking_model.fit(X_train, y_train)
stack_preds = stacking_model.predict(X_valid)

# Evaluate performance
accuracy = accuracy_score(y_valid, stack_preds)
print(f"✅ Improved Stacking Model Accuracy: {accuracy:.4f}")


🚀 Training Stacking Model...
[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] Start training from score -0.506142
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 62
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 11
[LightGBM] [Info] [binary:BoostFro

In [14]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

# Install missing packages
import os
os.system("pip install optuna imbalanced-learn pytorch-tabnet")

# Load Titanic dataset
train_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
train = pd.read_csv(train_url)

# Feature Engineering
def feature_engineering(df):
    df = df.copy()
    df["Title"] = df["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip())
    title_map = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4}
    df["Title"] = df["Title"].map(lambda x: title_map.get(x, 4))
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["AgeGroup"] = pd.cut(df["Age"], bins=[0, 12, 18, 35, 50, 80], labels=[0, 1, 2, 3, 4]).cat.codes
    df["Cabin"] = df["Cabin"].fillna("X").apply(lambda x: x[0])
    df["TicketPrefix"] = df["Ticket"].apply(lambda x: x[0] if x[0].isalpha() else "X")
    df["FareGroup"] = pd.qcut(df["Fare"], 4, labels=False)
    df["Age"].fillna(df["Age"].median(), inplace=True)
    df["Fare"].fillna(df["Fare"].median(), inplace=True)
    df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)
    label_enc = LabelEncoder()
    for col in ["Sex", "Embarked", "Cabin", "TicketPrefix"]:
        df[col] = label_enc.fit_transform(df[col])
    df.drop(["Name", "Ticket", "PassengerId", "Age", "Fare"], axis=1, inplace=True, errors="ignore")
    return df

train = feature_engineering(train)
X = train.drop("Survived", axis=1)
y = train["Survived"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Convert to NumPy (✅ FIX for StackingClassifier)
X_train_np, X_valid_np = X_train.to_numpy(), X_valid.to_numpy()

# Optuna Hyperparameter Tuning for XGBoost
def tune_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    model = xgb.XGBClassifier(**params)
    model.fit(X_train_np, y_train)
    preds = model.predict(X_valid_np)
    return accuracy_score(y_valid, preds)

study = optuna.create_study(direction="maximize")
study.optimize(tune_xgb, n_trials=30)
best_xgb_params = study.best_params

# Define base models
xgb_model = xgb.XGBClassifier(**best_xgb_params, use_label_encoder=False, eval_metric='logloss')
lgb_model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.03, max_depth=5)
cat_model = CatBoostClassifier(n_estimators=300, learning_rate=0.03, depth=5, verbose=0)

# TabNet Model (✅ Fixed NumPy conversion)
tabnet_model = TabNetClassifier(verbose=0)
tabnet_model.fit(X_train_np, y_train.to_numpy(), max_epochs=200, patience=20)

# Stacking Model
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('cat', cat_model),
        ('tabnet', tabnet_model)
    ],
    final_estimator=xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4),
    cv=5
)

# Train the ensemble model
print("🚀 Training Stacking Model...")
stacking_model.fit(X_train_np, y_train)
stack_preds = stacking_model.predict(X_valid_np)

# Evaluate performance
accuracy = accuracy_score(y_valid, stack_preds)
print(f"✅ Optimized Stacking Model Accuracy: {accuracy:.4f}")


[I 2025-03-15 17:31:42,878] A new study created in memory with name: no-name-342d717f-24bf-47fa-84cb-f1b8aac427e4
Parameters: { "use_label_encoder" } are not used.

[I 2025-03-15 17:31:43,215] Trial 0 finished with value: 0.8268156424581006 and parameters: {'n_estimators': 367, 'max_depth': 6, 'learning_rate': 0.07519056372056589, 'subsample': 0.8161495994795948, 'colsample_bytree': 0.865075385296092}. Best is trial 0 with value: 0.8268156424581006.
Parameters: { "use_label_encoder" } are not used.

[I 2025-03-15 17:31:43,411] Trial 1 finished with value: 0.8268156424581006 and parameters: {'n_estimators': 298, 'max_depth': 6, 'learning_rate': 0.03062783871152882, 'subsample': 0.8249322451399396, 'colsample_bytree': 0.6658915937116852}. Best is trial 0 with value: 0.8268156424581006.
Parameters: { "use_label_encoder" } are not used.

[I 2025-03-15 17:31:43,640] Trial 2 finished with value: 0.8212290502793296 and parameters: {'n_estimators': 484, 'max_depth': 6, 'learning_rate': 0.04930

🚀 Training Stacking Model...
[LightGBM] [Info] Number of positive: 444, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 888, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


  return collate([torch.as_tensor(b) for b in batch], collate_fn_map=collate_fn_map)
Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 355, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 710, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 355, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61
[LightGBM] [Info] Number of data points in the train set: 710, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM]



✅ Optimized Stacking Model Accuracy: 0.7933


