# Loading and Preprocessing Titanic Dataset

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load Titanic dataset from seaborn
import seaborn as sns
df = sns.load_dataset("titanic")

# Drop rows with missing target
df = df.dropna(subset=['survived'])

# Select features and target
features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
X = df[features]
y = df['survived']

# Handle missing values simply
X['age'].fillna(X['age'].median(), inplace=True)
X['embarked'].fillna('S', inplace=True)  # most common

# Identify categorical columns
cat_features = ['pclass', 'sex', 'embarked']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['age'].fillna(X['age'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age'].fillna(X['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(val

In [6]:
#X['sex'] = X['sex'].replace({'male':1, 'female':0})
X = pd.get_dummies(X, columns=['embarked'], dtype='int')
X = pd.get_dummies(X, columns=['sex'], dtype='int')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
X

Unnamed: 0,pclass,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,sex_female,sex_male
0,3,22.0,1,0,7.2500,0,0,1,0,1
1,1,38.0,1,0,71.2833,1,0,0,1,0
2,3,26.0,0,0,7.9250,0,0,1,1,0
3,1,35.0,1,0,53.1000,0,0,1,1,0
4,3,35.0,0,0,8.0500,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,0,1,0,1
887,1,19.0,0,0,30.0000,0,0,1,1,0
888,3,28.0,1,2,23.4500,0,0,1,1,0
889,1,26.0,0,0,30.0000,1,0,0,0,1


# I. Individual Model Tuning

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import time
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report

### 1. XG Boost

In [9]:
from xgboost import XGBClassifier

In [10]:
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[86 19]
 [18 56]]
              precision    recall  f1-score   support

           0       0.83      0.82      0.82       105
           1       0.75      0.76      0.75        74

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.79      0.79      0.79       179



In [11]:
def hypertune(model, param):
        cv = KFold(n_splits=5)
        grid_search = GridSearchCV(model, param, cv=cv, scoring = ('accuracy'), return_train_score = True, 
                             verbose = 2)
        
        start = time.time()
        %time grid_search.fit(X_train, y_train)
        stop = time.time()
        sec = stop-start
        minu = sec/60
        hrs = minu/60
        print("\n")
        print('Total time required for execution:', hrs)
        print("\n")
        print('best parameters are:', grid_search.best_params_)
        print('CV accuracy:', grid_search.best_score_)
        print("\n")
        print("The cross validation results are:", grid_search.cv_results_)

In [12]:
model1 = XGBClassifier(
                      learning_rate = 0.1,
                      n_estimators = 100,
                      max_depth= 12,
                      min_child_weight = 1,
                      gamma = 0,
                      subsample =1, 
                      colsample_bytree = 1,
                      reg_alpha = 1.,
                      reg_lambda=1,
                      n_jobs = -1
                      )

param1 = {'learning_rate':[0.09, 0.094, 0.096], 'n_estimators': [95, 96,97]}

In [13]:
hypertune(model1, param1)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ................learning_rate=0.09, n_estimators=95; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=95; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=95; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=95; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=95; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=96; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=96; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=96; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=96; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=96; total time=   0.2s
[CV] END ................learning_rate=0.09, n_estimators=97; total time=   0.2s
[CV] END ................learning_rate=0.09, n_es

### 2. LightGBM

In [14]:
import lightgbm as lgb

In [15]:
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 199
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
[[89 16]
 [16 58]]
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       105
           1       0.78      0.78      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.82      0.82      0.82       179



In [16]:
model2 = lgb.LGBMClassifier(learning_rate = 0.1, n_estimators = 100,  max_depth= 12, 
                                        min_child_weight = 1, gamma = 0, subsample =1, 
                              colsample_bytree = 1, reg_alpha = 1., reg_lambda=1,
                            n_jobs = -1)
param2 = {'learning_rate':[0.05, 0.09, 0.1], 'n_estimators': range(94, 102, 2)}

hypertune(model2, param2)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 212, number of negative: 357
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372583 -> initscore=-0.521150
[LightGBM] [Info] Start training from score -0.521150
[CV] END ................learning_rate=0.05, n_estimators=94; total time=   0.2s
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [In

# II. Multiple Model Tuning

In [17]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Define models and hyperparameters
boosting_models = {
    'adaboost': (AdaBoostClassifier(random_state=42), {
        'model__n_estimators': [50, 100],
        'model__learning_rate': [0.5, 1.0],
    }),
    
    'gb': (GradientBoostingClassifier(random_state=42), {
        'model__n_estimators': [100, 150, 200],
        'model__learning_rate': [0.05, 0.1],
        'model__max_depth': [9, 10, 12],
    }),

    'xgb': (XGBClassifier(eval_metric='logloss', random_state=42), {
        'model__n_estimators': [95, 97, 100],
        'model__learning_rate': [0.085,0.09,0.095],
        'model__max_depth': [8, 9, 10],
        'model__reg_lambda': [0.85,0.95,1],
        'model__colsample_bytree':[0.8, 0.85, 0.9],
        'model__subsample':[0.8, 0.85, 0.9,1],
    }),

    'lgbm': (lgb.LGBMClassifier(random_state=42), {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.05, 0.1],
        'model__max_depth': [3, 5],
    }),

}

In [None]:
for name, (model, params) in boosting_models.items():
    print(f"\n🔍 Running GridSearchCV for: {name}")
    
    # Define pipeline
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    
    # Grid Search
    grid = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs=-1, scoring='accuracy')
    grid.fit(X_train, y_train)

    # Output
    print(f"✅ Best Parameters for {name}: {grid.best_params_}")
    print(f"📈 Best CV Accuracy for {name}: {grid.best_score_:.4f}")
    print(f"🧪 Test Accuracy: {grid.score(X_test, y_test):.4f}")


🔍 Running GridSearchCV for: adaboost
✅ Best Parameters for adaboost: {'model__learning_rate': 1.0, 'model__n_estimators': 100}
📈 Best CV Accuracy for adaboost: 0.8033
🧪 Test Accuracy: 0.7933

🔍 Running GridSearchCV for: gb
✅ Best Parameters for gb: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__n_estimators': 200}
📈 Best CV Accuracy for gb: 0.7866
🧪 Test Accuracy: 0.8045

🔍 Running GridSearchCV for: xgb
✅ Best Parameters for xgb: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.09, 'model__max_depth': 8, 'model__n_estimators': 97, 'model__reg_lambda': 1, 'model__subsample': 0.8}
📈 Best CV Accuracy for xgb: 0.8287
🧪 Test Accuracy: 0.8156

🔍 Running GridSearchCV for: lgbm
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002468



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010691 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 187
[LightGBM] [Info] Number of data points in the train set: 570, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377193 -> initscore=-0.501480
[LightGBM] [Info] Start training from score -0.501480

[LightGBM] [Info] Number of positive: 214, number of negative: 356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 193
[LightGBM] [Info] Number of data points in the train set: 570, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.375439 -> initscore=-0.508955
[LightGBM] [Info] 



[LightGBM] [Info] Number of positive: 215, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000749 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 193
[LightGBM] [Info] Number of data points in the train set: 570, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377193 -> initscore=-0.501480
[LightGBM] [Info] Start training from score -0.501480
[LightGBM] [Info] Number of positive: 214, number of negative: 356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 193
[LightGBM] [Info] Number of data points in the train set: 570, number of used features: 10
[LightGBM] [Info] [binary:BoostF



[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 192
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] Start training from score -0.506142
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info





[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000934 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 192
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] Start training from score -0.506142




[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001004 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 192
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] Start training from score -0.506142
[LightGBM] [Info] Number of positive: 215, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 193
[LightGBM] [Info] Number of data points in the train set: 570, number of used features: 10
[LightGBM] [Info] [binary:BoostF



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 192
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] Start training from score -0.506142
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 192
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] S




















[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 207
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
✅ Best Parameters for lgbm: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 200}
📈 Best CV Accuracy for lgbm: 0.8413
🧪 Test Accuracy: 0.8045




In [20]:
# import pandas as pd
# from sklearn.model_selection import train_test_split

# # Load Titanic dataset from seaborn
# import seaborn as sns
# df = sns.load_dataset("titanic")

# # Drop rows with missing target
# df = df.dropna(subset=['survived'])

# # Select features and target
# features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
# df= df[features]



# df['age'].fillna(df['age'].median(), inplace=True)
# df['embarked'].fillna('S', inplace=True)  # most common

# df.isna().sum()


# #X['sex'] = X['sex'].replace({'male':1, 'female':0})
# df = pd.get_dummies(df, columns=['embarked'], dtype='int')
# df = pd.get_dummies(df, columns=['sex'], dtype='int')

# df

#df.to_csv('titanic.csv', index=False)
