# 🛠️ Titanic Survival Prediction - Data Assessment & Cleaning

This notebook is part of the Titanic Survival Prediction project. In this phase, we focus on building model using different alogorithms.

[MLflow server](https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow)

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier,
    GradientBoostingClassifier, RandomForestClassifier
)
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
df = pd.read_csv("/kaggle/input/titanic-features/titanic_features.csv")
X, y = df.drop(columns=["Survived"]), df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                   random_state=42)

In [3]:
!pip install mlflow==2.15.0

Collecting mlflow==2.15.0
  Downloading mlflow-2.15.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.15.0 (from mlflow==2.15.0)
  Downloading mlflow_skinny-2.15.0-py3-none-any.whl.metadata (30 kB)
Collecting graphene<4 (from mlflow==2.15.0)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting pyarrow<16,>=4.0.0 (from mlflow==2.15.0)
  Downloading pyarrow-15.0.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting querystring-parser<2 (from mlflow==2.15.0)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting gunicorn<23 (from mlflow==2.15.0)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.15.0->mlflow==2.15.0)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting importlib-metadata!=4.7.0,<8,>=3.7.0 (from mlflow-skinny==2.15.0->mlflow==2.15.0)
  Downloading importlib_metadata-7

In [4]:
!pip install dagshub==0.3.34

Collecting dagshub==0.3.34
  Downloading dagshub-0.3.34-py3-none-any.whl.metadata (11 kB)
Collecting fusepy>=3 (from dagshub==0.3.34)
  Downloading fusepy-3.0.1.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting appdirs>=1.4.4 (from dagshub==0.3.34)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting httpx~=0.23.0 (from dagshub==0.3.34)
  Downloading httpx-0.23.3-py3-none-any.whl.metadata (7.1 kB)
Collecting rich~=13.1.0 (from dagshub==0.3.34)
  Downloading rich-13.1.0-py3-none-any.whl.metadata (18 kB)
Collecting dacite~=1.6.0 (from dagshub==0.3.34)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting tenacity~=8.2.2 (from dagshub==0.3.34)
  Downloading tenacity-8.2.3-py3-none-any.whl.metadata (1.0 kB)
Collecting gql[requests] (from dagshub==0.3.34)
  Downloading gql-3.5.3-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting treelib~=1.6.4 (from dagshub==0.3.34)
  Downloading treelib-1.6.4-py3-none-

In [5]:
import mlflow
import mlflow.sklearn
import dagshub
import os

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
dagshub_token = user_secrets.get_secret("DAGSHUB_PAT")

if not dagshub_token:
    raise EnvironmentError("DAGSHUB_PAT environment variable is not set")

os.environ["MLFLOW_TRACKING_USERNAME"] = dagshub_token
os.environ["MLFLOW_TRACKING_PASSWORD"] = dagshub_token

dagshub_url = "https://dagshub.com"
repo_owner = "pxxthik"
repo_name = "Titanic-Survival-Prediction"

# Set up MLflow tracking URI
mlflow.set_tracking_uri(f'{dagshub_url}/{repo_owner}/{repo_name}.mlflow')

In [6]:
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "GaussianNB": GaussianNB(),
    "BernoulliNB": BernoulliNB(),
    "SGD Classifier": SGDClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Bagging": BaggingClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [7]:
mlflow.set_experiment("Model Building")

<Experiment: artifact_location='mlflow-artifacts:/daa7d6607ed54c6dbfaf278212c308e4', creation_time=1750267522668, experiment_id='2', last_update_time=1750267522668, lifecycle_stage='active', name='Model Building', tags={}>

In [8]:
import uuid

In [9]:
# Train and log each model
for model_name, model in models.items():

    random_id = uuid.uuid4().hex[:6]
    run_name = f"{model_name}_{random_id}"
    
    with mlflow.start_run(run_name=run_name):
        # Train
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_test)

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)

        # Logging
        mlflow.log_param("model_name", model_name)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "roc_auc": roc_auc
        })

        # Log the model itself
        mlflow.sklearn.log_model(model, artifact_path="model")

        print(f"Logged {model_name} to MLflow with Accuracy: {acc:.4f}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2025/06/18 17:47:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression_f578c9 at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/c697bc49df4e4d19a1254f0d6ce5f83d.
2025/06/18 17:47:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged Logistic Regression to MLflow with Accuracy: 0.7933


2025/06/18 17:48:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN_301a63 at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/7d6630f4b8d14d608346f9fdaa98bcd3.
2025/06/18 17:48:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged KNN to MLflow with Accuracy: 0.7318




Logged Decision Tree to MLflow with Accuracy: 0.7877


2025/06/18 17:48:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree_23fd0a at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/61711ae18e7348759c6b3a3f80fc9433.
2025/06/18 17:48:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged Random Forest to MLflow with Accuracy: 0.8380


2025/06/18 17:48:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest_e133c3 at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/8db8a9462d174b419db9e29bfa5e3f12.
2025/06/18 17:48:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged SVM to MLflow with Accuracy: 0.6592


2025/06/18 17:48:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVM_3a8e90 at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/590ef048d6b34d6aaf1fb72034d89525.
2025/06/18 17:48:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged GaussianNB to MLflow with Accuracy: 0.7709


2025/06/18 17:48:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run GaussianNB_5d8136 at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/2f99119202324b35b270bc889f8127cf.
2025/06/18 17:48:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged BernoulliNB to MLflow with Accuracy: 0.7765


2025/06/18 17:49:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run BernoulliNB_412f67 at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/288f906c2bea4ff7865d075fc89f7327.
2025/06/18 17:49:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged SGD Classifier to MLflow with Accuracy: 0.8045


2025/06/18 17:49:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run SGD Classifier_0624cd at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/fff02fdcfd6c4917ab9b07ab65efdbfb.
2025/06/18 17:49:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged Gradient Boosting to MLflow with Accuracy: 0.8101


2025/06/18 17:49:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run Gradient Boosting_f6b8b3 at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/acda1b85e12b4f62a7eb7c475c1e46b8.
2025/06/18 17:49:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged AdaBoost to MLflow with Accuracy: 0.8101


2025/06/18 17:49:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run AdaBoost_32fd9b at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/5fc88457d976405ab694ab24677114b1.
2025/06/18 17:49:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged Bagging to MLflow with Accuracy: 0.8380


2025/06/18 17:49:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run Bagging_f28918 at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/fadf3f85a6c4486d97a39a30047ab1d2.
2025/06/18 17:49:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged Extra Trees to MLflow with Accuracy: 0.8324


2025/06/18 17:50:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run Extra Trees_69252f at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/4eb251742f924659bad144b6c3bed81f.
2025/06/18 17:50:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


Logged XGBoost to MLflow with Accuracy: 0.8156


2025/06/18 17:50:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost_9fd052 at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2/runs/cb3e462a1861420397ec677b01a6d633.
2025/06/18 17:50:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/pxxthik/Titanic-Survival-Prediction.mlflow/#/experiments/2.


In [10]:
# Extra trees and Random Forest are performing better