# Titanic: Clean FE + XGBoost Notebook
This notebook is organized **from scratch** to avoid feature mismatch issues (e.g., missing `Name`).
It includes:
- clean data loading (keep raw columns)
- feature engineering transformer (Title, Cabin, Ticket groups, Family, rule-features)
- train/holdout evaluation
- optional XGBoost early stopping (planned; not implemented in the cells below yet)
- Kaggle submission export


In [38]:
# --- Imports ---
import numpy as np
import pandas as pd
import os

# PYTHONIOENCODING is read when an interpreter starts, so setting it here does not
# change this kernel's own streams; it is inherited by child processes started later.
os.environ["PYTHONIOENCODING"] = "utf-8"

# D:\joblib_tmp is a Windows drive path: apply it on Windows only, and never
# override a JOBLIB_TEMP_FOLDER that was already set outside the notebook.
if os.name == "nt":
    os.environ.setdefault("JOBLIB_TEMP_FOLDER", r"D:\joblib_tmp")

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier


In [39]:
# --- Config ---
# Column roles in the train/test CSVs.
ID_COL = "PassengerId"
TARGET_COL = "Survived"

# Input files, given relative to the working directory (change DATA_DIR if needed).
DATA_DIR = "../data"
TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH = f"{DATA_DIR}/test.csv"

# One seed, reused by the holdout split and by the model.
RANDOM_STATE = 2025


## 1) Load raw data (keep original columns)
Important: we keep `Name`, `Ticket`, `Cabin` etc. Feature engineering needs them.


In [40]:
def load_raw(train_path=TRAIN_PATH, test_path=TEST_PATH):
    """Read the train and test CSVs and separate the target from the features.

    Parameters
    ----------
    train_path, test_path : path-like
        Locations of the labelled and unlabelled CSV files.

    Returns
    -------
    (X, y, X_test)
        ``X`` is the training frame without ``TARGET_COL``, ``y`` is the target
        cast to ``int``, and ``X_test`` is a copy of the test frame. No other
        column is removed.

    Raises
    ------
    AssertionError
        If ``ID_COL`` is missing from either frame.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    target = train_df[TARGET_COL].astype(int)
    features = train_df.drop(columns=TARGET_COL)
    test_features = test_df.copy()

    # Safety checks: both frames must carry the passenger id.
    assert all(ID_COL in frame.columns for frame in (features, test_features))
    return features, target, test_features

# Load once with the configured paths; every raw column is kept for the FE step.
X, y, X_test = load_raw()

print("Train shape:", X.shape, "Target:", y.shape, "Test shape:", X_test.shape)
print("Columns:", X.columns.tolist())


Train shape: (891, 11) Target: (891,) Test shape: (418, 11)
Columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


## 2) Split train/holdout
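The holdout set is a reality check: it guards against overfitting to the training split while iterating on features. Keep in mind that any setting chosen by comparing holdout scores makes the reported holdout accuracy optimistic.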


In [41]:
# 80/20 split, stratified on the target so both parts keep its class balance.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

print("Train split:", X_train.shape, "Holdout:", X_holdout.shape)


Train split: (712, 11) Holdout: (179, 11)


In [42]:
# Ticket frequencies on the training split only.
# Read X_train directly instead of rebinding X: X must keep meaning the full
# 891-row training frame for the cells that follow.
X_train["Ticket"].value_counts()

Ticket
CA. 2343              7
382652                5
347082                5
CA 2144               5
W./C. 6608            4
                     ..
PC 17474              1
A/5. 13032            1
370365                1
SOTON/O.Q. 3101307    1
244358                1
Name: count, Length: 563, dtype: int64

## 3) Feature Engineering Transformer
Creates:
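- Title / TitleGroup
- AgeIsMissing / AgeImputed (Title+Pclass median, global median as fallback)
- FamilySize / IsAlone / SmallFamily / LargeFamily
- HasCabin
- TicketGroupSize / IsGroupTicket
- FareLog / FarePerPerson / FarePerPersonLog / IsFareZero
- Rule-features (IsUpperClass, IsFemale, IsBoy, IsUpperClassFemale, IsUpperClassBoy, IsLowerClassStrongFemale)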

It is robust to missing columns: if a column isn't present, it falls back safely.


In [46]:
class TitanicFE(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the Titanic passenger table.

    ``fit`` learns imputation statistics from the frame it is given and
    ``transform`` returns a copy of its input with the engineered columns added.
    The target is never used.

    Parameters
    ----------
    boy_age_cutoff : number, default 10
        A male passenger whose (imputed) age is <= this value gets ``IsBoy = 1``.
    upper_class_max : int, default 2
        A passenger with ``Pclass`` <= this value gets ``IsUpperClass = 1``.

    Attributes set by ``fit``
    -------------------------
    embarked_mode_ : most frequent ``Embarked`` value ("S" if unavailable)
    age_median_by_title_pclass_ : Series of Age medians indexed by (Title, Pclass);
        groups without any known age are left out
    age_global_median_ : overall Age median (30.0 if no age is known)
    ticket_counts_ : number of rows per ``Ticket`` value in the fitted frame
    fare_median_ : Fare median (0.0 if no fare is known)
    """

    # Raw title -> coarse group; anything not listed here becomes "Rare".
    _TITLE_TO_GROUP = {
        "Mr": "Mr",
        "Mrs": "Mrs", "Mme": "Mrs",
        "Miss": "Miss", "Mlle": "Miss", "Ms": "Miss",
        "Master": "Master",
        "Lady": "noble_female", "Countess": "noble_female", "Dona": "noble_female",
        "Dr": "Rare_Male", "Rev": "Rare_Male", "Col": "Rare_Male", "Major": "Rare_Male",
        "Capt": "Rare_Male", "Sir": "Rare_Male", "Don": "Rare_Male", "Jonkheer": "Rare_Male",
    }

    def __init__(self, boy_age_cutoff=10, upper_class_max=2):
        self.boy_age_cutoff = boy_age_cutoff
        self.upper_class_max = upper_class_max

    def fit(self, X, y=None):
        """Learn the imputation statistics from ``X``; ``y`` is ignored. Returns ``self``."""
        # X is only read here, so no defensive copy is needed.

        # Embarked mode
        if "Embarked" in X.columns:
            mode = X["Embarked"].mode(dropna=True)
            self.embarked_mode_ = mode.iloc[0] if len(mode) else "S"
        else:
            self.embarked_mode_ = "S"

        # Age medians by (Title, Pclass)
        title = self._safe_title_series(X)
        pclass = X["Pclass"] if "Pclass" in X.columns else pd.Series([3] * len(X), index=X.index)
        age = X["Age"] if "Age" in X.columns else pd.Series([np.nan] * len(X), index=X.index)

        age_table = pd.DataFrame({"Title": title, "Pclass": pclass, "Age": age})
        group_medians = age_table.groupby(["Title", "Pclass"])["Age"].median()
        # A group whose ages are all missing has a NaN median. Drop it so that
        # transform() falls back to the global median instead of "imputing" NaN.
        self.age_median_by_title_pclass_ = group_medians.dropna()
        self.age_global_median_ = (
            float(age_table["Age"].median()) if age_table["Age"].notna().any() else 30.0
        )

        # Ticket counts (fitted frame only; safe, no target)
        if "Ticket" in X.columns:
            self.ticket_counts_ = X["Ticket"].value_counts()
        else:
            self.ticket_counts_ = pd.Series(dtype=int)

        # Fare median
        if "Fare" in X.columns and X["Fare"].notna().any():
            self.fare_median_ = float(X["Fare"].median())
        else:
            self.fare_median_ = 0.0

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered feature columns added.

        Raw columns that are absent from ``X`` are first created with a neutral
        default, so the method also works on frames missing some inputs.
        """
        X = X.copy()

        # Ensure expected columns exist
        for col, default in [
            ("Pclass", 3),
            ("Sex", "male"),
            ("Age", np.nan),
            ("SibSp", 0),
            ("Parch", 0),
            ("Fare", np.nan),
            ("Embarked", np.nan),
            ("Cabin", np.nan),
            ("Ticket", "UNKNOWN"),
            ("Name", np.nan),
        ]:
            if col not in X.columns:
                X[col] = default

        # Embarked
        X["Embarked"] = X["Embarked"].fillna(self.embarked_mode_)

        # Title & grouped title
        X["Title"] = X["Name"].map(self._extract_title)
        X["TitleGroup"] = X["Title"].map(self._title_group).fillna("Rare")

        # Age imputation: (Title, Pclass) median, else the global median.
        X["AgeIsMissing"] = X["Age"].isna().astype(int)
        X["AgeImputed"] = X["Age"]
        missing_age = X["AgeImputed"].isna()
        if missing_age.any():
            # Plain dict keyed by (title, pclass); NaN medians are excluded so a
            # lookup either yields a real number or misses and uses the fallback.
            medians = self.age_median_by_title_pclass_.dropna().to_dict()
            keys = zip(X.loc[missing_age, "Title"], X.loc[missing_age, "Pclass"])
            X.loc[missing_age, "AgeImputed"] = [
                medians.get(key, self.age_global_median_) for key in keys
            ]

        # Family
        X["FamilySize"] = X["SibSp"].fillna(0) + X["Parch"].fillna(0) + 1
        X["IsAlone"] = (X["FamilySize"] == 1).astype(int)
        X["SmallFamily"] = X["FamilySize"].between(2, 4).astype(int)
        X["LargeFamily"] = (X["FamilySize"] >= 5).astype(int)

        # Cabin
        X["HasCabin"] = X["Cabin"].notna().astype(int)

        # Fare (the raw column itself is overwritten with the imputed values)
        X["Fare"] = X["Fare"].fillna(self.fare_median_)
        X["IsFareZero"] = (X["Fare"] == 0).astype(int)
        X["FareLog"] = np.log1p(X["Fare"])
        X["FarePerPerson"] = X["Fare"] / X["FamilySize"].clip(lower=1)
        X["FarePerPersonLog"] = np.log1p(X["FarePerPerson"])

        # Ticket group size. Counts come from the fitted frame only: a ticket that
        # was not seen in fit() gets size 1, and rows passed here are not counted.
        X["TicketGroupSize"] = X["Ticket"].map(self.ticket_counts_).fillna(1).astype(int)
        X["IsGroupTicket"] = (X["TicketGroupSize"] > 1).astype(int)

        # Rule features
        X["IsUpperClass"] = (X["Pclass"] <= self.upper_class_max).astype(int)
        X["IsFemale"] = (X["Sex"] == "female").astype(int)
        X["IsBoy"] = ((X["Sex"] == "male") & (X["AgeImputed"] <= self.boy_age_cutoff)).astype(int)
        X["IsUpperClassFemale"] = (X["IsUpperClass"] & X["IsFemale"]).astype(int)
        X["IsUpperClassBoy"] = (X["IsUpperClass"] & X["IsBoy"]).astype(int)
        X["IsLowerClassStrongFemale"] = (
            (X["IsUpperClass"] == 0) & (X["IsFemale"] == 1) & (X["SmallFamily"] == 1)
        ).astype(int)

        return X

    def _safe_title_series(self, X):
        """Titles for every row of ``X``; all "Unknown" when there is no ``Name`` column."""
        if "Name" not in X.columns:
            return pd.Series(["Unknown"] * len(X), index=X.index)
        return X["Name"].map(self._extract_title)

    @staticmethod
    def _extract_title(name):
        """Extract the title from a "Surname, Title. Given names" string.

        Returns "Unknown" for missing names, for names without both "," and ".",
        and when the extracted title is empty. A leading article is removed, so
        "Rothes, the Countess. of ..." yields "Countess" rather than "the Countess".
        """
        if pd.isna(name):
            return "Unknown"
        text = str(name)
        if "," not in text or "." not in text:
            return "Unknown"
        title = text.split(",", 1)[1].split(".", 1)[0].strip()
        if title[:4].lower() == "the ":
            title = title[4:].strip()
        return title or "Unknown"

    @staticmethod
    def _title_group(title):
        """Map a raw title to its coarse group; unlisted titles map to "Rare"."""
        return TitanicFE._TITLE_TO_GROUP.get(title, "Rare")


## 4) Preprocessing (OneHot + StandardScaler)
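The `ColumnTransformer` below selects only the engineered columns listed in `NUM_COLS` and `CAT_COLS`: numeric features are standardized, categorical and 0/1 flag features are one-hot encoded, and every other column is dropped. This is the most common and robust setup.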


In [47]:
# Continuous / count features that go through the scaler.
NUM_COLS = [
    "AgeImputed",
    "FareLog", "FarePerPersonLog",
    "FamilySize", "TicketGroupSize",
]

# Features that are one-hot encoded. The order is kept as-is because it
# determines the column order of the encoded matrix.
CAT_COLS = [
    # raw categoricals, the missing-age indicator and the grouped title
    "Pclass", "Sex", "AgeIsMissing", "Embarked", "TitleGroup",
    # 0/1 flags produced by the feature-engineering step
    "HasCabin",
    "IsAlone", "SmallFamily", "LargeFamily",
    "IsUpperClassFemale", "IsUpperClassBoy",
    "IsFareZero", "IsGroupTicket",
    "IsLowerClassStrongFemale",
]

# Scale the numeric features and one-hot encode the categorical/flag features.
# handle_unknown="ignore": a category not seen during fit does not raise at predict time.
# remainder="drop": columns outside NUM_COLS / CAT_COLS are discarded.
preprocess = ColumnTransformer(
    [
        ("num", StandardScaler(), NUM_COLS),
        ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_COLS),
    ],
    remainder="drop",
)


## 5) Model (XGBoost)
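Start with your stable config: a shallow (`max_depth=2`), slow-learning base setup. The cell below sweeps `min_child_weight` from 1 to 21 around it and prints train vs. holdout accuracy for each value. Trying `max_depth=3` and/or early stopping is left for later.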


In [48]:
# XGBoost settings shared by every run of the sweep below.
BASE_PARAMS = {
    "max_depth": 2,
    "learning_rate": 0.01,
    "n_estimators": 800,
    "subsample": 0.9,
    "colsample_bytree": 0.6,
    "reg_alpha": 0.3,
    "reg_lambda": 1.0,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": RANDOM_STATE,
    "n_jobs": 1,
}

# Candidate values 1..21 for min_child_weight; one result record per value.
min_child_weight_grid = list(range(1, 22))
results = []

for mcw in min_child_weight_grid:
    xgb = XGBClassifier(**BASE_PARAMS, min_child_weight=mcw)

    # Every pipeline receives the same `preprocess` object (not a copy of it).
    pipe = Pipeline([
        ("fe", TitanicFE(boy_age_cutoff=10, upper_class_max=2)),
        ("prep", preprocess),
        ("model", xgb),
    ])
    pipe.fit(X_train, y_train)

    # Accuracy on the unseen holdout vs. on the data the model was fitted on.
    y_pred = pipe.predict(X_holdout)
    holdout_acc = accuracy_score(y_holdout, y_pred)
    train_acc = accuracy_score(y_train, pipe.predict(X_train))
    gap = train_acc - holdout_acc

    results.append(
        dict(min_child_weight=mcw, holdout_acc=holdout_acc, train_acc=train_acc, gap=gap)
    )

    print(f"min_child_weight={mcw}: holdout_acc={holdout_acc:.4f}, train_acc={train_acc:.4f}, gap={gap:.4f}")

# NOTE: after the loop, `pipe` and `xgb` refer to the last grid value only.

min_child_weight=1: holdout_acc=0.7989, train_acc=0.8581, gap=0.0593
min_child_weight=2: holdout_acc=0.7989, train_acc=0.8610, gap=0.0621
min_child_weight=3: holdout_acc=0.7989, train_acc=0.8610, gap=0.0621
min_child_weight=4: holdout_acc=0.7933, train_acc=0.8581, gap=0.0648
min_child_weight=5: holdout_acc=0.7933, train_acc=0.8581, gap=0.0648
min_child_weight=6: holdout_acc=0.7933, train_acc=0.8610, gap=0.0677
min_child_weight=7: holdout_acc=0.7877, train_acc=0.8581, gap=0.0704
min_child_weight=8: holdout_acc=0.7933, train_acc=0.8553, gap=0.0620
min_child_weight=9: holdout_acc=0.8045, train_acc=0.8483, gap=0.0438
min_child_weight=10: holdout_acc=0.7989, train_acc=0.8497, gap=0.0508
min_child_weight=11: holdout_acc=0.7933, train_acc=0.8497, gap=0.0564
min_child_weight=12: holdout_acc=0.8045, train_acc=0.8497, gap=0.0452
min_child_weight=13: holdout_acc=0.8101, train_acc=0.8511, gap=0.0411
min_child_weight=14: holdout_acc=0.8156, train_acc=0.8511, gap=0.0355
min_child_weight=15: holdout_

## 6) Fit + Evaluate (Train vs Holdout)


## 7) Kaggle Submission


In [15]:
# Fit on full training data, predict on test.
# The full labelled set is rebuilt from the two split parts instead of being read
# from the global X, so a rebound X in another cell cannot make the features and
# the labels disagree in length.
X_full = pd.concat([X_train, X_holdout]).sort_index()
y_full = pd.concat([y_train, y_holdout]).sort_index()
assert X_full.index.equals(y_full.index)

# Choose the configuration explicitly rather than reusing whichever pipeline the
# sweep loop left in `pipe` (that is merely the last grid value).
# Criterion: highest holdout accuracy, ties broken by the smaller train/holdout gap.
best_run = max(results, key=lambda run: (run["holdout_acc"], -run["gap"]))
print("Submitting with min_child_weight =", best_run["min_child_weight"])

final_pipe = Pipeline(steps=[
    ("fe", TitanicFE(boy_age_cutoff=10, upper_class_max=2)),
    ("prep", preprocess),
    ("model", XGBClassifier(min_child_weight=best_run["min_child_weight"], **BASE_PARAMS)),
])
final_pipe.fit(X_full, y_full)
test_pred = final_pipe.predict(X_test).astype(int)

# Column names come from the config cell so they cannot drift from the loader.
submission = pd.DataFrame({
    ID_COL: X_test[ID_COL],
    TARGET_COL: test_pred,
})

# Safety checks
assert len(submission) == len(X_test)
assert submission[ID_COL].is_unique
assert submission[TARGET_COL].isin([0, 1]).all()

submission.to_csv("submission_xgb_fe.csv", index=False)
submission.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
