# Titanic: Clean FE + XGBoost Notebook
This notebook is organized **from scratch** to avoid feature mismatch issues (e.g., missing `Name`).
It includes:
- clean data loading (keep raw columns)
- feature engineering transformer (Title, Cabin, Ticket groups, Family, rule-features)
- train/holdout evaluation
- optional XGBoost early stopping
- Kaggle submission export


In [2]:
# --- Imports ---
import numpy as np
import pandas as pd
import os

os.environ["PYTHONIOENCODING"] = "utf-8"
os.environ["JOBLIB_TEMP_FOLDER"] = r"D:\joblib_tmp"

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier


In [3]:
# --- Config ---
RANDOM_STATE = 42

TRAIN_PATH = "../data/train.csv"   # change if needed
TEST_PATH  = "../data/test.csv"    # change if needed

TARGET_COL = "Survived"
ID_COL = "PassengerId"


## 1) Load raw data (keep original columns)
Important: we keep `Name`, `Ticket`, `Cabin` etc. Feature engineering needs them.


In [4]:
def load_raw(train_path=TRAIN_PATH, test_path=TEST_PATH):
    train = pd.read_csv(train_path)
    test  = pd.read_csv(test_path)

    y = train[TARGET_COL].astype(int)
    X = train.drop(columns=[TARGET_COL])
    X_test = test.copy()

    # Safety checks
    assert ID_COL in X.columns and ID_COL in X_test.columns
    return X, y, X_test

df_train, y_train, df_test = load_raw()
print("Train shape:", df_train.shape, "Target:", y_train.shape, "Test shape:", df_test.shape)
print("Columns:", list(df_train.columns))


Train shape: (891, 11) Target: (891,) Test shape: (418, 11)
Columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


## 3) Feature Engineering Transformer
Creates:
- TitleGroup
- AgeImputed (Title+Pclass median)
- FamilySize / IsAlone / SmallFamily / LargeFamily
- HasCabin
- TicketGroupSize
- FareLog / FarePerPersonLog / IsFareZero
- Rule-features (IsUpperClassFemale, IsUpperClassBoy)

It is robust to missing columns: if a column isn't present, it falls back safely.


In [8]:
class TitanicFE(BaseEstimator, TransformerMixin):
    def __init__(self, boy_age_cutoff=10, upper_class_max=2):
        self.boy_age_cutoff = boy_age_cutoff
        self.upper_class_max = upper_class_max

    def fit(self, X, y=None):
        X = X.copy()

        # Embarked mode
        if "Embarked" in X.columns:
            mode = X["Embarked"].mode(dropna=True)
            self.embarked_mode_ = mode.iloc[0] if len(mode) else "S"
        else:
            self.embarked_mode_ = "S"

        # Age medians by (Title, Pclass)
        title = self._safe_title_series(X)
        pclass = X["Pclass"] if "Pclass" in X.columns else pd.Series([3]*len(X), index=X.index)
        age = X["Age"] if "Age" in X.columns else pd.Series([np.nan]*len(X), index=X.index)

        tmp = pd.DataFrame({"Title": title, "Pclass": pclass, "Age": age})
        self.age_median_by_title_pclass_ = tmp.groupby(["Title", "Pclass"])["Age"].median()
        self.age_global_median_ = float(tmp["Age"].median()) if tmp["Age"].notna().any() else 30.0

        # Ticket counts (train-only; safe, no target)
        if "Ticket" in X.columns:
            self.ticket_counts_ = X["Ticket"].value_counts()
        else:
            self.ticket_counts_ = pd.Series(dtype=int)

        # Fare median
        if "Fare" in X.columns and X["Fare"].notna().any():
            self.fare_median_ = float(X["Fare"].median())
        else:
            self.fare_median_ = 0.0

        return self

    def transform(self, X):
        X = X.copy()

        # Ensure expected columns exist
        for col, default in [
            ("Pclass", 3),
            ("Sex", "male"),
            ("Age", np.nan),
            ("SibSp", 0),
            ("Parch", 0),
            ("Fare", np.nan),
            ("Embarked", np.nan),
            ("Cabin", np.nan),
            ("Ticket", "UNKNOWN"),
            ("Name", np.nan),
        ]:
            if col not in X.columns:
                X[col] = default

        # Embarked
        X["Embarked"] = X["Embarked"].fillna(self.embarked_mode_)

        # Title & grouped title
        X["Title"] = X["Name"].map(self._extract_title)
        X["TitleGroup"] = X["Title"].map(self._title_group).fillna("Rare")

        # Age imputation (Title+Pclass median)
        X['IsAgeMissing'] = X['Age'].isna().astype(int)
        X["AgeImputed"] = X["Age"]
        mask = X["AgeImputed"].isna()
        if mask.any():
            keys = list(zip(X.loc[mask, "Title"], X.loc[mask, "Pclass"]))
            fills = [self.age_median_by_title_pclass_.get(k, self.age_global_median_) for k in keys]
            X.loc[mask, "AgeImputed"] = fills

        # Family
        X["FamilySize"] = X["SibSp"].fillna(0) + X["Parch"].fillna(0) + 1
        X["IsAlone"] = (X["FamilySize"] == 1).astype(int)
        X["SmallFamily"] = X["FamilySize"].between(2, 4).astype(int)
        X["LargeFamily"] = (X["FamilySize"] >= 5).astype(int)

        # Cabin
        X["HasCabin"] = X["Cabin"].notna().astype(int)

        # Fare
        X["Fare"] = X["Fare"].fillna(self.fare_median_)
        X["IsFareZero"] = (X["Fare"] == 0).astype(int)
        X["FareLog"] = np.log1p(X["Fare"])
        X["FarePerPerson"] = X["Fare"] / X["FamilySize"].clip(lower=1)
        X["FarePerPersonLog"] = np.log1p(X["FarePerPerson"])

        # Ticket group size
        X["TicketGroupSize"] = X["Ticket"].map(self.ticket_counts_).fillna(1).astype(int)
        X["IsGroupTicket"] = (X["TicketGroupSize"] > 1).astype(int)

        # Rule features
        X["IsUpperClass"] = (X["Pclass"] <= self.upper_class_max).astype(int)
        X["IsFemale"] = (X["Sex"] == "female").astype(int)
        X["IsBoy"] = ((X["Sex"] == "male") & (X["AgeImputed"] <= self.boy_age_cutoff)).astype(int)
        X["IsUpperClassFemale"] = (X["IsUpperClass"] & X["IsFemale"]).astype(int)
        X["IsUpperClassBoy"] = (X["IsUpperClass"] & X["IsBoy"]).astype(int)
        X["IsLowerClassStrongFemale"] = ((X["IsUpperClass"] == 0) & (X["IsFemale"] == 1) & (X["SmallFamily"] == 1)).astype(int)

        return X

    def _safe_title_series(self, X):
        if "Name" not in X.columns:
            return pd.Series(["Unknown"] * len(X), index=X.index)
        return X["Name"].map(self._extract_title)

    @staticmethod
    def _extract_title(name):
        if pd.isna(name):
            return "Unknown"
        s = str(name)
        if "," in s and "." in s:
            return s.split(",")[1].split(".")[0].strip()
        return "Unknown"

    @staticmethod
    def _title_group(title):
        if title in ["Mr"]:
            return "Mr"
        if title in ["Mrs", "Mme"]:
            return "Mrs"
        if title in ["Miss", "Mlle", "Ms"]:
            return "Miss"
        if title in ["Master"]:
            return "Master"

        noble_female = {"Lady", "Countess", "Dona"}
        rare_male = {"Dr", "Rev", "Col", "Major", "Capt", "Sir", "Don", "Jonkheer"}

        if title in noble_female:
            return "noble_female"
        if title in rare_male:
            return "Rare_Male"
        return "Rare"


## 4) Preprocessing (OneHot + StandardScaler)
We use FE output columns. This is the most common and robust setup.


In [9]:
NUM_COLS = [
    "AgeImputed",
    "FareLog",
    "FarePerPersonLog",
    "FamilySize",
    "TicketGroupSize",
]

CAT_COLS = [
    "Pclass",
    "Sex",
    "IsAgeMissing",
    "Embarked",
    "TitleGroup",
    "HasCabin",
    "IsAlone",
    "SmallFamily",
    "LargeFamily",
    "IsUpperClassFemale",
    "IsUpperClassBoy",
    "IsFareZero",
    "IsGroupTicket",
    "IsLowerClassStrongFemale"
]

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), NUM_COLS),
        ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_COLS),
    ],
    remainder="drop"
)


## 5) Model (XGBoost)
Start with your stable config. We'll later try (depth=3) and/or early stopping.


In [17]:

from sklearn.base import clone
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

# ---------------- config ----------------
MCW_GRID = [i for i in range(1, 21)]
N_SPLITS = 5
N_REPEATS = 3
RANDOM_STATE = 42

# data (as you stated)
X_full = df_train.copy()
y_full = y_train.astype(int).to_numpy()

rskf = RepeatedStratifiedKFold(
    n_splits=N_SPLITS,
    n_repeats=N_REPEATS,
    random_state=RANDOM_STATE
)

rows = []


BASE_PARAMS = dict(
    max_depth=2,
    learning_rate=0.01,
    n_estimators=800,
    subsample=0.9,
    colsample_bytree=0.6,
    reg_alpha=0.3,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=1
)

# Build full X/y from your new interface
X_full = df_train.copy()
y_full = y_train.astype(int).to_numpy()

assert len(X_full) == len(y_full)
assert ID_COL in X_full.columns and ID_COL in df_test.columns


In [21]:
for mcw in MCW_GRID:
    xgb = XGBClassifier(
        min_child_weight=mcw,
        **BASE_PARAMS
    )

    pipe_template = Pipeline([
        ("fe", TitanicFE(boy_age_cutoff=10, upper_class_max=2)),
        ("prep", preprocess),
        ("model", xgb),
    ])

    oof_proba = np.zeros(len(X_full))
    oof_count = np.zeros(len(X_full), dtype=int)
    fold_acc = []

    for tr_idx, va_idx in rskf.split(X_full, y_full):
        model = clone(pipe_template)

        X_tr, X_va = X_full.iloc[tr_idx], X_full.iloc[va_idx]
        y_tr, y_va = y_full[tr_idx], y_full[va_idx]

        model.fit(X_tr, y_tr)

        va_proba = model.predict_proba(X_va)[:, 1]

        # accuracy at default threshold 0.5
        va_pred_05 = (va_proba >= 0.5).astype(int)
        fold_acc.append(accuracy_score(y_va, va_pred_05))

        oof_proba[va_idx] += va_proba
        oof_count[va_idx] += 1

    # finalize OOF
    oof_proba /= oof_count

    # threshold search on OOF
    ths = np.linspace(0.40, 0.55, 151)
    oof_accs = [
        accuracy_score(y_full, (oof_proba >= t).astype(int))
        for t in ths
    ]
    best_i = int(np.argmax(oof_accs))
    best_thr = float(ths[best_i])
    best_oof_acc = float(oof_accs[best_i])

    rows.append({
        "mcw": mcw,
        "cv_mean@0.5": np.mean(fold_acc),
        "cv_std@0.5":  np.std(fold_acc),
        "oof_best_thr": best_thr,
        "oof_best_acc": best_oof_acc,
        "oof_acc@0.45": accuracy_score(y_full, (oof_proba >= 0.45).astype(int)),
        "oof_acc@0.50": accuracy_score(y_full, (oof_proba >= 0.50).astype(int)),
    })

mcw_table = pd.DataFrame(rows).sort_values("cv_mean@0.5", ascending=False)
mcw_table

Unnamed: 0,mcw,cv_mean@0.5,cv_std@0.5,oof_best_thr,oof_best_acc,oof_acc@0.45,oof_acc@0.50
8,9,0.831277,0.014959,0.406,0.836139,0.832772,0.829405
6,7,0.828646,0.015132,0.419,0.838384,0.833895,0.828283
9,10,0.827542,0.015307,0.408,0.835017,0.829405,0.826038
7,8,0.826788,0.013222,0.4,0.836139,0.832772,0.82716
5,6,0.825276,0.014763,0.421,0.837262,0.835017,0.822671
11,12,0.82492,0.014179,0.433,0.832772,0.83165,0.820426
0,1,0.82491,0.012241,0.438,0.839506,0.835017,0.826038
1,2,0.824907,0.012273,0.435,0.838384,0.837262,0.823793
4,5,0.824158,0.011484,0.418,0.835017,0.832772,0.824916
10,11,0.823798,0.015383,0.443,0.835017,0.83165,0.826038


In [23]:
import numpy as np
import pandas as pd
import os, json
from datetime import datetime

from sklearn.base import clone
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# ----------------------------
# Config
# ----------------------------
BEST_MCW = 8
N_SPLITS = 5
N_REPEATS = 3
RANDOM_STATE = 42

OUT_DIR = "submissions"
os.makedirs(OUT_DIR, exist_ok=True)

# Your data interface (as you described)
X_full = df_train.copy()                      # already dropped TARGET_COL
y_full = y_train.astype(int).to_numpy()
X_test = df_test.copy()

assert len(X_full) == len(y_full)
assert ID_COL in X_full.columns and ID_COL in X_test.columns

# ----------------------------
# Build pipeline template (cloned each fold)
# ----------------------------
xgb = XGBClassifier(
    min_child_weight=BEST_MCW,
    **BASE_PARAMS
)

pipe_template = Pipeline(steps=[
    ("fe", TitanicFE(boy_age_cutoff=10, upper_class_max=2)),
    ("prep", preprocess),
    ("model", xgb),
])

# ----------------------------
# CV loop: OOF + test proba averaging
# ----------------------------
rskf = RepeatedStratifiedKFold(
    n_splits=N_SPLITS,
    n_repeats=N_REPEATS,
    random_state=RANDOM_STATE
)

oof_proba = np.zeros(len(X_full), dtype=float)
oof_count = np.zeros(len(X_full), dtype=int)

test_proba_sum = np.zeros(len(X_test), dtype=float)
fold_acc = []

for fold, (tr_idx, va_idx) in enumerate(rskf.split(X_full, y_full), start=1):
    model = clone(pipe_template)

    X_tr, X_va = X_full.iloc[tr_idx], X_full.iloc[va_idx]
    y_tr, y_va = y_full[tr_idx], y_full[va_idx]

    model.fit(X_tr, y_tr)

    va_proba = model.predict_proba(X_va)[:, 1]
    va_pred_05 = (va_proba >= 0.5).astype(int)
    fold_acc.append(accuracy_score(y_va, va_pred_05))

    oof_proba[va_idx] += va_proba
    oof_count[va_idx] += 1

    test_proba_sum += model.predict_proba(X_test)[:, 1]

# finalize oof + test
oof_proba /= np.maximum(oof_count, 1)
n_models = N_SPLITS * N_REPEATS
test_proba_avg = test_proba_sum / n_models

print("CV acc mean @0.5:", float(np.mean(fold_acc)))
print("CV acc std  @0.5:", float(np.std(fold_acc)))
print("Models trained:", n_models)

# ----------------------------
# Threshold choice
# Option A (recommended): fixed threshold from your table region
# ----------------------------
THRESHOLD = 0.60

# Option B: best threshold on OOF (slightly optimistic but ok)
# ths = np.linspace(0.35, 0.55, 201)
# accs = [accuracy_score(y_full, (oof_proba >= t).astype(int)) for t in ths]
# THRESHOLD = float(ths[int(np.argmax(accs))])
# print("Chosen OOF-best threshold:", THRESHOLD)

# ----------------------------
# Create submission
# ----------------------------
test_pred = (test_proba_avg >= THRESHOLD).astype(int)

submission = pd.DataFrame({
    ID_COL: X_test[ID_COL].values,
    TARGET_COL: test_pred
})

assert len(submission) == len(X_test)
assert submission[ID_COL].is_unique
assert submission[TARGET_COL].isin([0, 1]).all()

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_tag = f"xgb_fe_ageMissing_mcw{BEST_MCW}_rskf{N_SPLITS}x{N_REPEATS}_thr{THRESHOLD:.3f}"

csv_path = os.path.join(OUT_DIR, f"submission_{run_tag}_{timestamp}.csv")
meta_path = os.path.join(OUT_DIR, f"meta_{run_tag}_{timestamp}.json")

submission.to_csv(csv_path, index=False)

meta = {
    "timestamp": timestamp,
    "run_tag": run_tag,
    "mcw": BEST_MCW,
    "threshold": THRESHOLD,
    "cv_mean_at_0p5": float(np.mean(fold_acc)),
    "cv_std_at_0p5": float(np.std(fold_acc)),
    "base_params": BASE_PARAMS,
    "n_splits": N_SPLITS,
    "n_repeats": N_REPEATS,
    "n_models": n_models
}
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Saved submission:", csv_path)
print("Saved meta:", meta_path)
submission.head()


CV acc mean @0.5: 0.8267884418220242
CV acc std  @0.5: 0.013222072501671003
Models trained: 15
Saved submission: submissions\submission_xgb_fe_ageMissing_mcw8_rskf5x3_thr0.600_20251226_153330.csv
Saved meta: submissions\meta_xgb_fe_ageMissing_mcw8_rskf5x3_thr0.600_20251226_153330.json


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
