In [8]:
import numpy as np
import pandas as pd
import os

# Process-wide environment tweaks, applied before the ML libraries are imported:
#   PYTHONIOENCODING   -> "utf-8"
#   JOBLIB_TEMP_FOLDER -> a fixed folder on the D: drive
# NOTE(review): the joblib folder is a machine-specific absolute Windows path;
# confirm it exists (or adjust it) before running this notebook elsewhere.
os.environ.update(
    {
        "PYTHONIOENCODING": "utf-8",
        "JOBLIB_TEMP_FOLDER": r"D:\joblib_tmp",
    }
)

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from joblib import dump


from xgboost import XGBClassifier

In [9]:
# Locations of the two input CSV files, relative to the notebook's working directory.
train_path, test_path = "../data/train.csv", "../data/test.csv"

# Read both files into DataFrames (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [10]:
# Add 0/1 indicator columns recording where "Age" / "Embarked" are missing.
# The columns are written onto train_df and test_df themselves, so both frames
# carry the flags from this cell onwards.
for df in (train_df, test_df):
    for flag_name, source_col in (
        ("Age_is_missing", "Age"),
        ("Embarked_is_missing", "Embarked"),
    ):
        df[flag_name] = df[source_col].isna().astype(int)

# Columns fed to the model: the raw inputs followed by the two missingness flags.
feature_cols = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Age", "Embarked"] + [
    "Age_is_missing",
    "Embarked_is_missing",
]

# Independent copies, so later edits to X / y / X_test never touch the source frames.
X = train_df.loc[:, feature_cols].copy()
y = train_df.loc[:, "Survived"].astype(int).copy()
X_test = test_df.loc[:, feature_cols].copy()

In [11]:
# Columns routed to the numeric branch (median imputation + standardisation).
numeric_features = ["Age", "Fare", "SibSp", "Parch"]

# Columns routed to the categorical branch (mode imputation + one-hot encoding).
# The two *_is_missing flags and Pclass are one-hot encoded as categories here,
# not scaled as numbers.
categorical_features = ["Pclass", "Sex", "Embarked"] + [
    "Age_is_missing",
    "Embarked_is_missing",
]

numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": categories unseen during fit do not raise at transform time.
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own column list; "num" output comes first, then "cat".
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [18]:
# Grid search over tree complexity (max_depth x min_child_weight) with 5-fold
# stratified CV on the training split. Every configuration is scored with
#     custom_score = mean_valid - alpha_gap * (mean_train - mean_valid) - beta_std * std_valid
# and the table is written, best score first, to ../submissions/xgb_strength.csv.
RANDOM_STATE = 42

# train_test_split is not part of the notebook's top import cell, so it is
# imported here; StratifiedKFold / cross_validate are simply re-imported.
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split

# Split train / holdout ONCE (holdout will be used later)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Fixed baseline hyperparameters (except complexity)
BASE_PARAMS = dict(
    n_estimators=500,
    learning_rate=0.02,      # small & safe; LR tuning comes later
    subsample=0.9,
    colsample_bytree=0.6,
    reg_alpha=0.3,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=1
)

max_depth_grid = [1, 2, 3, 4]
min_child_weight_grid = [1, 3, 6, 9, 12, 15, 18, 21]

# Penalty weights used in custom_score. They are the same for every
# configuration, so they are set once here instead of on every iteration.
alpha_gap = 1.0   # weight of the train/validation accuracy gap
beta_std = 1.0    # weight of the fold-to-fold validation spread

search_results = []

for md in max_depth_grid:
    for mcw in min_child_weight_grid:
        xgb = XGBClassifier(
            max_depth=md,
            min_child_weight=mcw,
            **BASE_PARAMS,
        )

        # Preprocessing lives inside the pipeline, so it is refit on each CV
        # training fold rather than on the full training split.
        pipe = Pipeline(
            steps=[
                ("preprocess", preprocessor),
                ("model", xgb),
            ]
        )

        cv_results = cross_validate(
            pipe,
            X_train,
            y_train,
            cv=cv,
            scoring="accuracy",
            return_train_score=True,
            n_jobs=1,
        )

        mean_train = cv_results["train_score"].mean()
        std_train = cv_results["train_score"].std()
        mean_valid = cv_results["test_score"].mean()
        std_valid = cv_results["test_score"].std()

        # Reward validation accuracy; penalise overfitting and unstable folds.
        overfit_gap = mean_train - mean_valid
        custom_score = mean_valid - alpha_gap * overfit_gap - beta_std * std_valid

        search_results.append(
            dict(
                max_depth=md,
                min_child_weight=mcw,
                mean_train=mean_train,
                std_train=std_train,
                mean_valid=mean_valid,
                std_valid=std_valid,
                overfit_gap=overfit_gap,
                custom_score=custom_score,
            )
        )

# Turn into DataFrame and sort: best custom_score first. The sort is stable,
# so tied configurations keep their grid order, and the original row index is
# kept (and written to the CSV) so the grid position stays recoverable.
results_df = pd.DataFrame(search_results).sort_values(
    "custom_score", ascending=False, kind="mergesort"
)

results_df.to_csv("../submissions/xgb_strength.csv")


In [None]:
# Grid search over learning_rate x min_child_weight at a fixed max_depth,
# using the same CV protocol and custom_score as the complexity search.
# Relies on RANDOM_STATE, cv, X_train, y_train and preprocessor defined in
# earlier cells; results go, best score first, to ../submissions/xgb_cv_acc_lr.csv.
# NOTE: this rebinds BASE_PARAMS (max_depth is now fixed, learning_rate is no
# longer part of it) — presumably max_depth=2 was picked from the previous
# search; confirm against xgb_strength.csv.
BASE_PARAMS = dict(
    n_estimators=500,
    max_depth=2,
    subsample=0.9,
    colsample_bytree=0.6,
    reg_alpha=0.3,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=1,
)


learning_rate = [0.005, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1]
min_child_weight_grid = [6, 12, 18]

# Penalty weights used in custom_score. They are the same for every
# configuration, so they are set once here instead of on every iteration.
alpha_gap = 1.0   # weight of the train/validation accuracy gap
beta_std = 1.0    # weight of the fold-to-fold validation spread

search_results = []


for lr in learning_rate:
    for mcw in min_child_weight_grid:
        xgb = XGBClassifier(
            learning_rate=lr,
            min_child_weight=mcw,
            **BASE_PARAMS,
        )

        # Preprocessing lives inside the pipeline, so it is refit on each CV
        # training fold rather than on the full training split.
        pipe = Pipeline(
            steps=[
                ("preprocess", preprocessor),
                ("model", xgb),
            ]
        )

        cv_results = cross_validate(
            pipe,
            X_train,
            y_train,
            cv=cv,
            scoring="accuracy",
            return_train_score=True,
            n_jobs=1,
        )

        mean_train = cv_results["train_score"].mean()
        std_train = cv_results["train_score"].std()
        mean_valid = cv_results["test_score"].mean()
        std_valid = cv_results["test_score"].std()

        # Reward validation accuracy; penalise overfitting and unstable folds.
        overfit_gap = mean_train - mean_valid
        custom_score = mean_valid - alpha_gap * overfit_gap - beta_std * std_valid

        search_results.append(
            dict(
                learning_rate=lr,
                min_child_weight=mcw,
                mean_train=mean_train,
                std_train=std_train,
                mean_valid=mean_valid,
                std_valid=std_valid,
                overfit_gap=overfit_gap,
                custom_score=custom_score,
            )
        )

# Turn into DataFrame and sort: best custom_score first. The sort is stable,
# so tied configurations keep their grid order, and the original row index is
# kept (and written to the CSV) so the grid position stays recoverable.
results_df = pd.DataFrame(search_results).sort_values(
    "custom_score", ascending=False, kind="mergesort"
)

results_df.to_csv("../submissions/xgb_cv_acc_lr.csv")