In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


In [3]:
def fe_basic(df):
    df = df.copy()

    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
    df["Embarked"] = df["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})

    df["Age"] = df["Age"].fillna(df["Age"].median())
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    df["Title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
    df["Title"] = df["Title"].replace(
        ["Lady","Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"],
        "Rare"
    )
    df["Title"] = df["Title"].replace(["Mlle","Ms"],"Miss")
    df["Title"] = df["Title"].replace("Mme","Mrs")

    le = LabelEncoder()
    df["Title"] = le.fit_transform(df["Title"].astype(str))

    df.drop(columns=["Name","Ticket","Cabin"], inplace=True)

    return df


In [4]:
train_fe = fe_basic(train)
test_fe  = fe_basic(test)

X = train_fe.drop(columns=["Survived","PassengerId"])
y = train_fe["Survived"]
X_test = test_fe.drop(columns=["PassengerId"])


In [5]:
models = {
    "rf": RandomForestClassifier(
        n_estimators=700,
        max_depth=5,
        min_samples_leaf=2,
        random_state=42
    ),

    "gb": GradientBoostingClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=3
    ),

    "lr": LogisticRegression(max_iter=1000)
}


In [6]:
def get_oof(model, X, y, X_test):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    oof = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    for train_idx, val_idx in skf.split(X, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_tr, y_tr)

        oof[val_idx] = model.predict_proba(X_val)[:,1]
        test_preds += model.predict_proba(X_test)[:,1] / 5

    return oof, test_preds


In [7]:
oof_preds = []
test_preds = []

for name, model in models.items():
    oof, test_p = get_oof(model, X, y, X_test)
    print(f"{name} OOF accuracy:", accuracy_score(y, (oof >= 0.5).astype(int)))
    oof_preds.append(oof)
    test_preds.append(test_p)

oof_stack = np.vstack(oof_preds).T
test_stack = np.vstack(test_preds).T


rf OOF accuracy: 0.8316498316498316
gb OOF accuracy: 0.8451178451178452
lr OOF accuracy: 0.8047138047138047


In [8]:
meta = LogisticRegression()
meta.fit(oof_stack, y)

meta_oof = meta.predict_proba(oof_stack)[:,1]
print("STACK OOF accuracy:", accuracy_score(y, (meta_oof >= 0.5).astype(int)))


STACK OOF accuracy: 0.8484848484848485


In [9]:
best_t, best_acc = 0.5, 0
for t in np.linspace(0.45, 0.55, 41):
    acc = accuracy_score(y, (meta_oof >= t).astype(int))
    if acc > best_acc:
        best_acc, best_t = acc, t

print("Best threshold:", best_t, "Accuracy:", best_acc)


Best threshold: 0.49750000000000005 Accuracy: 0.8484848484848485


In [10]:
final_test_pred = (meta.predict_proba(test_stack)[:,1] >= best_t).astype(int)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": final_test_pred
})

submission.to_csv("submission_metalearn.csv", index=False)
