In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC


In [2]:
# Read the two input CSV files; the paths are relative to the working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")
test_df = pd.read_csv(filepath_or_buffer="test.csv")


In [3]:
def feature_engineering(df):
    """Return a new frame with family and title features added.

    The input frame is left unmodified. The result gains three columns:

    * ``FamilySize`` -- ``SibSp + Parch + 1``.
    * ``IsAlone`` -- 1 where ``FamilySize`` equals 1, otherwise 0.
    * ``Title`` -- the first run of ASCII letters in ``Name`` that follows a
      space and is immediately followed by a period. Uncommon titles are
      merged into ``"Rare"``; ``Mlle``/``Ms`` become ``"Miss"`` and ``Mme``
      becomes ``"Mrs"``.

    The ``Ticket``, ``Name`` and ``Cabin`` columns are dropped from the result.
    """
    rare_titles = (
        "Lady", "Countess", "Capt", "Col", "Don", "Dr",
        "Major", "Rev", "Sir", "Jonkheer", "Dona",
    )
    title_map = {title: "Rare" for title in rare_titles}
    title_map.update({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})

    family_size = df["SibSp"] + df["Parch"] + 1
    raw_title = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

    # assign() builds a new frame, so the caller's frame is never touched.
    enriched = df.assign(
        FamilySize=family_size,
        IsAlone=(family_size == 1).astype(int),
        Title=raw_title.replace(title_map),
    )
    return enriched.drop(columns=["Ticket", "Name", "Cabin"])


In [4]:
# Run the same feature engineering on both frames (this rebinds both names).
train_df, test_df = feature_engineering(train_df), feature_engineering(test_df)

# Split the training frame into target and predictors; the test frame is
# copied as-is.
y = train_df["Survived"]
X = train_df.drop(columns="Survived")
X_test = test_df.copy()


In [5]:
# Columns handled by the "num" branch: median imputation, then scaling.
numeric_features = ["Age", "Fare", "SibSp", "Parch", "FamilySize", "IsAlone"]

# Columns handled by the "cat" branch: most-frequent imputation, then one-hot.
categorical_features = ["Sex", "Pclass", "Embarked", "Title"]

# Columns not listed in either group are not passed to any transformer here.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]),
            numeric_features,
        ),
        (
            "cat",
            Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # handle_unknown="ignore": categories unseen at fit time do
                # not raise when transforming new data.
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]),
            categorical_features,
        ),
    ]
)


In [6]:
# Ensemble members as (name, estimator, weight) tuples. The weight scales each
# model's predicted probability in the weighted average computed by
# ensemble_kfold_predict.
# Every estimator that accepts a seed is given one so that a fresh
# Restart & Run All reproduces the same predictions; for SVC the seed governs
# the data shuffling used to produce probability estimates (probability=True).
models = [
    ("lr", LogisticRegression(max_iter=1000), 1),
    ("rf", RandomForestClassifier(n_estimators=500, max_depth=6, random_state=42), 3),
    ("gb", GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, random_state=42), 3),
    ("svm", SVC(probability=True, random_state=42), 2),
]


In [7]:
def ensemble_kfold_predict(X, y, X_test, n_splits=5):
    """Predict 0/1 labels for ``X_test`` with a weighted, fold-averaged ensemble.

    For every stratified fold, each ``(name, estimator, weight)`` entry of the
    module-level ``models`` list is combined with the module-level
    ``preprocessor`` into a pipeline, fitted on that fold's training rows and
    used to score ``X_test``. Within a fold the per-model probabilities are
    combined as a weighted average; the fold averages are then averaged over
    all folds and thresholded at 0.5.

    The held-out rows of each fold are not scored: the folds only vary which
    rows each model is trained on.

    Parameters
    ----------
    X : positionally indexable (``.iloc``) feature table used for training.
    y : positionally indexable (``.iloc``) target aligned with ``X``.
    X_test : feature table to predict; must support ``len``.
    n_splits : number of stratified folds (default 5).

    Returns
    -------
    numpy.ndarray
        Integer array of length ``len(X_test)`` holding 0 or 1.

    Raises
    ------
    ValueError
        If ``models`` is empty or its weights sum to zero, since the weighted
        average would be undefined.
    """
    if not models:
        raise ValueError(
            "`models` must contain at least one (name, estimator, weight) entry"
        )

    # The weight total is identical for every fold, so compute it once.
    total_weight = sum(weight for _, _, weight in models)
    if total_weight == 0:
        raise ValueError("model weights must not sum to zero")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    test_preds = np.zeros(len(X_test))

    # Only the training indices are needed; the validation indices are unused.
    for train_idx, _ in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]

        fold_preds = np.zeros(len(X_test))
        for _name, model, weight in models:
            # Clone so each fit works on fresh, unfitted copies and the shared
            # module-level preprocessor/estimators are never mutated.
            pipeline = clone(Pipeline([
                ("prep", preprocessor),
                ("model", model)
            ]))
            pipeline.fit(X_train, y_train)

            # Column 1 of predict_proba is the second class in classes_
            # (presumably the positive label 1 -- assumes y is encoded 0/1).
            fold_preds += weight * pipeline.predict_proba(X_test)[:, 1]

        test_preds += fold_preds / total_weight

    return (test_preds / n_splits >= 0.5).astype(int)


In [8]:
# Fit the ensemble and obtain the 0/1 predictions for the test rows.
final_preds = ensemble_kfold_predict(X, y, X_test)

# Pair every test PassengerId with its predicted label, then write the file
# without the index column.
submission = test_df[["PassengerId"]].assign(Survived=final_preds)
submission.to_csv("submission_ensmbl_feature_engg.csv", index=False)
