In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


In [2]:
# Read both competition splits from the working directory in one pass.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Quick sanity check on row/column counts.
print("Train:", train_df.shape)
print("Test:", test_df.shape)


Train: (891, 12)
Test: (418, 11)


In [3]:
def feature_engineering(df):
    """Return a copy of ``df`` with derived passenger features added.

    The input frame is not modified. It must contain the columns
    ``SibSp``, ``Parch``, ``Name``, ``Age``, ``Fare``, ``Cabin`` and
    ``Ticket``.

    Added columns:
        FamilySize    -- ``SibSp + Parch + 1`` (the passenger counts too).
        IsAlone       -- 1 when ``FamilySize`` is 1, else 0.
        Title         -- honorific parsed from ``Name``; uncommon titles are
                         folded into ``"Rare"`` and French/alternate forms
                         into ``"Miss"`` / ``"Mrs"``.
        AgeBucket     -- categorical age band; missing ages stay missing.
        FarePerPerson -- ``Fare / FamilySize``.
        Deck          -- first character of ``Cabin`` or ``"Unknown"``.

    Removed columns: ``Ticket``, ``Name``, ``Cabin``.
    """
    out = df.copy()

    # Family features
    out["FamilySize"] = out["SibSp"] + out["Parch"] + 1
    out["IsAlone"] = (out["FamilySize"] == 1).astype(int)

    # Title extraction: the first run of letters that follows a space and
    # ends with a period (e.g. "Braund, Mr. Owen" -> "Mr").
    title_aliases = {
        "Lady": "Rare", "Countess": "Rare", "Capt": "Rare",
        "Col": "Rare", "Don": "Rare", "Dr": "Rare",
        "Major": "Rare", "Rev": "Rare", "Sir": "Rare",
        "Jonkheer": "Rare", "Dona": "Rare",
        "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"
    }
    out["Title"] = (
        out["Name"]
        .str.extract(r" ([A-Za-z]+)\.", expand=False)
        .replace(title_aliases)
    )

    # Age buckets. The top bin is open-ended and the lowest edge is inclusive
    # so that an age of exactly 0 or above 80 is bucketed instead of silently
    # turning into NaN (which would later be imputed as the most common band).
    out["AgeBucket"] = pd.cut(
        out["Age"],
        bins=[0, 12, 18, 35, 60, float("inf")],
        labels=["Child", "Teen", "YoungAdult", "Adult", "Senior"],
        include_lowest=True,
    )

    # Fare normalization
    out["FarePerPerson"] = out["Fare"] / out["FamilySize"]

    # Deck
    out["Deck"] = out["Cabin"].str[0].fillna("Unknown")

    # Drop noisy columns (returns a new frame; no in-place mutation).
    return out.drop(columns=["Ticket", "Name", "Cabin"])


In [4]:
# Write the engineered frames to new names instead of overwriting the raw
# `train_df` / `test_df`. Rebinding the raw names made this cell fail with a
# KeyError on a second run, because feature_engineering() drops columns
# (e.g. "Name", "Cabin") that it also reads.
train_features = feature_engineering(train_df)
test_features = feature_engineering(test_df)

# Split the labelled data into predictors and target.
X = train_features.drop(columns=["Survived"])
y = train_features["Survived"]

# Unlabelled rows to score for the submission.
X_test = test_features.copy()


In [5]:
# Columns sent through the numeric branch of the preprocessor.
numeric_features = [
    # raw measurements
    "Age",
    "Fare",
    "SibSp",
    "Parch",
    # engineered
    "FamilySize",
    "IsAlone",
    "FarePerPerson",
]

# Columns sent through the categorical (one-hot) branch of the preprocessor.
categorical_features = [
    # raw attributes
    "Sex",
    "Pclass",
    "Embarked",
    # engineered
    "Title",
    "AgeBucket",
    "Deck",
]


In [6]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the modal value, then one-hot encode.
# Categories not seen during fit are ignored rather than raising at predict time.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column list to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)


In [7]:
# One seed for every stochastic estimator so that a Restart & Run All
# reproduces the same comparison.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in the comparison report.
MODEL_ZOO = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        max_depth=6,
        random_state=RANDOM_STATE
    ),
    # Previously unseeded, so results could differ between runs.
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    # probability=True makes SVC shuffle data internally for its probability
    # calibration, so it needs a seed as well.
    "SVM": SVC(probability=True, random_state=RANDOM_STATE)
}


In [8]:
def auto_select_best_model():
    """Compare every model in ``MODEL_ZOO`` on a hold-out split and return the best.

    Reads the module-level ``X``, ``y``, ``preprocessor`` and ``MODEL_ZOO``.
    The data is split 80/20 (stratified on ``y``, ``random_state=42``); each
    candidate is wrapped in a preprocessing + model pipeline, fitted on the
    training part and scored by accuracy on the validation part. A short
    report is printed per model and for the winner.

    Each candidate gets its own clone of the preprocessor and of the
    estimator, so fitting one pipeline does not touch another pipeline or the
    templates held in ``preprocessor`` / ``MODEL_ZOO``.

    Returns:
        The pipeline with the highest validation accuracy, fitted on the
        training part of the split only. On ties the earlier entry in
        ``MODEL_ZOO`` wins. ``None`` if ``MODEL_ZOO`` is empty.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    best_pipeline = None
    best_acc = 0

    for name, model in MODEL_ZOO.items():
        # clone() gives unfitted copies with identical hyper-parameters.
        pipeline = Pipeline([
            ("preprocessing", clone(preprocessor)),
            ("model", clone(model))
        ])

        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_val)

        acc = accuracy_score(y_val, preds)

        print(f"{name}")
        print(f"Accuracy: {acc:.4f}")
        print(f"Accuracy (%): {acc*100:.2f}%\n")

        # Accept the first candidate unconditionally so that a model is still
        # returned when every accuracy is 0.0; afterwards require a strict
        # improvement.
        if best_pipeline is None or acc > best_acc:
            best_acc = acc
            best_pipeline = pipeline

    print("Best Model Selected")
    print(f"Accuracy: {best_acc:.4f}")
    print(f"Accuracy (%): {best_acc*100:.2f}%")

    return best_pipeline


In [9]:
# Choose the winner on the hold-out split, then refit it on every labelled row
# (fit() returns the pipeline itself, so the name ends up bound to the same object).
best_pipeline = auto_select_best_model().fit(X, y)

# Trailing expression renders the fitted pipeline.
best_pipeline


LogisticRegression
Accuracy: 0.8436
Accuracy (%): 84.36%

RandomForest
Accuracy: 0.8268
Accuracy (%): 82.68%

GradientBoosting
Accuracy: 0.8324
Accuracy (%): 83.24%

SVM
Accuracy: 0.8380
Accuracy (%): 83.80%

Best Model Selected
Accuracy: 0.8436
Accuracy (%): 84.36%


0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [10]:
# Score the unlabelled passengers with the refitted pipeline.
test_predictions = best_pipeline.predict(X_test)
print("Predictions:", len(test_predictions))

# Two-column submission frame: passenger id plus the integer survival label.
submission = test_df[["PassengerId"]].assign(
    Survived=test_predictions.astype(int)
)
submission.to_csv("submission_feature_engg.csv", index=False)

# Preview the first rows of the file just written.
submission.head()


Predictions: 418


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
