In [1]:
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC


In [2]:
# Read the labelled training file and the file to predict on.
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Target column on one side, every other training column on the other.
y = train_df["Survived"]
X = train_df.drop(columns="Survived")

# Work on a copy so the loaded test frame itself is left untouched.
X_test = test_df.copy()


In [3]:
# Columns routed to the numeric branch of the preprocessor.
numeric_features = [
    "Age",
    "Fare",
    "SibSp",
    "Parch",
]

# Columns routed to the categorical branch of the preprocessor.
categorical_features = [
    "Sex",
    "Pclass",
    "Embarked",
]

In [4]:
# Categorical branch: fill missing entries with the most frequent value, then
# one-hot encode (handle_unknown="ignore" keeps transform from raising on
# categories that were not present at fit time).
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric branch: fill missing entries with the column median, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group through its branch; the numeric block comes first
# in the combined output.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

In [5]:
# Candidate classifiers compared by auto_select_best_model, keyed by the label
# used in the printed report.
# Every estimator that draws random numbers is seeded so that a fresh
# Restart & Run All reproduces the same scores and the same winner.
MODEL_ZOO = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=300, max_depth=6, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    # probability=True fits an internal cross-validated calibration, which
    # shuffles the data; random_state pins that shuffle.
    "SVM": SVC(probability=True, random_state=42)
}

In [6]:
def auto_select_best_model(test_size=0.2, random_state=42):
    """Score every candidate in ``MODEL_ZOO`` on a hold-out split and return the best.

    The notebook-level ``X`` / ``y`` are split once into stratified training
    and validation parts. Each candidate is combined with a copy of the
    notebook-level ``preprocessor`` in a pipeline, fitted on the training
    part and scored by accuracy on the validation part. One line is printed
    per candidate, followed by the best accuracy.

    Parameters
    ----------
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split`` as the size of the validation part.
    random_state : int, default=42
        Forwarded to ``train_test_split`` to make the split repeatable.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline with the highest validation accuracy, fitted on the
        training part only. On ties the earlier ``MODEL_ZOO`` entry wins.

    Raises
    ------
    ValueError
        If ``MODEL_ZOO`` contains no candidates.
    """
    if not MODEL_ZOO:
        raise ValueError("MODEL_ZOO is empty; there is no model to select")

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    best_model = None
    # Start below any reachable accuracy so a pipeline is always selected,
    # even when every candidate scores exactly 0.
    best_acc = float("-inf")

    for name, model in MODEL_ZOO.items():
        # Pipeline.fit fits its steps in place. Cloning gives every candidate
        # its own preprocessor and estimator, so the returned pipeline shares
        # no fitted state with the other candidates or with MODEL_ZOO.
        pipeline = Pipeline([
            ("preprocessing", clone(preprocessor)),
            ("model", clone(model))
        ])

        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_val)

        acc = accuracy_score(y_val, preds)

        print(f"{name} Accuracy: {acc:.4f} ({acc*100:.2f}%)")

        # Strict comparison: the first candidate to reach a score keeps it.
        if acc > best_acc:
            best_acc = acc
            best_model = pipeline

    print("\nBest Model Selected")
    print(f"Accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")

    return best_model


In [7]:
# Compare the candidates on the hold-out split; the returned pipeline is the
# one with the highest validation accuracy, fitted on the training part only.
best_pipeline = auto_select_best_model()
# Refit the chosen pipeline on all labelled rows before predicting.
# fit() is the cell's last expression, so its return value is displayed.
best_pipeline.fit(X, y)


LogisticRegression Accuracy: 0.8045 (80.45%)
RandomForest Accuracy: 0.7989 (79.89%)
GradientBoosting Accuracy: 0.7933 (79.33%)
SVM Accuracy: 0.8156 (81.56%)

Best Model Selected
Accuracy: 0.8156 (81.56%)


0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [8]:
# Predict a label for every row of the test frame.
preds = best_pipeline.predict(X_test)

# Two-column frame: the passenger id next to its predicted label as an integer.
submission = test_df[["PassengerId"]].assign(Survived=preds.astype(int))

# Write without the index so the file holds only the two columns.
submission.to_csv("submission_auto_best.csv", index=False)

print("Saved submission_auto_best.csv")


Saved submission_auto_best.csv
