In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score

In [16]:
from joblib import dump

In [2]:
# Load the training table from CSV; the path is relative to the notebook's
# working directory.
train = pd.read_csv('titanic_input/train.csv')

In [3]:
# Remove the identifier column. `errors='ignore'` keeps this cell re-runnable:
# `train` is overwritten here, so on a second execution 'PassengerId' is
# already gone and a plain drop would raise KeyError.
train = train.drop(columns=['PassengerId'], errors='ignore')

# Separate the features from the target column.
X = train.drop(columns='Survived')
y = train['Survived']

In [4]:
class TitanicPreprocessor(BaseEstimator, TransformerMixin):
    """Turn raw Titanic passenger columns into numeric model features.

    ``fit`` learns the medians used for imputation; ``transform`` applies
    them and encodes/bins the columns.  ``transform`` reads ``Name``, ``Sex``,
    ``Age``, ``SibSp``, ``Parch``, ``Ticket``, ``Fare``, ``Cabin``, ``Pclass``
    and ``Embarked``; it appends ``Title`` and ``FamilySize`` and drops
    ``Name``, ``SibSp``, ``Parch``, ``Ticket`` and ``Embarked``.  Any other
    column passes through untouched.

    Fitted attributes
    -----------------
    title_mapping_ : dict
        Honorific -> integer code (0-3 for the common titles, 4 for the rest).
    age_medians_ : pandas.Series
        Median ``Age`` per encoded ``Title``.
    fare_medians_ : pandas.Series
        Median ``Fare`` per ``Pclass``.
    cabin_medians_ : pandas.Series
        Median numeric deck code per ``Pclass``.
    age_fallback_, fare_fallback_, cabin_fallback_ : float
        Overall medians, used when a row's group was not seen during ``fit``
        or the group's median is itself missing.
    """

    # Captures the word that directly precedes a period in ``Name``.
    _TITLE_PATTERN = r' ([A-Za-z]+)\.'
    # Code that ``title_mapping_`` assigns to every uncommon honorific; also
    # used for honorifics that are absent from the mapping.
    _OTHER_TITLE = 4
    _SEX_CODES = {"male": 0, "female": 1}
    # First letter of the cabin string -> numeric code.
    _DECK_CODES = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8}
    # Right-closed bin edges: (-inf, 18] -> 0, (18, 27] -> 1, ... (57, inf) -> 4.
    _AGE_EDGES = [-np.inf, 18, 27, 35, 57, np.inf]
    # Right-closed bin edges: (-inf, 29] -> 0, (29, 70] -> 1, ... (110, inf) -> 3.
    _FARE_EDGES = [-np.inf, 29, 70, 110, np.inf]
    _DROPPED_COLUMNS = ['Name', 'SibSp', 'Parch', 'Ticket', 'Embarked']

    def fit(self, X, y=None):
        """Learn the imputation medians from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``Name``, ``Age``, ``Pclass``, ``Fare`` and ``Cabin``.
        y : ignored
            Accepted so the transformer can be used inside a ``Pipeline``.

        Returns
        -------
        self
        """
        df = X.copy()

        self.title_mapping_ = {
            "Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3,
            "Dr": 4, "Rev": 4, "Col": 4, "Major": 4, "Mlle": 4, "Countess": 4,
            "Ms": 4, "Lady": 4, "Jonkheer": 4, "Don": 4, "Dona": 4, "Mme": 4,
            "Capt": 4, "Sir": 4
        }
        df['Title'] = self._encode_title(df['Name'])

        # Median Age per Title.
        self.age_medians_ = df.groupby("Title")["Age"].median()

        # Median Fare per Pclass.
        self.fare_medians_ = df.groupby("Pclass")["Fare"].median()

        # Median numeric deck code per Pclass (after mapping the cabin letter).
        df['Cabin'] = self._encode_cabin(df['Cabin'])
        self.cabin_medians_ = df.groupby("Pclass")["Cabin"].median()

        # Overall medians: last-resort fill values for groups without a
        # usable median.
        self.age_fallback_ = df["Age"].median()
        self.fare_fallback_ = df["Fare"].median()
        self.cabin_fallback_ = df["Cabin"].median()

        return self

    def transform(self, X):
        """Return a transformed copy of ``X``; ``X`` itself is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw passenger table with the columns listed in the class docstring.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with encoded ``Sex``, imputed and binned ``Age`` and
            ``Fare``, imputed numeric ``Cabin``, the new ``Title`` and
            ``FamilySize`` columns, and the raw text columns removed.

        Raises
        ------
        KeyError
            If a required column is missing from ``X``.
        """
        df = X.copy()

        # Title is appended before FamilySize so the output column order stays
        # Title, FamilySize (the downstream model sees features positionally).
        df['Title'] = self._encode_title(df['Name'])
        df['Sex'] = df['Sex'].map(self._SEX_CODES)

        # getattr: instances fitted before the fallback attributes existed
        # simply get no fallback fill.
        df['Age'] = self._fill_from_groups(
            df['Age'], df['Title'], self.age_medians_,
            getattr(self, 'age_fallback_', np.nan),
        )
        df['Age'] = self._to_bin_codes(df['Age'], self._AGE_EDGES)

        df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

        df['Fare'] = self._fill_from_groups(
            df['Fare'], df['Pclass'], self.fare_medians_,
            getattr(self, 'fare_fallback_', np.nan),
        )
        df['Fare'] = self._to_bin_codes(df['Fare'], self._FARE_EDGES)

        df['Cabin'] = self._fill_from_groups(
            self._encode_cabin(df['Cabin']), df['Pclass'], self.cabin_medians_,
            getattr(self, 'cabin_fallback_', np.nan),
        )

        return df.drop(columns=self._DROPPED_COLUMNS)

    def _encode_title(self, names):
        """Extract the honorific from each name and map it to its integer code.

        Names without a recognisable honorific, and honorifics missing from
        ``title_mapping_``, get ``_OTHER_TITLE`` instead of NaN so that no
        missing value reaches the model or blocks Age imputation.
        """
        titles = names.str.extract(self._TITLE_PATTERN, expand=False)
        codes = titles.map(self.title_mapping_).fillna(self._OTHER_TITLE)
        return codes.astype('int64')

    def _encode_cabin(self, cabins):
        """Map each cabin string to the numeric code of its first letter.

        Missing, empty or non-string cabins, and letters outside
        ``_DECK_CODES``, yield NaN.
        """
        letters = cabins.apply(
            lambda cabin: cabin[0] if isinstance(cabin, str) and cabin else np.nan
        )
        return letters.map(self._DECK_CODES)

    @staticmethod
    def _fill_from_groups(values, groups, group_medians, fallback):
        """Fill missing ``values`` with the median of the row's group.

        Rows still missing afterwards (group unseen during fit, or its median
        is NaN) are filled with ``fallback`` when that is not NaN.
        """
        filled = values.copy()
        for key, median in group_medians.items():
            if pd.isnull(median):
                continue
            mask = (groups == key) & filled.isnull()
            if mask.any():
                filled.loc[mask] = median
        if pd.notnull(fallback):
            filled = filled.fillna(fallback)
        return filled

    @staticmethod
    def _to_bin_codes(values, edges):
        """Replace each value by the 0-based index of its right-closed bin.

        Missing values stay missing; the result is float, matching the dtype
        of the float column it replaces.
        """
        return pd.cut(values, bins=edges, labels=False).astype(float)


In [5]:
# Two-stage pipeline: feature engineering first, then a seeded random forest.
# The step names are the prefixes used by the `model__*` keys of the grid.
pipeline = Pipeline([
    ("preprocess", TitanicPreprocessor()),
    ("model", RF(random_state=42)),
])

In [6]:
# Hyperparameter grid for the "model" pipeline step (2**5 = 32 combinations).
param_grid = dict(
    model__n_estimators=[100, 150],
    model__max_depth=[None, 10],
    model__min_samples_split=[5, 6],
    model__min_samples_leaf=[1, 2],
    model__max_features=['sqrt', 'log2'],
)


In [7]:
# Hold out 20% of the rows for the final evaluation; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Cross-validation setup for the grid search: accuracy, 11 shuffled folds.
scoring = 'accuracy'
k_fold = KFold(
    n_splits=11,
    shuffle=True,
    random_state=0,
)

In [9]:
# Exhaustive search over `param_grid`, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,
    cv=k_fold,
    n_jobs=-1,
)

In [10]:
# Run the search on the training split only; the held-out test split is not
# used here.
grid_search.fit(X_train, y_train)

In [11]:
# Parameter combination selected by the grid search.
print("✅ Best parameters:", grid_search.best_params_)

✅ Best parameters: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 100}


In [12]:
# Cross-validated score (scoring='accuracy') of the selected combination.
print("✅ Best score:", grid_search.best_score_)

✅ Best score: 0.8244318181818181


In [13]:
# Evaluate the tuned pipeline on the held-out test split.
# The estimator is a random-forest pipeline, not an SVM, hence `best_model`;
# `best_svm` is kept as an alias so code using the old name keeps working.
best_model = grid_search.best_estimator_
best_svm = best_model
y_pred_best = best_model.predict(X_test)
# Accuracy is reported as a percentage.
print("Final Accuracy:", accuracy_score(y_test, y_pred_best)*100)

Final Accuracy: 86.59217877094973


In [18]:
# Persist the tuned pipeline (preprocessing + model) to disk with joblib.
# NOTE(review): loading this file presumably requires TitanicPreprocessor to
# be defined/importable in the loading process — confirm before deploying.
dump(grid_search.best_estimator_, "titanic_pipeline_grid_best.joblib")

['titanic_pipeline_grid_best.joblib']