In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, GridSearchCV

# Load the train and test CSVs, discarding the Name, Cabin and Ticket
# columns from each frame as soon as it is read.
train_data = pd.read_csv("data/train.csv").drop(columns=['Name', 'Cabin', 'Ticket'])
test_data = pd.read_csv("data/test.csv").drop(columns=['Name', 'Cabin', 'Ticket'])

# Impute missing values, frame by frame.
#
# Age: a missing age is replaced by the mean age of the passengers of the
# same sex *within the same frame*. The boolean masks must therefore be built
# from the frame they are used to index: `.loc` aligns a boolean Series by
# index label, so a mask derived from train_data would select test rows
# according to the sex of an unrelated training passenger.
#
# Embarked: a missing value is replaced by 'S'.
con = train_data['Sex'] == 'female'
con2 = train_data['Sex'] == 'male'

train_data.loc[con, 'Age'] = train_data.loc[con, 'Age'].fillna(train_data.loc[con, 'Age'].mean())
train_data.loc[con2, 'Age'] = train_data.loc[con2, 'Age'].fillna(train_data.loc[con2, 'Age'].mean())
train_data['Embarked'] = train_data['Embarked'].fillna('S')

# Masks for the test frame, computed from the test frame's own Sex column.
test_con = test_data['Sex'] == 'female'
test_con2 = test_data['Sex'] == 'male'

test_data.loc[test_con, 'Age'] = test_data.loc[test_con, 'Age'].fillna(test_data.loc[test_con, 'Age'].mean())
test_data.loc[test_con2, 'Age'] = test_data.loc[test_con2, 'Age'].fillna(test_data.loc[test_con2, 'Age'].mean())
test_data['Embarked'] = test_data['Embarked'].fillna('S')

# Target column and the feature columns fed to the model.
train_target = train_data["Survived"]
features = ['Age', 'Fare', 'Pclass', 'Sex', 'SibSp']

# One-hot encode the categorical feature columns of the training frame.
train_input = pd.get_dummies(train_data[features])

# get_dummies only emits indicator columns for the categories present in the
# frame it is given, so the test frame is encoded and then forced onto the
# training column layout: indicator columns missing from the test frame are
# added (filled with 0), columns unseen during training are dropped, and the
# column order matches what the model is fitted on.
test_input = pd.get_dummies(test_data[features]).reindex(
    columns=train_input.columns, fill_value=0
)

# Hyperparameter tuning (grid search).
#
# random_state is pinned to a single value instead of being searched over:
# the seed is not a hyperparameter, and searching several seeds multiplies the
# number of fits while selecting the luckiest seed, which biases best_score_
# upward. It stays inside the grid (as a one-element list) so that
# grid_search.best_params_ still carries the seed and a model rebuilt from
# best_params_ is reproducible.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'random_state': [42],
}

# 5-fold cross-validation scored by accuracy, using every available core.
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(train_input, train_target)

# Report the best hyperparameter combination and its mean cross-validated score.
print("최적의 하이퍼파라미터 조합:", grid_search.best_params_)
print("최적의:", grid_search.best_score_)

# With its default refit=True, GridSearchCV has already retrained the best
# parameter combination on the full training data, so that fitted estimator is
# reused instead of constructing and fitting an identical forest a second time.
# NOTE(review): this relies on grid_search having been created with refit left
# enabled — confirm if the GridSearchCV call is ever changed.
best_params = grid_search.best_params_
model = grid_search.best_estimator_
predictions = model.predict(test_input)

# Write the submission file: one row per test passenger with its prediction.
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

최적의 하이퍼파라미터 조합: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100, 'random_state': 42}
최적의: 0.8372983491306257
Your submission was successfully saved!
