In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Titanic survival baseline: preprocess the seaborn 'titanic' dataset, tune a
# RandomForestClassifier with an exhaustive grid search (5-fold CV on the
# training split), and report accuracy on a held-out validation split.

# Single seed shared by the split and the forest so the run is reproducible.
RANDOM_STATE = 42

# Load the dataset
titanic = sns.load_dataset('titanic')

# Drop the columns that are not used as features
titanic = titanic.drop(['deck', 'class', 'who', 'alive', 'alone', 'embark_town'], axis=1)

# Fill missing values in age with mean age
# NOTE(review): the mean is taken over all rows, i.e. before the
# train/validation split below, so validation rows influence the imputed
# value. Consider computing it from the training split only.
titanic['age'] = titanic['age'].fillna(titanic['age'].mean())

# Map sex to numerical values (any value missing from the mapping becomes NaN)
sex_mapping = {'male': 0, 'female': 1}
titanic['sex'] = titanic['sex'].map(sex_mapping)

# Encode embarked as integer codes using LabelEncoder
# NOTE(review): if 'embarked' contains missing values they are passed to the
# encoder as-is rather than imputed — confirm that is the intended handling.
le = LabelEncoder()
titanic['embarked'] = le.fit_transform(titanic['embarked'])

# Define X (all remaining columns) and y (the 'survived' target)
X = titanic.drop(['survived'], axis=1)
y = titanic['survived']

# Split data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Define hyperparameter grid (3 * 3 * 3 * 3 = 81 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# Perform grid search
# n_jobs=-1 spreads the 81 x 5 candidate fits over all available cores; the
# scores are unaffected because the forest's random_state is fixed.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=RANDOM_STATE),
    param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Print best score and parameters
print(f"Best score: {grid_search.best_score_:.3f}")
print(f"Best parameters: {grid_search.best_params_}")

# Evaluate the model
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the whole training split with the best parameters; fitting it
# again would only rebuild the identical forest.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
print(f"Validation accuracy: {accuracy_score(y_val, y_pred):.3f}")


Best score: 0.831
Best parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Validation accuracy: 0.810
