In [3]:
import pyutil as pyu

In [4]:
# Record the conda environment and Python version this notebook was run under;
# the string returned by the helper is displayed as the cell output.
pyu.get_local_pyinfo()

'conda env: ml312-2024; pyv: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:27:36) [GCC 11.2.0]'

In [3]:
# Requires optuna. If it is missing, install it into the kernel's environment with: %pip install optuna

In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import optuna

In [5]:
# Load the Titanic dataset
# The CSV is read over the network from the `master` branch of the
# datasciencedojo/datasets repository, so this cell needs internet access and
# its result can change if the upstream file is updated.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(url)

In [6]:
# Preprocessing
# Select features and target.
# X is taken as an explicit copy of the feature columns: the missing-value cell
# below assigns into X with .loc, and doing that on a frame sliced directly out
# of `data` can raise pandas' SettingWithCopyWarning.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = data[features].copy()
y = data['Survived']

In [7]:
# Handle missing values
# Age gaps are filled with the column median, Embarked gaps with the most
# frequent value.
# NOTE(review): both fill values are computed from the full dataset, before the
# train/test split further down, so test rows contribute to them.
age_fill = X['Age'].median()
embarked_fill = X['Embarked'].mode()[0]
X.loc[:, 'Age'] = X['Age'].fillna(age_fill)
X.loc[:, 'Embarked'] = X['Embarked'].fillna(embarked_fill)

In [8]:
# Convert categorical variables to numerical
# One-hot encode the Sex and Embarked columns; drop_first=True omits the first
# level of each so the remaining indicator columns are not redundant.
X = pd.get_dummies(
    X,
    columns=['Sex', 'Embarked'],
    drop_first=True,
)

In [9]:
# Train-test split
# Hold out 20% of the rows for the final evaluation; the fixed random_state
# makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Define the objective function for Bayesian Optimization
def objective(trial):
    """Score one random-forest configuration proposed by an Optuna trial.

    Three hyperparameters are requested from ``trial`` (``n_estimators`` in
    [100, 500], ``max_depth`` in [5, 20], ``min_samples_split`` in [2, 10]),
    a ``RandomForestClassifier`` is built from them with ``random_state=42``,
    and it is evaluated with 5-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train``.

    Returns:
        The mean cross-validated accuracy across the five folds.
    """
    # The suggest_int calls run in the order they appear in the dict literal.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
    }
    forest = RandomForestClassifier(random_state=42, **params)

    # n_jobs=-1 lets the folds be evaluated in parallel on all available cores.
    fold_scores = cross_val_score(
        forest,
        X_train,
        y_train,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    return fold_scores.mean()

In [11]:
# Create the Optuna study
# The sampler is seeded so the sequence of suggested hyperparameters, and
# therefore the best trial, is the same on every Restart & Run All. Without a
# seed the sampler starts from a fresh random state on each run. TPESampler is
# named explicitly only to pass the seed; it is Optuna's default sampler for a
# single-objective study.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2024-12-06 12:22:46,877] A new study created in memory with name: no-name-8d2628a4-bf66-4273-8f97-99dfa629bd7a
[I 2024-12-06 12:22:48,446] Trial 0 finished with value: 0.8229685807150595 and parameters: {'n_estimators': 253, 'max_depth': 9, 'min_samples_split': 10}. Best is trial 0 with value: 0.8229685807150595.
[I 2024-12-06 12:22:49,184] Trial 1 finished with value: 0.8243967300305328 and parameters: {'n_estimators': 166, 'max_depth': 7, 'min_samples_split': 6}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:22:50,066] Trial 2 finished with value: 0.8104205653501427 and parameters: {'n_estimators': 247, 'max_depth': 16, 'min_samples_split': 3}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:22:52,141] Trial 3 finished with value: 0.8187826258248794 and parameters: {'n_estimators': 363, 'max_depth': 8, 'min_samples_split': 6}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:22:54,509] Trial 4 finished with value: 0.8160149709445

In [12]:
# Print the best hyperparameters and score
# best_value is the objective's return value for the best trial, i.e. the mean
# 5-fold cross-validation accuracy on the training split, not test accuracy.
print("Best Parameters:", study.best_params)
print("Best Score:", study.best_value)

Best Parameters: {'n_estimators': 182, 'max_depth': 6, 'min_samples_split': 9}
Best Score: 0.8342361863488623


In [13]:
# Train the model with the best parameters
# Refit on the whole training split with the tuned hyperparameters, using the
# same random_state as inside the objective.
best_params = study.best_params
best_model = RandomForestClassifier(random_state=42, **best_params)
best_model.fit(X_train, y_train)

In [14]:
# Evaluate on the test set
# Accuracy on the held-out rows, which took no part in the hyperparameter search.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8156424581005587
