In [1]:
import pyutil as pyu

In [2]:
pyu.get_local_pyinfo()

'conda env: ml312-2024; pyv: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:27:36) [GCC 11.2.0]'

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import optuna
from optuna.pruners import HyperbandPruner

In [4]:
# Load the Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(url)

In [5]:
# Preprocessing
# Select features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = data[features]
y = data['Survived']

In [6]:
# Handle missing values
X.loc[:, 'Age'] = X['Age'].fillna(X['Age'].median())
X.loc[:, 'Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])

In [7]:
# Convert categorical variables to numerical
X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True)

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Define the objective function for Hyperband optimization
def objective(trial):
    # Define hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 5, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    
    # Create the model with suggested hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )
    
    # Perform cross-validation
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    return np.mean(score)

In [10]:
# Create the Optuna study with Hyperband pruning
study = optuna.create_study(
    direction='maximize',
    pruner=HyperbandPruner()
)
study.optimize(objective, n_trials=50)

[I 2024-12-06 13:07:27,112] A new study created in memory with name: no-name-1849c918-a1d5-4c67-b311-d9cccb6cb56f
[I 2024-12-06 13:07:30,699] Trial 0 finished with value: 0.8117994681374963 and parameters: {'n_estimators': 272, 'max_depth': 13, 'min_samples_split': 7}. Best is trial 0 with value: 0.8117994681374963.
[I 2024-12-06 13:07:33,573] Trial 1 finished with value: 0.8258347286516301 and parameters: {'n_estimators': 259, 'max_depth': 20, 'min_samples_split': 10}. Best is trial 1 with value: 0.8258347286516301.
[I 2024-12-06 13:07:34,649] Trial 2 finished with value: 0.8286220821432089 and parameters: {'n_estimators': 192, 'max_depth': 12, 'min_samples_split': 9}. Best is trial 2 with value: 0.8286220821432089.
[I 2024-12-06 13:07:35,514] Trial 3 finished with value: 0.8328277356446371 and parameters: {'n_estimators': 174, 'max_depth': 7, 'min_samples_split': 9}. Best is trial 3 with value: 0.8328277356446371.
[I 2024-12-06 13:07:36,032] Trial 4 finished with value: 0.82581503004

In [11]:
# Print the best hyperparameters and score
print("Best Parameters:", study.best_params)
print("Best Score:", study.best_value)

Best Parameters: {'n_estimators': 112, 'max_depth': 7, 'min_samples_split': 2}
Best Score: 0.8328572835615089


In [12]:
# Train the model with the best parameters
best_params = study.best_params
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

In [13]:
# Evaluate on the test set
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.8100558659217877
