安裝必要的庫

In [14]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

載入資料集

In [16]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# 載入訓練與測試資料集
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# 顯示資料集的前幾行，了解資料結構
print(train_df.head())
print(test_df.head())

# 特徵工程 - 清理資料
# 填補缺失值
imputer = SimpleImputer(strategy='mean')  # 數值型特徵的缺失值用均值填補
train_df[['Age', 'Fare']] = imputer.fit_transform(train_df[['Age', 'Fare']])
test_df[['Age', 'Fare']] = imputer.transform(test_df[['Age', 'Fare']])

# 對 'Sex' 和 'Embarked' 進行編碼處理
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})

train_df['Embarked'] = train_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
test_df['Embarked'] = test_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# 特徵選擇
X_train = train_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y_train = train_df['Survived']
X_test = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# 標準化特徵
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 定義 Optuna 優化目標函數
def objective(trial):
    # 超參數範圍
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    # 初始化模型
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   random_state=42)

    # 訓練模型
    model.fit(X_train, y_train)

    # 在訓練集上評估
    y_train_pred = model.predict(X_train)
    accuracy = accuracy_score(y_train, y_train_pred)

    return accuracy

# 使用 Optuna 進行超參數優化
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # 進行50次優化

# 顯示最佳超參數
print('Best hyperparameters:', study.best_params)

# 使用最佳超參數訓練最終模型
best_params = study.best_params
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# 在測試集上進行預測
y_test_pred = best_model.predict(X_test)

# 將預測結果儲存至 CSV 檔案
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': y_test_pred})
submission.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")


[I 2024-12-14 09:23:04,754] A new study created in memory with name: no-name-c28f38bb-559f-44d8-a771-187084e363d8


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

[I 2024-12-14 09:23:05,212] Trial 0 finished with value: 0.8181818181818182 and parameters: {'n_estimators': 90, 'max_depth': 3, 'min_samples_split': 18, 'min_samples_leaf': 18}. Best is trial 0 with value: 0.8181818181818182.
[I 2024-12-14 09:23:06,137] Trial 1 finished with value: 0.8305274971941639 and parameters: {'n_estimators': 187, 'max_depth': 16, 'min_samples_split': 12, 'min_samples_leaf': 18}. Best is trial 1 with value: 0.8305274971941639.
[I 2024-12-14 09:23:06,672] Trial 2 finished with value: 0.8585858585858586 and parameters: {'n_estimators': 102, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.8585858585858586.
[I 2024-12-14 09:23:06,870] Trial 3 finished with value: 0.8316498316498316 and parameters: {'n_estimators': 26, 'max_depth': 5, 'min_samples_split': 16, 'min_samples_leaf': 16}. Best is trial 2 with value: 0.8585858585858586.
[I 2024-12-14 09:23:06,973] Trial 4 finished with value: 0.8608305274971941 and parameters:

Best hyperparameters: {'n_estimators': 74, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 1}
Predictions saved to submission.csv
