安裝必要的庫

In [None]:
!pip install pycaret



載入資料集

In [None]:
import pandas as pd
from sklearn.datasets import fetch_openml
from pycaret.classification import *

# Download the Titanic dataset from OpenML as a pandas DataFrame.
titanic = fetch_openml('titanic', version=1, as_frame=True)
df = titanic.frame

# Inspect the distinct label values of the 'survived' column.
print(df['survived'].unique())

# Convert 'survived' to integers so the target holds only 0 and 1.
df['survived'] = df['survived'].astype(int)

# Count missing values per column before imputation.
print(df.isnull().sum())

# Impute missing values: median for the numeric 'age' column and the most
# frequent value for the categorical 'embarked' column.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x
# and has no effect on `df` once Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].median())
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# Re-count missing values to confirm 'age' and 'embarked' are now complete.
# NOTE(review): other columns (e.g. 'fare', 'cabin') are not imputed here;
# presumably they are left to PyCaret's preprocessing in setup() - confirm.
print(df.isnull().sum())

# Display the first rows of the dataset.
df.head()

['1', '0']
Categories (2, object): ['0', '1']
pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64
pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        0
boat          823
body         1188
home.dest     564
dtype: int64


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


特徵工程與模型初始化

In [None]:
# Initialise the PyCaret classification experiment.
# 'boat' (lifeboat a passenger was rescued in) and 'body' (number assigned to
# a recovered body) are only known after the outcome, so they leak the target
# and are excluded together with the identifier-like columns.
# NOTE: outputs captured below were produced before this exclusion; re-run the
# notebook to refresh them.
clf1 = setup(data=df, target='survived', session_id=123,
             normalize=True,
             ignore_features=['name', 'ticket', 'cabin', 'embarked',
                              'boat', 'body'],
             remove_outliers=True)  # remove outliers

Unnamed: 0,Description,Value
0,Session id,123
1,Target,survived
2,Target type,Binary
3,Original data shape,"(1309, 14)"
4,Transformed data shape,"(1263, 34)"
5,Transformed train set shape,"(870, 34)"
6,Transformed test set shape,"(393, 34)"
7,Ignore features,4
8,Numeric features,6
9,Categorical features,3


模型選擇與比較

In [None]:
# Compare the candidate models and keep the best one as `best_model`.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9454,0.9745,0.8686,0.9875,0.9234,0.8814,0.8866,0.293
ridge,Ridge Classifier,0.9443,0.9745,0.8657,0.9875,0.9216,0.8788,0.8844,0.439
nb,Naive Bayes,0.94,0.9658,0.8543,0.9874,0.9148,0.869,0.8756,0.364
knn,K Neighbors Classifier,0.9389,0.9583,0.8829,0.9543,0.9159,0.8681,0.8711,0.745
lr,Logistic Regression,0.9247,0.9706,0.8771,0.9224,0.8986,0.8388,0.8401,1.781
qda,Quadratic Discriminant Analysis,0.9171,0.9652,0.7943,0.9868,0.8752,0.8154,0.8297,0.298
et,Extra Trees Classifier,0.9137,0.9599,0.8971,0.8834,0.8886,0.8183,0.8205,0.57
rf,Random Forest Classifier,0.9094,0.9666,0.8943,0.8735,0.8829,0.8091,0.8104,0.57
gbc,Gradient Boosting Classifier,0.9083,0.9572,0.8886,0.8753,0.8809,0.8064,0.8078,0.58
lightgbm,Light Gradient Boosting Machine,0.904,0.9562,0.8857,0.8683,0.8757,0.7976,0.7992,0.442


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

超參數優化

In [None]:
# Hyperparameter tuning of the selected model.
# As the log message of this run shows, the original model is returned when
# the tuned candidate does not beat it, so `tuned_model` may equal `best_model`.
tuned_model = tune_model(best_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9674,0.981,0.9429,0.9706,0.9565,0.9304,0.9307
1,0.9457,0.9825,0.8571,1.0,0.9231,0.8814,0.8877
2,0.9457,0.9865,0.8857,0.9688,0.9254,0.8828,0.885
3,0.9457,0.9719,0.8857,0.9688,0.9254,0.8828,0.885
4,0.9239,0.9283,0.8,1.0,0.8889,0.8321,0.8441
5,0.9457,0.9664,0.8571,1.0,0.9231,0.8814,0.8877
6,0.967,0.9561,0.9143,1.0,0.9552,0.9292,0.9316
7,0.9231,0.9816,0.8286,0.9667,0.8923,0.833,0.839
8,0.9231,0.9867,0.8,1.0,0.8889,0.8312,0.8433
9,0.967,0.9801,0.9143,1.0,0.9552,0.9292,0.9316


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


訓練與評估模型

In [None]:
# Evaluate the selected model; this renders an interactive widget for
# choosing among the available diagnostic plot types.
evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

模型保存與預測

In [None]:
# Save the model under the name 'tuned_titanic_model'.
save_model(tuned_model, 'tuned_titanic_model')

# Score the full DataFrame `df`.
# NOTE(review): `df` is the same frame that was passed to setup(), so it
# includes the rows used for training; the metrics printed for this call are
# therefore optimistic and are not an estimate of performance on new data.
predictions = predict_model(tuned_model, data=df)

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.9458,0.9863,0.876,0.9799,0.925,0.8827,0.8861
