In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder


# Load the employee dataset and show its first five rows as a quick check that
# the file was read correctly. The preview is printed explicitly because a bare
# expression in the middle of a cell is not displayed by Jupyter.
lazy_df = pd.read_csv("Employee.csv")
print(lazy_df.head())

# Ordinal-encode the categorical (text) columns so that every column of the
# frame can be stored as an integer and accessed quickly later on.
cat_columns = ["Education", "City", "Gender", "EverBenched"]
encoder = OrdinalEncoder()
lazy_df_cat = encoder.fit_transform(lazy_df[cat_columns])
lazy_df[cat_columns] = lazy_df_cat
lazy_df = lazy_df.astype(int)  # the encoder returns floats; convert the whole frame to int

# The prediction target is LeaveOrNot. The feature matrix is every other
# column except EverBenched, which is left out of the model inputs.
y = lazy_df["LeaveOrNot"]
X = lazy_df.drop(columns=["LeaveOrNot", "EverBenched"])

# Split 80/20: 20% of the rows are held out for testing and the rest are used
# for training. The split is stratified on the target and seeded so it is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Baseline random forest configuration (fitted in a later cell).
rf = RandomForestClassifier(
    # Ensemble size and reproducibility
    n_estimators=1400,
    random_state=42,
    # Class imbalance is handled through class weights
    class_weight='balanced',
    # Tree growth / split rules
    criterion='gini',
    max_depth=None,
    max_features='sqrt',
    min_samples_split=7,
    min_samples_leaf=1,
)

# SMOTE: build an oversampled copy of the training split. The test split is
# never resampled.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)  # type: ignore

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Hyperparameter search space. Keys carry the "rf__" prefix because the forest
# is tuned as the "rf" step of the pipeline built below. The upper bound of
# each scipy `randint` range is exclusive.
param_dist = {
    'rf__n_estimators': randint(100, 5000),
    'rf__max_depth': [None, 10, 20, 30, 40],
    'rf__min_samples_split': randint(2, 10),
    'rf__min_samples_leaf': randint(1, 4),
    'rf__max_features': ['sqrt', 'log2', None],
    'rf__class_weight': ['balanced']
}

# SMOTE is placed inside the pipeline so that, within every cross-validation
# fold, only the training part of the fold is oversampled. Running the search
# on data that was oversampled beforehand lets synthetic points interpolated
# from validation rows end up in the training folds, which inflates the CV
# scores used to select the hyperparameters.
search_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, criterion='gini')),
])

# Randomized search over the space above.
rf_search = RandomizedSearchCV(
    estimator=search_pipeline,
    param_distributions=param_dist,
    n_iter=50,  # number of sampled configurations; adjust to the available compute
    cv=5,  # 5-fold cross-validation
    scoring='f1_weighted',  # model selection uses the weighted F1 score
    n_jobs=-1,  # use all CPU cores
    random_state=42
)

# Fit on the raw training split; resampling happens inside each fold and once
# more on the whole training split when the best configuration is refitted.
rf_search.fit(X_train, y_train)

# Pull the fitted forest out of the best pipeline (so `best_rf` is still a
# RandomForestClassifier) and evaluate it on the untouched test split.
best_rf = rf_search.best_estimator_.named_steps['rf']
y_test_pred = best_rf.predict(X_test)
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

# Strip the pipeline step prefix so the report shows plain forest parameter names.
best_params = {
    name.split('__', 1)[1]: value
    for name, value in rf_search.best_params_.items()
}
print("Best parameters: {}".format(best_params))
print("Improved Test F1 score: {}".format(test_f1))

In [None]:


# Train the baseline random forest (tree count, split rules, etc. were
# configured where `rf` was created) on the training split.
rf.fit(X_train, y_train)

# Accuracy on the data the model was trained on.
y_train_pred = rf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train accuracy:{}".format(train_accuracy))

# Accuracy on the held-out test split, to see how well the model predicts
# unseen rows (about 0.86 in the recorded run).
y_test_pred = rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test accuracy:{}".format(test_accuracy))

Train accuracy:0.8995633187772926
Test accuracy:0.8590604026845637


In [10]:
# Compute accuracy, precision, recall and (most importantly) the F1 score for
# the current test-set predictions using the scikit-learn helpers.
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

# Report each metric rounded to three decimals.
metric_report = [
    ("Accuracy: {}", accuracy),
    ("Precision: {}", precision),
    ("Recall: {}", recall),
    ("F1 Score: {}", f1),
]
for template, value in metric_report:
    print(template.format(round(value, 3)))

Accuracy: 0.859
Precision: 0.792
Recall: 0.801
F1 Score: 0.796
