In [1]:
# 匯入套件

import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the training CSV and inspect its size, column names and contents.

data = pd.read_csv('Train_Data.csv')

# Print each summary as "<label> <value>", exactly like two separate print calls.
for label, value in (('data size: ', data.shape), ('columns: ', data.columns)):
    print(label, value)

data

data size:  (470, 24)
columns:  Index(['編號', '專案', '日期', '月', '日', '星期', '是否假日', '時長', '機位數量', '工作性質', '花絮',
       '視訊切換', '視訊連線', 'PA音控', '大場分小場', '導播人數', '攝影人數', '音控人數', '直播人數', '機動人數',
       '花絮人數', '視訊切換人數', '視訊連線人數', '人數'],
      dtype='object')


Unnamed: 0,編號,專案,日期,月,日,星期,是否假日,時長,機位數量,工作性質,...,大場分小場,導播人數,攝影人數,音控人數,直播人數,機動人數,花絮人數,視訊切換人數,視訊連線人數,人數
0,1,博思網路進場,2024/1/2,1,2,2,0,4.5,5,進場,...,0,1,4,1,1,1,0,0,0,8
1,2,博思網路,2024/1/3,1,3,3,0,5.0,5,直播,...,0,1,4,1,1,1,0,0,0,8
2,3,奕樂科技,2024/1/10,1,10,3,0,6.0,3,直播,...,0,1,2,1,1,0,0,0,0,5
3,4,DTxAWS(進),2024/1/10,1,10,3,0,4.0,4,進場,...,1,0,0,0,0,4,0,0,0,4
4,5,DTxAWS,2024/1/11,1,11,4,0,10.0,4,直播,...,1,1,3,5,1,0,0,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,466,InnoVEX*PI Day4,2025/5/23,5,23,5,0,6.0,3,錄製,...,0,1,2,1,0,0,0,0,0,4
466,467,InnoVEX*Center Day4,2025/5/23,5,23,5,0,6.0,3,直播,...,0,1,2,1,1,0,0,0,0,5
467,468,異象醫藥 控台,2025/5/24,5,24,6,1,11.5,0,錄製,...,0,0,0,0,0,1,0,1,0,2
468,469,影響力量 視訊切換場控執行,2025/5/25,5,25,7,1,4.5,0,錄製,...,0,0,0,0,0,1,0,1,0,2


In [3]:
# Remove the columns that are not used for training (record id, project name,
# raw date string) and one-hot encode the string-valued '工作性質' column.
#
# The transformation is written as one non-mutating expression: `data` is only
# rebound once both steps have succeeded, so a failure can no longer leave a
# half-transformed frame behind, and the frame object loaded by the previous
# cell is not modified in place.

data = pd.get_dummies(
    data.drop(columns=['編號', '專案', '日期']),
    columns=['工作性質'],
)

data.columns

Index(['月', '日', '星期', '是否假日', '時長', '機位數量', '花絮', '視訊切換', '視訊連線', 'PA音控',
       '大場分小場', '導播人數', '攝影人數', '音控人數', '直播人數', '機動人數', '花絮人數', '視訊切換人數',
       '視訊連線人數', '人數', '工作性質_直播', '工作性質_進場', '工作性質_錄製'],
      dtype='object')

In [4]:
# Separate the features (x) from the labels (y).
#
# The label columns are listed exactly once: y selects them explicitly and x is
# everything else. Any additional one-hot column produced by the encoding step
# (e.g. a new '工作性質' value) therefore stays a feature instead of silently
# becoming a regression target.

target_columns = ['導播人數', '攝影人數', '音控人數', '直播人數', '機動人數', '花絮人數', '視訊切換人數', '視訊連線人數', '人數']

x = data.drop(columns=target_columns)
y = data[target_columns]

print(x.columns)
print('------------------')
print(y.columns)

Index(['月', '日', '星期', '是否假日', '時長', '機位數量', '花絮', '視訊切換', '視訊連線', 'PA音控',
       '大場分小場', '工作性質_直播', '工作性質_進場', '工作性質_錄製'],
      dtype='object')
------------------
Index(['導播人數', '攝影人數', '音控人數', '直播人數', '機動人數', '花絮人數', '視訊切換人數', '視訊連線人數',
       '人數'],
      dtype='object')


In [5]:
# Hold out 20% of the rows as the test set.
#
# random_state is fixed so that the split, and therefore every metric computed
# from it, is identical on each Restart & Run All. Re-run the notebook to
# refresh any previously captured outputs.

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print('train size: ', x_train.shape)
print('test size: ', x_test.shape)

train size:  (376, 14)
test size:  (94, 14)


In [None]:
## Data standardization. This model uses a random forest, so this step can be skipped.
## The cell is intentionally left disabled (it has no execution count); the code below
## would fit one StandardScaler on x_train and another on y_train and replace both
## with their scaled versions.

# scaler_x = StandardScaler().fit(x_train)
# x_train = scaler_x.transform(x_train)

# scaler_y = StandardScaler().fit(y_train)
# y_train = scaler_y.transform(y_train)

# print(x_train)


In [6]:
# Build and fit the model, then predict on the held-out test set.
#
# MultiOutputRegressor wraps the random forest so that every label column gets
# its own regressor. random_state is fixed so the fitted (and later saved)
# model is reproducible across runs.

model = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=150, min_samples_split=2, random_state=42)
)
model.fit(x_train, y_train)


# Only relevant when the optional standardization cell is enabled:
# x_test = scaler_x.transform(x_test)
# y_test = scaler_y.transform(y_test)


y_pred = model.predict(x_test)

In [7]:
# Compare the predictions with the ground truth: compute MSE and RMSE.

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above

report = [
    'mean square error: {}'.format(mse),
    'R mean square error: {}'.format(rmse),
]
print('\n'.join(report))

mean square error: 0.25705954094974826
R mean square error: 0.5070103953073825


In [8]:
# Save the fitted model to disk.
#
# joblib is imported in the notebook's top import cell. The file name is
# defined once so the dump target and the confirmation message cannot drift
# apart when the name is changed.

model_path = '20250612_RFM.pkl'

joblib.dump(model, model_path)
print("模型已保存為 {}".format(model_path))

模型已保存為 20250612_RFM.pkl
