In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv("datas/5min_MYM_只用IB量價資料_regression.csv")


In [None]:
data = pd.read_csv("datas/5min_MYM_只用IB量價資料_class.csv")


In [3]:
data_df=data.copy()
data_df['datetime']=pd.to_datetime(data_df['datetime'])
# 移除15:00~17:00以及03:00~04:00的資料
data_df_清洗 = data_df[
    (data_df["datetime"].dt.time >= pd.to_datetime("17:00:00").time())
    | (data_df["datetime"].dt.time <= pd.to_datetime("03:00:00").time())
]
# data.drop(["datetime"], axis=1, inplace=True)

In [4]:
numeric_cols = data_df_清洗.select_dtypes(include=["number"]).columns
filtered_rows = data_df_清洗[(data_df_清洗[numeric_cols] > 999999999).any(axis=1)]
data_df_清洗 = filtered_rows.dropna()
# 檢查篩選後的行
# print(filtered_rows)

In [5]:
# 13873 row
df_train=data_df_清洗.iloc[:10000]
df_validation=data_df_清洗.iloc[10000:13000]
df_predict = data_df_清洗.iloc[13000:13800]

In [6]:
# 假設 data 是您的 dataframe
# 先定義特徵和目標變數
features = [
            "open",
            "high",
            "low",
            "close",
            "volume",
            "ma_5",
            "ma_10",
            "ma_20",
            "volume_ma_5",
            "volume_ma_10",
            "volume_ma_20",
            "cumulative_volume",
            "cumulative_volume_price",
            "VWAP",
            "lowest_low",
            "highest_high",
            "%K",
            "%D",
            "RSI",
            "MACD",
            "Signal_Line",
            "MACD_Histogram",
            "BB_middle",
            "BB_upper",
            "BB_lower",
            "return_15m",
            "high_return_15m",
            "low_return_15m",
            "return_30m",
            "high_return_30m",
            "low_return_30m",
            "return_60m",
            "high_return_60m",
            "low_return_60m",
            "return_120m",
            "high_return_120m",
            "low_return_120m",
            "return_240m",
            "high_return_240m",
            "low_return_240m",
            "minutes_from_open",
]
targets = [
    "future_5min_change",
    "future_10min_change",
    "future_15min_change",
    "future_30min_change",
    "future_60min_change",
    "future_90min_change",
    "future_120min_change",
    "future_180min_change",
    "future_240min_change",
]

In [7]:
# 標準化特徵資料
def scale_data(train_df, val_df, test_df):
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_df[features])
    val_scaled = scaler.transform(val_df[features])
    test_scaled = scaler.transform(test_df[features])
    return train_scaled, val_scaled, test_scaled, scaler

# 建立移動窗格資料
def create_rolling_window(data, target, window_size=10):
    X = []
    y = []
    for i in range(window_size, len(data)):
        X.append(data[i - window_size:i])
        y.append(target[i])
    return np.array(X), np.array(y)

# 建立 LSTM 模型
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(32))
    model.add(Dense(len(targets)))
    model.compile(optimizer='adam', loss='mse')
    return model

# 訓練函數
def train(df1, epochs=50, batch_size=32):
    X_train_scaled, _, _, scaler = scale_data(df1, df1, df1)  # 標準化資料
    target_train = df1[targets].values
    X_train, y_train = create_rolling_window(X_train_scaled, target_train)

    model = create_lstm_model((X_train.shape[1], X_train.shape[2]))
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, callbacks=[early_stopping])
    return model, scaler

# 驗證函數
def validation(model, scaler, df2):
    _, X_val_scaled, _, _ = scale_data(df2, df2, df2)
    target_val = df2[targets].values
    X_val, y_val = create_rolling_window(X_val_scaled, target_val)
    
    y_pred = model.predict(X_val)
    r2_scores = {}
    for i, target in enumerate(targets):
        r2_scores[target] = r2_score(y_val[:, i], y_pred[:, i])
        print(f'R2 Score for {target}: {r2_scores[target]}')
    return r2_scores

# 預測函數
def predict(model, scaler, df3):
    _, _, X_test_scaled, _ = scale_data(df3, df3, df3)
    target_test = df3[targets].values
    X_test, y_test = create_rolling_window(X_test_scaled, target_test)

    y_pred = model.predict(X_test)
    r2_scores = {}
    for i, target in enumerate(targets):
        r2_scores[target] = r2_score(y_test[:, i], y_pred[:, i])
        print(f'R2 Score for {target}: {r2_scores[target]}')
    return y_pred, r2_scores

# 執行訓練、驗證、預測
model, scaler = train(df_train)
validation_r2_scores = validation(model, scaler, df_validation)
# predictions, predict_r2_scores = predict(model, scaler, df_predict)

  super().__init__(**kwargs)


Epoch 1/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - loss: 0.3156 - val_loss: 0.3806
Epoch 2/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 0.1894 - val_loss: 0.3338
Epoch 3/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 0.1349 - val_loss: 0.5973
Epoch 4/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - loss: 0.1060 - val_loss: 0.4920
Epoch 5/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - loss: 0.0873 - val_loss: 0.4740
Epoch 6/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - loss: 0.0724 - val_loss: 0.5830
Epoch 7/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - loss: 0.0616 - val_loss: 0.5338
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step
R2 Score for future_5min_change: -0.4313299834162809
R2 Score for future_10min_ch