# ta-lib-006 ( Feature‑Engineering + Sequence Models)
https://chatgpt.com/c/680b5d6e-3f64-800a-9f2b-c08e35b0d0e8
What's new:

1- Change logic from classification to regression.

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import talib
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import matplotlib.pyplot as plt

# Ensure plots inline
%matplotlib inline

In [3]:
# Parameters
DATA_PATH = "../datasets/XAGUSD-H1-rates.csv"   # Path to your downloaded MT5 CSV
SEQUENCE_LENGTH = 30                # Number of past candles per sample
PREDICT_HORIZON  = 5                # How many candles ahead to detect a trend change
TEST_SIZE = 0.2
RANDOM_STATE = 42
BATCH_SIZE = 64
EPOCHS = 50

In [4]:
# Load & Inspect Data
df = pd.read_csv(DATA_PATH, sep='\t')

In [5]:
# Combine <DATE> and <TIME> into single datetime index
df['DATETIME'] = pd.to_datetime(df['<DATE>'] + ' ' + df['<TIME>'])
df.set_index('DATETIME', inplace=True)
df.drop(columns=['<DATE>', '<TIME>'], inplace=True)
df.head()

Unnamed: 0_level_0,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>,<VOL>,<SPREAD>
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-02 09:00:00,15.889,15.896,15.807,15.834,993,18451000,21
2015-01-02 10:00:00,15.836,15.864,15.822,15.834,1116,25764000,22
2015-01-02 11:00:00,15.836,15.838,15.79,15.822,1023,18885000,19
2015-01-02 12:00:00,15.827,15.828,15.725,15.772,1286,23593000,21
2015-01-02 13:00:00,15.781,15.786,15.667,15.734,1584,28750000,17


In [6]:
# Compute TA indicators and append to dataframe.
# e.g. RSI, ATR, MACD Histogram
df['rsi'] = talib.RSI(df['<CLOSE>'], timeperiod=14)
macd, macd_sig, macd_hist = talib.MACD(df['<CLOSE>'], fastperiod=12, slowperiod=26, signalperiod=9)
df['macd_hist'] = macd_hist
df['atr'] = talib.ATR(df['<HIGH>'], df['<LOW>'], df['<CLOSE>'], timeperiod=14)
df.dropna(inplace=True) # Remove missing values.

In [7]:
# Feature Engineering
feature_cols = ['<CLOSE>', 'rsi', 'macd_hist', 'atr']
scaler = StandardScaler()
scaled = scaler.fit_transform(df[feature_cols])

In [8]:
# Target Engineering (Multi-Step Future Prices)
target_col = '<CLOSE>'  # Predict future CLOSE prices

In [9]:
# Sequence Creation
X, y = [], []
for i in range(len(df) - SEQUENCE_LENGTH - PREDICT_HORIZON + 1):
    seq_x = scaled[i : i+SEQUENCE_LENGTH]
    future_y = df[target_col].iloc[i+SEQUENCE_LENGTH : i+SEQUENCE_LENGTH+PREDICT_HORIZON].values
    X.append(seq_x)
    y.append(future_y)
X = np.array(X)
y = np.array(y)

print("X shape:", X.shape)  # (samples, 30, 4)
print("y shape:", y.shape)  # (samples, 5)

X shape: (55016, 30, 4)
y shape: (55016, 5)


In [10]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
print("Train:", X_train.shape, y_train.shape)
print("Test :", X_test.shape, y_test.shape)

Train: (44012, 30, 4) (44012, 5)
Test : (11004, 30, 4) (11004, 5)


In [11]:
# Build LSTM Regression Model
def build_regression_model(input_shape, output_horizon):
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.2),
        layers.LSTM(32),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dense(output_horizon)  # Linear output: predicting raw future prices
    ])
    model.compile(
        loss='mse',                # Mean Squared Error for regression
        optimizer='adam',
        metrics=['mae']             # Mean Absolute Error as additional metric
    )
    return model

model = build_regression_model((SEQUENCE_LENGTH, len(feature_cols)), PREDICT_HORIZON)
model.summary()


I0000 00:00:1745685628.954637    5755 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2259 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1650 SUPER, pci bus id: 0000:01:00.0, compute capability: 7.5


In [12]:
# Training
es = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[es]
)

Epoch 1/50


I0000 00:00:1745685637.064428    5934 cuda_dnn.cc:529] Loaded cuDNN version 90800


[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 139.5451 - mae: 9.2195 - val_loss: 1.0309 - val_mae: 0.7189
Epoch 2/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 4.0996 - mae: 1.5969 - val_loss: 0.1311 - val_mae: 0.2591
Epoch 3/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 2.8548 - mae: 1.3169 - val_loss: 0.2248 - val_mae: 0.4201
Epoch 4/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 2.4007 - mae: 1.2028 - val_loss: 0.1764 - val_mae: 0.3196
Epoch 5/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 1.8997 - mae: 1.0609 - val_loss: 0.0709 - val_mae: 0.2146
Epoch 6/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 1.3942 - mae: 0.9015 - val_loss: 0.4242 - val_mae: 0.5695
Epoch 7/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.9

In [13]:
# Evaluation
loss, mae = model.evaluate(X_test, y_test)
print(f"Test Loss (MSE): {loss:.4f}  Test MAE: {mae:.4f}")


[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0214 - mae: 0.1001
Test Loss (MSE): 0.0222  Test MAE: 0.1010


In [15]:
# --- New Code Starts Here ---

import matplotlib.pyplot as plt

# Load recent real candles for prediction
recent_candles = pd.read_csv("../datasets/new-data-for-test/rows-30-from-20240503.csv", sep='\t')

# # Combine <DATE> and <TIME> into single datetime index
recent_candles['DATETIME'] = pd.to_datetime(df['<DATE>'] + ' ' + df['<TIME>'])
recent_candles.set_index('DATETIME', inplace=True)
recent_candles.drop(columns=['<DATE>', '<TIME>'], inplace=True)


# Compute TA indicators and append to dataframe.
# e.g. RSI, ATR, MACD Histogram
recent_candles['rsi'] = talib.RSI(recent_candles['<CLOSE>'], timeperiod=14)
macd, macd_sig, macd_hist = talib.MACD(recent_candles['<CLOSE>'], fastperiod=12, slowperiod=26, signalperiod=9)
recent_candles['macd_hist'] = macd_hist
recent_candles['atr'] = talib.ATR(recent_candles['<HIGH>'], recent_candles['<LOW>'], recent_candles['<CLOSE>'], timeperiod=14)
recent_candles.dropna(inplace=True)


# Select the latest SEQUENCE_LENGTH candles
input_candles = recent_candles.tail(SEQUENCE_LENGTH)

# Scale the input using the same scaler from training
input_features = input_candles[feature_cols]
input_scaled = scaler.transform(input_features)
input_scaled = np.expand_dims(input_scaled, axis=0)  # (1, 30, 4)

# Predict future prices
predicted_future_prices = model.predict(input_scaled)[0]  # shape = (PREDICT_HORIZON,)
print("Predicted Future Prices:", predicted_future_prices)

# Prepare for plotting
# Get the last real CLOSE price
last_real_close = input_candles['<CLOSE>'].iloc[-1]

# Build X axis (time)
real_times = input_candles.index
future_times = pd.date_range(start=real_times[-1] + pd.Timedelta(hours=1), periods=PREDICT_HORIZON, freq='H')

# Plot real CLOSE prices
plt.figure(figsize=(12,6))
plt.plot(real_times, input_candles['<CLOSE>'], label='Real Close Prices (Input)', marker='o')

# Plot predicted future prices
plt.plot(future_times, predicted_future_prices, label='Predicted Future Close Prices', marker='x', linestyle='--', color='red')

# Decorations
plt.title(f"Prediction of Next {PREDICT_HORIZON} Future Candles")
plt.xlabel("Time")
plt.ylabel("Close Price")
plt.legend()
plt.grid()
plt.show()


KeyError: '<DATE>'