In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the dataset
data = pd.read_csv('new_data.csv')

data

Unnamed: 0,vw,o,c,h,l,n,prev_close,daily_return,abs_daily_return,pct_change,direction,recovery,outlier_id,day type,day,Date,Time
0,1.1342,1.13310,1.13420,1.13550,1.13256,5562.0,1.13310,0.000971,0.000971,0.097079,Up,fast recovery,1,prior day,Monday,2010-05-17,00:00:00
1,1.1340,1.13420,1.13438,1.13527,1.13275,5581.0,1.13420,0.000159,0.000159,0.015870,Up,fast recovery,1,prior day,Monday,2010-05-17,01:00:00
2,1.1403,1.13440,1.14112,1.14450,1.13438,11640.0,1.13438,0.005942,0.005942,0.594157,Up,fast recovery,1,prior day,Monday,2010-05-17,02:00:00
3,1.1411,1.14108,1.14149,1.14284,1.13929,5978.0,1.14112,0.000324,0.000324,0.032424,Up,fast recovery,1,prior day,Monday,2010-05-17,03:00:00
4,1.1405,1.14149,1.13926,1.14168,1.13918,4710.0,1.14149,-0.001954,0.001954,0.195359,Down,fast recovery,1,prior day,Monday,2010-05-17,04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38776,0.9328,0.93164,0.93330,0.93400,0.93120,7392.0,0.93164,0.001782,0.001782,0.178180,Up,slow recovery,140,post day,Wednesday,2023-03-15,20:00:00
38777,0.9330,0.93330,0.93290,0.93340,0.93167,958.0,0.93330,-0.000429,0.000429,0.042859,Down,slow recovery,140,post day,Wednesday,2023-03-15,21:00:00
38778,0.9328,0.93303,0.93190,0.93430,0.93150,4998.0,0.93290,-0.001072,0.001072,0.107193,Down,slow recovery,140,post day,Wednesday,2023-03-15,22:00:00
38779,0.9313,0.93210,0.93126,0.93236,0.93070,8034.0,0.93190,-0.000687,0.000687,0.068677,Down,slow recovery,140,post day,Wednesday,2023-03-15,23:00:00


In [None]:
# Drop the unnecesary columns
data.drop(columns=['Unnamed: 0', 'prev_close', 'abs_daily_return', 'year'], inplace=True)

In [3]:
# Preprocess the data
data['direction_target'] = data['direction'].apply(lambda x: 1 if x == 'Up' else 0)
data['recovery_target'] = data['recovery'].apply(lambda x: 1 if x == 'fast recovery' else 0)
data.drop(columns=['direction', 'recovery'], inplace=True)

# Filter the data
input_data = data[data['day type'].isin(['prior day', 'outlier day'])]
output_data = data[data['day type'] == 'post day']

# Ensure the data is sorted by Date and Time
input_data = input_data.sort_values(by=['outlier_id', 'Date', 'Time'])
output_data = output_data.sort_values(by=['outlier_id', 'Date', 'Time'])

feature_columns = ['c', 'daily_return', 'direction_target']
seq_length = 24

def create_sequences_per_outlier(input_data, output_data, feature_columns, seq_length=24):
    sequences = []
    direction_targets = []
    recovery_targets = []
    
    unique_ids = input_data['outlier_id'].unique()
    
    for oid in unique_ids:
        input_seq = input_data[input_data['outlier_id'] == oid]
        output_seq = output_data[output_data['outlier_id'] == oid]

        # Normalize features
        scaler = MinMaxScaler()
        input_features = scaler.fit_transform(input_seq[feature_columns])
        
        # Ensure sequence length is exactly `seq_length` for input features
        if len(input_features) >= seq_length:
            input_features = input_features[-120:]  # Take last `seq_length` entries
        else:
            # Pad input_features if they are shorter than seq_length
            padding = np.zeros((seq_length - len(input_features), input_features.shape[1]))
            input_features = np.vstack([padding, input_features])

        # Same for output sequences
        if len(output_seq) >= seq_length:
            directions = output_seq['direction_target'].iloc[:seq_length].values
        else:
            # Padding direction and recovery targets if they are shorter than seq_length
            directions = np.pad(output_seq['direction_target'].values, (seq_length - len(output_seq), 0), 'constant')

        # Store sequences and targets as arrays
        sequences.append(input_features)
        direction_targets.append(directions)
        recovery_targets.append(recoveries)
    
    return np.array(sequences), np.array(direction_targets)

# Create sequences
sequences, direction_targets = create_sequences_per_outlier(input_data, output_data, feature_columns, seq_length)

# Convert to numpy array for model input
X = np.array(sequences)

# Split the data into training and testing sets based on outlier IDs
outlier_ids = input_data['outlier_id'].unique()
train_ids, test_ids = train_test_split(outlier_ids, test_size=0.3, random_state=123)

# Create indices for the sequences
train_idx = [i for i, oid in enumerate(outlier_ids) if oid in train_ids]
test_idx = [i for i, oid in enumerate(outlier_ids) if oid in test_ids]

X_train = X[train_idx]
X_test = X[test_idx]
y_train_dir = direction_targets[train_idx]
y_test_dir = direction_targets[test_idx]

In [5]:
def convert_to_one_hot(y, num_classes):
    return np.array([to_categorical(y_i, num_classes=num_classes) for y_i in y])

# Convert target labels to one-hot encoded format
y_train_dir_one_hot = convert_to_one_hot(y_train_dir, num_classes=2)
y_test_dir_one_hot = convert_to_one_hot(y_test_dir, num_classes=2)

def slice_last_timesteps(x):
    return x[:, :24, :]

# Define the LSTM model for direction prediction with fixed 24 time step outputs
model_direction = Sequential()
model_direction.add(LSTM(100, return_sequences=True, input_shape=(None, X_train.shape[-1])))  # Input shape matches features count
model_direction.add(Dropout(0.3))
model_direction.add(LSTM(100, return_sequences=True))
model_direction.add(Dropout(0.3))
model_direction.add(Lambda(slice_last_timesteps))  # Slice the first 24 time steps
model_direction.add(TimeDistributed(Dense(2, activation='softmax')))

# Compile the model
model_direction.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model_direction.fit(X_train, y_train_dir_one_hot, epochs=100, batch_size=32, validation_data=(X_test, y_test_dir_one_hot))

  super().__init__(**kwargs)


Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 386ms/step - accuracy: 0.4971 - loss: 0.6932 - val_accuracy: 0.5308 - val_loss: 0.6886
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 153ms/step - accuracy: 0.5242 - loss: 0.6936 - val_accuracy: 0.5308 - val_loss: 0.6887
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 161ms/step - accuracy: 0.5329 - loss: 0.6915 - val_accuracy: 0.5337 - val_loss: 0.6903
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 156ms/step - accuracy: 0.5353 - loss: 0.6909 - val_accuracy: 0.5308 - val_loss: 0.6893
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 155ms/step - accuracy: 0.5287 - loss: 0.6935 - val_accuracy: 0.5308 - val_loss: 0.6891
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 175ms/step - accuracy: 0.5323 - loss: 0.6902 - val_accuracy: 0.5308 - val_loss: 0.6892
Epoch 7/100
[1m4/4[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x105ddfef0>

In [6]:
# 1. Make predictions on the test set
y_test_pred_prob = model_direction.predict(X_test)

# 2. Convert predictions and true labels from one-hot to class labels
y_test_pred = np.argmax(y_test_pred_prob, axis=-1)
y_test_true = np.argmax(y_test_dir_one_hot, axis=-1)

# Flatten arrays to make them 1D for metric calculations
y_test_pred_flat = y_test_pred.flatten()
y_test_true_flat = y_test_true.flatten()

# 3. Calculate evaluation metrics
accuracy = accuracy_score(y_test_true_flat, y_test_pred_flat)
precision = precision_score(y_test_true_flat, y_test_pred_flat)
recall = recall_score(y_test_true_flat, y_test_pred_flat)
f1 = f1_score(y_test_true_flat, y_test_pred_flat)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 316ms/step
Accuracy: 0.5317
Precision: 0.5051
Recall: 0.1057
F1 Score: 0.1748
