In [1]:
import pandas as pd
import numpy as np
import pywt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed, Reshape, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the dataset
data = pd.read_csv('USDCHF_hourly_20.csv')

data

Unnamed: 0.1,Unnamed: 0,vw,o,c,h,l,n,prev_close,daily_return,abs_daily_return,pct_change,direction,Recovery,outlier_id,day type,day,Date,Time
0,0,1.1095,1.111100,1.10928,1.11121,1.107730,7365.0,1.10928,-0.000117,0.000117,0.011719,Down,fast recovery,1,prior day,Tuesday,2010-05-11,00:00:00
1,1,1.1089,1.109280,1.10915,1.10983,1.107420,4139.0,1.10928,-0.000117,0.000117,0.011719,Down,fast recovery,1,prior day,Tuesday,2010-05-11,01:00:00
2,2,1.1093,1.109100,1.10979,1.11040,1.108100,3464.0,1.10915,0.000577,0.000577,0.057702,Up,fast recovery,1,prior day,Tuesday,2010-05-11,02:00:00
3,3,1.1103,1.109790,1.11042,1.11100,1.109630,2906.0,1.10979,0.000568,0.000568,0.056767,Up,fast recovery,1,prior day,Tuesday,2010-05-11,03:00:00
4,4,1.1112,1.110420,1.10990,1.11253,1.109700,5889.0,1.11042,-0.000468,0.000468,0.046829,Down,fast recovery,1,prior day,Tuesday,2010-05-11,04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53189,53189,0.9258,0.925610,0.92641,0.92675,0.924800,8906.0,0.92560,0.000875,0.000875,0.087511,Up,fast recovery,280,post day,Friday,2023-03-17,19:00:00
53190,53190,0.9263,0.926400,0.92580,0.92740,0.925690,6548.0,0.92641,-0.000658,0.000658,0.065846,Down,fast recovery,280,post day,Friday,2023-03-17,20:00:00
53191,53191,0.9251,0.925900,0.92280,0.92660,0.922755,402.0,0.92580,-0.003240,0.003240,0.324044,Down,fast recovery,280,post day,Friday,2023-03-17,21:00:00
53192,53192,0.9227,0.923000,0.92270,0.92310,0.922344,449.0,0.92280,-0.000108,0.000108,0.010837,Down,fast recovery,280,post day,Friday,2023-03-17,22:00:00


In [4]:
# Preprocess the data
# Create target variables
data['direction_target'] = data['direction'].apply(lambda x: 1 if x == 'Up' else 0)
data['recovery_target'] = data['Recovery'].apply(lambda x: 1 if x == 'fast recovery' else 0)
data.drop(columns=['direction', 'Recovery'], inplace=True)

# Filter the data
input_data = data[data['day type'].isin(['prior day', 'outlier day'])]
output_data = data[data['day type'] == 'post day']

# Ensure the data is sorted by Date and Time
input_data = input_data.sort_values(by=['outlier_id', 'Date', 'Time'])
output_data = output_data.sort_values(by=['outlier_id', 'Date', 'Time'])

feature_columns = ['vw', 'c', 'daily_return', 'n', 'direction_target', 'CWT_Mean']

def create_sequences_per_outlier(input_data, output_data, feature_columns, seq_length=24):
    sequences = []
    direction_targets = []
    recovery_targets = []
    
    unique_ids = input_data['outlier_id'].unique()
    
    for oid in unique_ids:
        input_seq = input_data[input_data['outlier_id'] == oid]
        output_seq = output_data[output_data['outlier_id'] == oid]

        # Normalize features
        scaler = MinMaxScaler()
        input_features = scaler.fit_transform(input_seq[feature_columns])

        # Check if there are enough rows in the output data
        if len(output_seq) >= seq_length:
            # Get direction and recovery targets for the first 24 data points of the post day
            directions = output_seq['direction_target'].iloc[:seq_length].values
            recoveries = output_seq['recovery_target'].iloc[:seq_length].values

            # Store sequences and targets as arrays
            sequences.append(input_features)
            direction_targets.append(directions)
            recovery_targets.append(recoveries)
            
    return sequences, np.array(direction_targets), np.array(recovery_targets)

# Create sequences
sequences, direction_targets, recovery_targets = create_sequences_per_outlier(input_data, output_data, feature_columns)

# Function to apply padding strategy
def apply_reflect_padding(sequences, max_length):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < max_length:
            # Calculate the number of rows needed to reach max_length
            padding_size = max_length - len(seq)
            # Create padding by reflecting the sequence
            if padding_size <= len(seq):
                padding = np.flipud(seq[:padding_size])
            else:
                # If padding_size is greater than the sequence length, repeat reflection
                repeat_n = int(np.ceil(padding_size / len(seq)))
                reflected_part = np.flipud(seq)
                padding = np.tile(reflected_part, (repeat_n, 1))[:padding_size]
            padded_sequence = np.vstack([seq, padding])
        else:
            padded_sequence = seq[:max_length]  # Truncate to max_length if necessary
        padded_sequences.append(padded_sequence)
    return np.array(padded_sequences)

# Example: Mean padding
max_length = max(len(seq) for seq in sequences)  # Find max length to pad
X = apply_reflect_padding(sequences, max_length)

# Split the data into training and testing sets based on outlier IDs
outlier_ids = input_data['outlier_id'].unique()
train_ids, test_ids = train_test_split(outlier_ids, test_size=0.3, random_state=123)

# Create indices for the sequences
train_idx = [i for i, oid in enumerate(outlier_ids) if oid in train_ids]
test_idx = [i for i, oid in enumerate(outlier_ids) if oid in test_ids]

X_train = X[train_idx]
X_test = X[test_idx]
y_train_dir = direction_targets[train_idx]
y_test_dir = direction_targets[test_idx]
y_train_rec = recovery_targets[train_idx]
y_test_rec = recovery_targets[test_idx]

print(f"X_train shape after adjustment: {X_train.shape}")
print(f"y_train_dir shape after adjustment: {y_train_dir.shape}")
print(f"y_train_rec shape after adjustment: {y_train_rec.shape}")
print(f"X_test shape after adjustment: {X_test.shape}")
print(f"y_test_dir shape after adjustment: {y_test_dir.shape}")
print(f"y_test_rec shape after adjustment: {y_test_rec.shape}")

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [None]:
def convert_to_one_hot(y, num_classes):
    return np.array([to_categorical(y_i, num_classes=num_classes) for y_i in y])

# Convert target labels to one-hot encoded format
y_train_dir_one_hot = convert_to_one_hot(y_train_dir, num_classes=2)
y_test_dir_one_hot = convert_to_one_hot(y_test_dir, num_classes=2)

def slice_last_timesteps(x):
    return x[:, :24, :]

# Define the LSTM model for direction prediction with fixed 24 time step outputs
model_direction = Sequential()
model_direction.add(LSTM(100, return_sequences=True, input_shape=(None, X_train.shape[-1])))  # Input shape matches features count
model_direction.add(Dropout(0.3))
model_direction.add(LSTM(100, return_sequences=True))
model_direction.add(Dropout(0.3))
model_direction.add(Lambda(slice_last_timesteps))  # Slice the first 24 time steps
model_direction.add(TimeDistributed(Dense(2, activation='softmax')))

# Compile the model
model_direction.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model_direction.fit(X_train, y_train_dir_one_hot, epochs=100, batch_size=32, validation_data=(X_test, y_test_dir_one_hot))



# # Build the LSTM model for recovery prediction
# model_recovery = Sequential()
# model_recovery.add(LSTM(50, return_sequences=True, input_shape=(None, 1)))  # Accept variable input lengths
# model_recovery.add(Dropout(0.2))
# model_recovery.add(LSTM(50))
# model_recovery.add(Dropout(0.2))
# model_recovery.add(Dense(48, activation='relu'))  # Prepare to reshape for 24 time step output
# model_recovery.add(Reshape((24, 2)))  # Reshape output to 24 time steps
# model_recovery.add(TimeDistributed(Dense(2, activation='softmax')))

# # Compile the model
# model_recovery.compile(optimizer=Adam(learning_rate=0.005), loss='categorical_crossentropy', metrics=['accuracy'])

# # Train the model
# model_recovery.fit(X_train, y_train_rec, epochs=100, batch_size=32, validation_data=(X_test, y_test_rec))

In [None]:
# 1. Make predictions on the test set
y_test_pred_prob = model_direction.predict(X_test)

# 2. Convert predictions and true labels from one-hot to class labels
y_test_pred = np.argmax(y_test_pred_prob, axis=-1)
y_test_true = np.argmax(y_test_dir_one_hot, axis=-1)

# Flatten arrays to make them 1D for metric calculations
y_test_pred_flat = y_test_pred.flatten()
y_test_true_flat = y_test_true.flatten()

# 3. Calculate evaluation metrics
accuracy = accuracy_score(y_test_true_flat, y_test_pred_flat)
precision = precision_score(y_test_true_flat, y_test_pred_flat)
recall = recall_score(y_test_true_flat, y_test_pred_flat)
f1 = f1_score(y_test_true_flat, y_test_pred_flat)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Direction Prediction Accuracy: 0.5575396418571472 (Changing Random Seed)

In [None]:
# Preprocess the data
data['direction_target'] = data['direction'].apply(lambda x: 1 if x == 'Up' else 0)
data['recovery_target'] = data['Recovery'].apply(lambda x: 1 if x == 'fast recovery' else 0)
data.drop(columns=['direction', 'Recovery'], inplace=True)

# Filter the data
input_data = data[data['day type'].isin(['prior day', 'outlier day'])]
output_data = data[data['day type'] == 'post day']

# Ensure the data is sorted by Date and Time
input_data = input_data.sort_values(by=['outlier_id', 'Date', 'Time'])
output_data = output_data.sort_values(by=['outlier_id', 'Date', 'Time'])

feature_columns = ['vw', 'c', 'daily_return', 'n', 'direction_target', 'CWT_Mean']

def create_sequences_per_outlier(input_data, output_data, feature_columns, seq_length=48):
    sequences = []
    direction_targets = []
    recovery_targets = []
    
    unique_ids = input_data['outlier_id'].unique()
    
    for oid in unique_ids:
        input_seq = input_data[input_data['outlier_id'] == oid]
        output_seq = output_data[output_data['outlier_id'] == oid]

        # Normalize features
        scaler = MinMaxScaler()
        input_features = scaler.fit_transform(input_seq[feature_columns])

        # Check if there are enough rows in the input data
        if len(input_features) >= seq_length:
            # Use only the last 48 data points of the input features
            input_features = input_features[-seq_length:]

            # Get direction and recovery targets for the first 24 data points of the post day
            if len(output_seq) >= 24:  # Ensure there are enough data points in output_seq
                directions = output_seq['direction_target'].iloc[:24].values
                recoveries = output_seq['recovery_target'].iloc[:24].values

                # Store sequences and targets as arrays
                sequences.append(input_features)
                direction_targets.append(directions)
                recovery_targets.append(recoveries)
            
    return sequences, np.array(direction_targets), np.array(recovery_targets)

# Create sequences
sequences, direction_targets, recovery_targets = create_sequences_per_outlier(input_data, output_data, feature_columns)

# Convert to numpy array for model input
X = np.array(sequences)

# Split the data into training and testing sets based on outlier IDs
outlier_ids = input_data['outlier_id'].unique()
train_ids, test_ids = train_test_split(outlier_ids, test_size=0.3, random_state=123)

# Create indices for the sequences
train_idx = [i for i, oid in enumerate(outlier_ids) if oid in train_ids]
test_idx = [i for i, oid in enumerate(outlier_ids) if oid in test_ids]

X_train = X[train_idx]
X_test = X[test_idx]
y_train_dir = direction_targets[train_idx]
y_test_dir = direction_targets[test_idx]
y_train_rec = recovery_targets[train_idx]
y_test_rec = recovery_targets[test_idx]