In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Masking, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import Adam


# Load the dataset
data = pd.read_csv('/Users/priyakundu/Downloads/USDCHF_hourly_20.csv')
data.fillna(method='ffill', inplace=True)
data.fillna(method='bfill', inplace=True)

# Encode categorical variables before splitting the data
le_direction = LabelEncoder()
le_recovery = LabelEncoder()
data['direction_encoded'] = le_direction.fit_transform(data['direction'])
data['recovery_encoded'] = le_recovery.fit_transform(data['Recovery'])

# Filter data based on 'day type'
data_prior = data[data['day type'] == 'prior day']
data_post = data[data['day type'] == 'post day']

# Group and create sequences
grouped_prior = data_prior.groupby('outlier_id').apply(lambda x: x[['vw', 'o', 'c', 'h', 'l', 'n', 'prev_close', 'daily_return', 'abs_daily_return', 'pct_change', 'direction_encoded', 'recovery_encoded']].values)
grouped_post = data_post.groupby('outlier_id').apply(lambda x: x[['direction_encoded', 'recovery_encoded']].values)
sequences_prior = [group for group in grouped_prior]
sequences_post = [group for group in grouped_post]  # Get all columns for output including encoded ones

# Calculate the maximum sequence length across both inputs and outputs
max_seq_length_inputs = max(len(seq) for seq in sequences_prior)
max_seq_length_outputs = max(len(seq) for seq in sequences_post)
max_seq_length = max(max_seq_length_inputs, max_seq_length_outputs)

# Pad both inputs and outputs to the maximum sequence length
padded_inputs = pad_sequences(sequences_prior, maxlen=max_seq_length, padding='post', dtype='float32', value=-1)
padded_outputs = pad_sequences(sequences_post, maxlen=max_seq_length, padding='post', dtype='float32', value=-1)

# Split data into training and testing sets
train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(padded_inputs, padded_outputs, test_size=0.3, random_state=42)

# Model definition
input_layer = Input(shape=(None, padded_inputs.shape[2]))
mask_layer = Masking(mask_value=-1)(input_layer)
lstm_layer = LSTM(50, return_sequences=True)(mask_layer)
output_layer_direction = TimeDistributed(Dense(1, activation='sigmoid'), name='direction')(lstm_layer)
output_layer_recovery = TimeDistributed(Dense(1, activation='sigmoid'), name='recovery')(lstm_layer)

model = Model(inputs=input_layer, outputs=[output_layer_direction, output_layer_recovery])

# Ensuring the output shape matches the target shape
model.compile(optimizer=Adam(learning_rate=0.005), 
              loss='binary_crossentropy', 
              metrics={'direction': ['accuracy'],
                       'recovery': ['accuracy']})

# Make sure targets are the right shape
train_outputs_dir = train_outputs[:,:,0].reshape(-1, train_outputs.shape[1], 1)
train_outputs_rec = train_outputs[:,:,1].reshape(-1, train_outputs.shape[1], 1)
test_outputs_dir = test_outputs[:,:,0].reshape(-1, test_outputs.shape[1], 1)
test_outputs_rec = test_outputs[:,:,1].reshape(-1, test_outputs.shape[1], 1)

# Fit the model
model.fit(train_inputs, [train_outputs_dir, train_outputs_rec], 
          batch_size=32, epochs=200, 
          validation_data=(test_inputs, [test_outputs_dir, test_outputs_rec]))

  data.fillna(method='ffill', inplace=True)
  data.fillna(method='bfill', inplace=True)
  grouped_prior = data_prior.groupby('outlier_id').apply(lambda x: x[['vw', 'o', 'c', 'h', 'l', 'n', 'prev_close', 'daily_return', 'abs_daily_return', 'pct_change', 'direction_encoded', 'recovery_encoded']].values)
  grouped_post = data_post.groupby('outlier_id').apply(lambda x: x[['direction_encoded', 'recovery_encoded']].values)


Epoch 1/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 574ms/step - direction_accuracy: 0.1746 - loss: 1.9747 - recovery_accuracy: 0.0068 - val_direction_accuracy: 0.1886 - val_loss: 1.0937 - val_recovery_accuracy: 0.3641
Epoch 2/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 184ms/step - direction_accuracy: 0.1778 - loss: 0.8982 - recovery_accuracy: 0.3426 - val_direction_accuracy: 0.1886 - val_loss: -0.0910 - val_recovery_accuracy: 0.3641
Epoch 3/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 139ms/step - direction_accuracy: 0.1725 - loss: -0.1019 - recovery_accuracy: 0.3339 - val_direction_accuracy: 0.1886 - val_loss: -0.4235 - val_recovery_accuracy: 0.3641
Epoch 4/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 172ms/step - direction_accuracy: 0.1812 - loss: -0.3458 - recovery_accuracy: 0.3526 - val_direction_accuracy: 0.1886 - val_loss: -0.8048 - val_recovery_accuracy: 0.3641
Epoch 5/200
[1m7/7[0m [32m━

<keras.src.callbacks.history.History at 0x2900b1a60>

In [2]:
# Evaluate the model on the test set
test_metrics = model.evaluate(test_inputs, [test_outputs_dir, test_outputs_rec], verbose=0)
print("Test Set Evaluation:")

# Mapping the metrics names to their values
metrics_names = ['total_loss'] + [f"{name}_{metric}" for name in model.output_names for metric in ['accuracy']]
for name, value in zip(metrics_names, test_metrics):
    print(f"{name}: {value:.4f}")

Test Set Evaluation:
total_loss: -4.4852
direction_accuracy: 0.1886
recovery_accuracy: 0.3641


### For daily data points:
Test Set Evaluation:
total_loss: 0.9349
direction_accuracy: 0.3056
recovery_accuracy: 0.2976

### For Hourly data points - 10 outliers
Test Set Evaluation:
total_loss: -3.6716
direction_accuracy: 0.2141
recovery_accuracy: 0.4048

### For Hourly data points - 20 outliers
Test Set Evaluation:
total_loss: -3.3348
direction_accuracy: 0.1886
recovery_accuracy: 0.3641
