In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [4]:
def prepare_sequences(df, seq_len=72, label_len=72):
    """Build (input window, target window) pairs, one per outlier id.

    For every distinct value of ``df['outlier_id']`` (in order of first
    appearance) the rows are split by ``df['day type']``:

    * rows tagged ``'prior day'`` or ``'outlier day'`` form the input series,
    * rows tagged ``'post day'`` form the target series.

    An id contributes a sample only when both series are long enough; the
    input is the *last* ``seq_len`` values of column ``'c'`` and the target is
    the *first* ``label_len`` values. Rows are used in the order they appear in
    ``df``; no sorting is applied here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'outlier_id'``, ``'day type'`` and ``'c'``.
    seq_len : int, default 72
        Number of trailing prior/outlier-day values used as model input.
    label_len : int, default 72
        Number of leading post-day values used as the target.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Arrays of shape ``(n_samples, seq_len)`` and ``(n_samples, label_len)``.
        When no id qualifies the arrays are empty but still two-dimensional, so
        callers can reshape them without special-casing.

    Raises
    ------
    ValueError
        If ``seq_len`` or ``label_len`` is smaller than 1.
    """
    if seq_len < 1 or label_len < 1:
        # A zero length would make `[-seq_len:]` silently return the whole series.
        raise ValueError("seq_len and label_len must both be at least 1")

    input_day_types = ['prior day', 'outlier day']
    sequences = []
    labels = []

    # One pass over the frame; sort=False keeps ids in order of first appearance,
    # matching what iterating over `unique()` would produce.
    for _, group in df.groupby('outlier_id', sort=False):
        day_type = group['day type']
        prior_data = group.loc[day_type.isin(input_day_types), 'c'].values
        post_data = group.loc[day_type == 'post day', 'c'].values

        # Keep only ids that can supply a full input and a full target window.
        if len(prior_data) >= seq_len and len(post_data) >= label_len:
            sequences.append(prior_data[-seq_len:])
            labels.append(post_data[:label_len])

    if not sequences:
        # np.array([]) would be 1-D and break downstream `X.shape[1]` access.
        return np.empty((0, seq_len)), np.empty((0, label_len))

    return np.array(sequences), np.array(labels)

# Load the dataset (path is relative to the notebook's working directory).
df = pd.read_csv('USDCHF_hourly.csv')  # Make sure to load your actual data file

# Clean the data: drop the columns not used below, then fill gaps forward and finally
# backward so that leading NaNs are covered as well.
# ffill()/bfill() replace fillna(method=...), which pandas has deprecated.
df = df.drop(columns=['prev_close', 'returns', 'day']).ffill().bfill()

# Scale the close column into [0, 1] with ONE scaler fitted on the whole column
# (this is a global scaling, not a per-sequence normalisation).
# NOTE(review): the scaler is fitted before the train/test split, so it has seen the
# test rows; fit on training data only if a strict hold-out evaluation is required.
scaler = MinMaxScaler(feature_range=(0, 1))
df['c'] = scaler.fit_transform(df[['c']])

X, y = prepare_sequences(df)

# Fail early with a clear message instead of an opaque indexing error in the reshape below.
if len(X) == 0:
    raise ValueError("prepare_sequences returned no complete 72-step prior/post windows")

# Reshape for LSTM input: (samples, time steps, features)
X = X.reshape((X.shape[0], X.shape[1], 1))
y = y.reshape((y.shape[0], 72))  # 72 target steps per sample (the first 3 post days)

# Define the LSTM model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(72, 1)),  # Input sequences are 72 time steps
    LSTM(50),
    Dense(72)  # One output per predicted time step: 72 values, not 3
])

model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')

# Split data into train and test sets: first 80% of the samples (in the order produced by
# prepare_sequences) for training, the remainder for testing. No shuffling is applied.
train_size = int(len(X) * 0.80)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Train the model
# NOTE(review): the test split doubles as validation data, so the val_loss printed per
# epoch is the test loss rather than an independent validation signal.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, verbose=2)

# Make predictions
predictions = model.predict(X_test)

# Scale predictions and targets back to the original price units. The scaler was fitted
# on a single column, so pass the (samples, 72) arrays through a one-column view and
# restore the shape afterwards rather than relying on implicit broadcasting.
predictions = scaler.inverse_transform(predictions.reshape(-1, 1)).reshape(predictions.shape)
y_test = scaler.inverse_transform(y_test.reshape(-1, 1)).reshape(y_test.shape)

print("Model training complete and predictions made.")

  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)


Epoch 1/100


  super().__init__(**kwargs)


2/2 - 2s - 901ms/step - loss: 0.2544 - val_loss: 0.1141
Epoch 2/100
2/2 - 0s - 61ms/step - loss: 0.1121 - val_loss: 0.0232
Epoch 3/100
2/2 - 0s - 60ms/step - loss: 0.0417 - val_loss: 0.0407
Epoch 4/100
2/2 - 0s - 63ms/step - loss: 0.0276 - val_loss: 0.0166
Epoch 5/100
2/2 - 0s - 60ms/step - loss: 0.0261 - val_loss: 0.0175
Epoch 6/100
2/2 - 0s - 59ms/step - loss: 0.0224 - val_loss: 0.0165
Epoch 7/100
2/2 - 0s - 58ms/step - loss: 0.0196 - val_loss: 0.0053
Epoch 8/100
2/2 - 0s - 62ms/step - loss: 0.0124 - val_loss: 0.0034
Epoch 9/100
2/2 - 0s - 63ms/step - loss: 0.0096 - val_loss: 0.0051
Epoch 10/100
2/2 - 0s - 62ms/step - loss: 0.0079 - val_loss: 0.0096
Epoch 11/100
2/2 - 0s - 61ms/step - loss: 0.0081 - val_loss: 0.0030
Epoch 12/100
2/2 - 0s - 61ms/step - loss: 0.0072 - val_loss: 0.0022
Epoch 13/100
2/2 - 0s - 58ms/step - loss: 0.0059 - val_loss: 0.0026
Epoch 14/100
2/2 - 0s - 58ms/step - loss: 0.0049 - val_loss: 9.1373e-04
Epoch 15/100
2/2 - 0s - 59ms/step - loss: 0.0044 - val_loss: 8.2

In [5]:
# Score the test-set predictions against the targets; both arrays were mapped back to
# the original price scale in the training cell, so the errors are in price units.
mse, mae, r2 = (
    metric(y_test, predictions)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above

# Emit the four metrics, one per line.
print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"Root Mean Squared Error: {rmse}",
    f"R-squared: {r2}",
    sep="\n",
)

Mean Squared Error: 5.2965785380431166e-05
Mean Absolute Error: 0.00544120123938871
Root Mean Squared Error: 0.0072777596401936194
R-squared: 0.9394305414867303


In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense
# from tensorflow.keras.optimizers import Adam

# # Function to prepare sequences for training
# def prepare_sequences_train(df):
#     sequences = []
#     labels = []
    
#     unique_ids = df['outlier_id'].unique()
    
#     for oid in unique_ids:
#         prior_data = df[(df['outlier_id'] == oid) & (df['day type'] == 'prior day')]['c'].values
#         post_data = df[(df['outlier_id'] == oid) & (df['day type'] == 'post day')]['c'].values
        
#         if len(prior_data) == 72 and len(post_data) >= 72:
#             sequences.append(prior_data)
#             labels.append(post_data[:72])
#         else:
#             print(f"Skipping outlier_id {oid} due to insufficient prior or post data")
#     return np.array(sequences), np.array(labels)

# # Load and prepare training data
# def train_model(file_path):
#     df_train = pd.read_csv(file_path) # Augmented Data csv file
#     scaler = MinMaxScaler(feature_range=(0, 1))
#     df_train['c'] = scaler.fit_transform(df_train[['c']])
    
#     X_train, y_train = prepare_sequences_train(df_train)
#     X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
#     y_train = y_train.reshape((y_train.shape[0], 72))
    
#     model = Sequential([
#         LSTM(50, return_sequences=True, input_shape=(72, 1)),
#         LSTM(50),
#         Dense(72)
#     ])
#     model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')
#     model.fit(X_train, y_train, epochs=100, batch_size=64, verbose=2)
#     return model, scaler

# # Function to prepare a single sequence for prediction
# def prepare_sequence_predict(df_prior, scaler):
#     prior_data = df_prior['c'].values # "df_prior" should be the dataframe containing prior day's data for the current outlier.
#     prior_data = scaler.transform(prior_data.reshape(-1, 1)).flatten()
#     return prior_data.reshape(1, 72, 1)

# # Function to predict using the model
# def predict(model, df_prior, scaler):
#     X_test = prepare_sequence_predict(df_prior, scaler)
#     predictions = model.predict(X_test)
#     return scaler.inverse_transform(predictions.reshape(-1, 1))

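# # Function to validate predictions
# # NOTE: predict() above returns values already inverse-transformed to price units, while
# # this function scales df_post before comparing. Put both sides on the same scale
# # (either both scaled or both in price units) before trusting the metrics below.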
# def validate_predictions(predictions, df_post, scaler):
#     post_data = df_post['c'].values[:72] # "df_post" should be the dataframe containing post day's data for the current outlier.
#     post_data_scaled = scaler.transform(post_data.reshape(-1, 1)).flatten()
    
#     mse = mean_squared_error(post_data_scaled, predictions.flatten())
#     mae = mean_absolute_error(post_data_scaled, predictions.flatten())
#     rmse = np.sqrt(mse)
#     r2 = r2_score(post_data_scaled, predictions.flatten())

#     print(f"Mean Squared Error: {mse}")
#     print(f"Mean Absolute Error: {mae}")
#     print(f"Root Mean Squared Error: {rmse}")
#     print(f"R-squared: {r2}")

# # Example usage:
# model, scaler = train_model('train_data.csv')
# predictions = predict(model, df_prior, scaler)
# validate_predictions(predictions, df_post, scaler)