In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [2]:
# Read the dataset
cleaned_df = pd.read_csv('USDCHF_hourly.csv')
cleaned_df.fillna(method='bfill', inplace=True)

cleaned_df

  cleaned_df.fillna(method='bfill', inplace=True)


Unnamed: 0.1,Unnamed: 0,date,v,vw,o,c,h,l,n,prev_close,returns,outlier_id,day type,day
0,0,2010-05-19 00:00:00,9373.0,1.1507,1.15299,1.15113,1.153020,1.14899,9373.0,1.15113,-0.001520,1,prior day,Wednesday
1,1,2010-05-19 01:00:00,6584.0,1.1502,1.15114,1.14938,1.152190,1.14846,6584.0,1.15113,-0.001520,1,prior day,Wednesday
2,2,2010-05-19 02:00:00,4968.0,1.1489,1.14939,1.14758,1.149720,1.14758,4968.0,1.14938,-0.001566,1,prior day,Wednesday
3,3,2010-05-19 03:00:00,6480.0,1.1476,1.14753,1.14893,1.149590,1.14648,6480.0,1.14758,0.001176,1,prior day,Wednesday
4,4,2010-05-19 04:00:00,4950.0,1.1487,1.14893,1.14875,1.149810,1.14776,4950.0,1.14893,-0.000157,1,prior day,Wednesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22957,22957,2023-03-13 20:00:00,5483.0,0.9119,0.91153,0.91160,0.912410,0.91100,5483.0,0.91152,0.000088,140,post day,Monday
22958,22958,2023-03-13 21:00:00,1016.0,0.9115,0.91160,0.91170,0.912245,0.91050,1016.0,0.91160,0.000110,140,post day,Monday
22959,22959,2023-03-13 22:00:00,1750.0,0.9116,0.91150,0.91180,0.912110,0.91064,1750.0,0.91170,0.000110,140,post day,Monday
22960,22960,2023-03-13 23:00:00,2798.0,0.9122,0.91201,0.91253,0.912800,0.91148,2798.0,0.91180,0.000801,140,post day,Monday


In [3]:
sorted_df = cleaned_df.sort_values(by=['outlier_id', 'date'], ascending=True)

sorted_df

Unnamed: 0.1,Unnamed: 0,date,v,vw,o,c,h,l,n,prev_close,returns,outlier_id,day type,day
0,0,2010-05-19 00:00:00,9373.0,1.1507,1.15299,1.15113,1.153020,1.14899,9373.0,1.15113,-0.001520,1,prior day,Wednesday
1,1,2010-05-19 01:00:00,6584.0,1.1502,1.15114,1.14938,1.152190,1.14846,6584.0,1.15113,-0.001520,1,prior day,Wednesday
2,2,2010-05-19 02:00:00,4968.0,1.1489,1.14939,1.14758,1.149720,1.14758,4968.0,1.14938,-0.001566,1,prior day,Wednesday
3,3,2010-05-19 03:00:00,6480.0,1.1476,1.14753,1.14893,1.149590,1.14648,6480.0,1.14758,0.001176,1,prior day,Wednesday
4,4,2010-05-19 04:00:00,4950.0,1.1487,1.14893,1.14875,1.149810,1.14776,4950.0,1.14893,-0.000157,1,prior day,Wednesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22957,22957,2023-03-13 20:00:00,5483.0,0.9119,0.91153,0.91160,0.912410,0.91100,5483.0,0.91152,0.000088,140,post day,Monday
22958,22958,2023-03-13 21:00:00,1016.0,0.9115,0.91160,0.91170,0.912245,0.91050,1016.0,0.91160,0.000110,140,post day,Monday
22959,22959,2023-03-13 22:00:00,1750.0,0.9116,0.91150,0.91180,0.912110,0.91064,1750.0,0.91170,0.000110,140,post day,Monday
22960,22960,2023-03-13 23:00:00,2798.0,0.9122,0.91201,0.91253,0.912800,0.91148,2798.0,0.91180,0.000801,140,post day,Monday


In [4]:
# Initialize empty lists to store the train and test sets
train_dfs = []
test_dfs = []

# Iterate over each group of 'outlier_id'
for outlier_id, group in sorted_df.groupby('outlier_id'):
    # Select the first 96 data points for the train set
    train = group.iloc[:96]
    # Select the last 72 data points for the test set
    test = group.iloc[-72:]
    
    # Append the results to the lists
    train_dfs.append(train)
    test_dfs.append(test)

# Concatenate all the train and test sets from each group
train_set = pd.concat(train_dfs)
test_set = pd.concat(test_dfs)

# Reset index if needed
train_set = train_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)


In [7]:
final_dict = {}
sequence_length = 3  # Example sequence length for LSTM model
rmses = []

for outlier_id in train_set["outlier_id"].unique():
    # Filter data for the current outlier_id
    train_df = train_set[train_set["outlier_id"] == outlier_id].set_index("date")
    train_df.index = pd.to_datetime(train_df.index)  # Convert index to DateTimeIndex
    test_df = test_set[test_set["outlier_id"] == outlier_id]

    # Normalize data
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train_df[["c"]])
    
    # Prepare data for LSTM model
    train_generator = TimeseriesGenerator(train_scaled, train_scaled, length=sequence_length, batch_size=1)
    
    # Define and compile LSTM model
    model = Sequential([
        LSTM(64, activation='relu', input_shape=(sequence_length, 1)),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Fit the model
    model.fit(train_generator, epochs=20, verbose=0)
    
    # Prepare last sequence for forecasting
    last_sequence = train_scaled[-sequence_length:]
    
    # Iteratively forecast the next 3 steps
    forecast_steps = 3
    predictions_scaled = []
    for _ in range(forecast_steps):
        # Reshape the last sequence for prediction
        last_sequence_reshaped = last_sequence.reshape((1, sequence_length, 1))
        # Predict the next step and append to predictions
        next_step_pred = model.predict(last_sequence_reshaped, verbose=0)
        predictions_scaled.append(next_step_pred.ravel()[0])
        # Update the last sequence with the prediction
        last_sequence = np.roll(last_sequence, -1)
        last_sequence[-1] = next_step_pred
    
    # Inverse transform predictions
    predictions_inv = scaler.inverse_transform(np.array(predictions_scaled).reshape(-1, 1))
    
    # Compute RMSE for each outlier_id
    mse = mean_squared_error(test_df["c"].values[:forecast_steps], predictions_inv)
    rmse = np.sqrt(mse)
    rmses.append(rmse)

  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()




  super().__init__(**kwargs)
  self._warn_if_super_not_called()




  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._warn_if_super_not_called()
  super().__init__(**kwargs)
  self._war

In [8]:
rmse = pd.DataFrame(rmses)

rmse

Unnamed: 0,0
0,0.000730
1,0.002016
2,0.002482
3,0.001407
4,0.000942
...,...
135,0.002679
136,0.000652
137,0.005474
138,0.001113


In [9]:
rmse.to_csv("rmse_3steps.csv")