In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import tqdm

In [4]:
# Load in data
with open('OQ_lists.pkl', 'rb') as f:
    data = pickle.load(f)

test = data[0]
train = data[1]
validate = data[2]

In [2]:
from pykalman import KalmanFilter

In [5]:
np_train = []
for tm in train:
    tm_np = np.array(tm)
    np_train.append(tm_np)

In [6]:
hacky = [np.concatenate([tm, np.ma.masked_equal(np.zeros(100), 0)]) for tm in np_train]
hacky = np.concatenate(hacky)

In [7]:
val_np = []
for val in validate:
    val_np.append(np.array(val))

In [8]:
# Create a KalmanFilter object
kf5 = KalmanFilter(n_dim_state=5,n_dim_obs=1)


# Fit the Kalman filter to the observations using EM
kf5.em(hacky, n_iter=5)


# Retrieve the estimated matrices
F5 = kf5.transition_matrices
H5 = kf5.observation_matrices
Q5 = kf5.transition_covariance
R5 = kf5.observation_covariance

print("Estimated Transition Matrix (F):")
print(F5)

print("\nEstimated Observation Matrix (H):")
print(H5)

print("\nEstimated Process Noise Covariance Matrix (Q):")
print(Q5)

print("\nEstimated Observation Noise Covariance Matrix (R):")
print(R5)

Estimated Transition Matrix (F):
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]

Estimated Observation Matrix (H):
[[1. 0. 0. 0. 0.]]

Estimated Process Noise Covariance Matrix (Q):
[[41.62418218  0.          0.          0.          0.        ]
 [ 0.          1.          0.          0.          0.        ]
 [ 0.          0.          1.          0.          0.        ]
 [ 0.          0.          0.          1.          0.        ]
 [ 0.          0.          0.          0.          1.        ]]

Estimated Observation Noise Covariance Matrix (R):
[[37.66090425]]


In [9]:
# Predict using the Kalman Filter
filtered_state_means5, _ = kf5.filter(hacky)
# Randomly select nine time series from the validation data
selected_time_series = np.random.choice(val_np, 9, replace=False)

# Perform prediction for each selected time series
for tm in selected_time_series:
    hidden_state_mean, _ = kf5.filter(tm)
    sample = kf5.sample(1, initial_state=hidden_state_mean[-1])
    print(tm)
    print(sample[1][0])

[75. 91.]
[72.34315888]
[68. 65. 63. 66. 62. 74. 70. 74. 65. 68. 69. 70. 66.]
[60.87650257]
[80. 73.]
[74.3462729]
[82. 83. 90. 85. 79. 77.]
[76.75383663]
[36. 24. 13. 20. 49. 11. 36. 40. 17. 95. 73. 23. 35.]
[24.56187504]
[110.  85.  93.  81.]
[82.78005403]
[68.]
[53.46587327]
[53. 50. 50. 38. 32.]
[33.35171051]
[74. 84.]
[67.71024424]


  selected_time_series = np.random.choice(val_np, 9, replace=False)


In [10]:
# Create a KalmanFilter object
kf7 = KalmanFilter(n_dim_state=7,n_dim_obs=1)


# Fit the Kalman filter to the observations using EM
kf7.em(hacky, n_iter=5)


# Retrieve the estimated matrices
F7 = kf7.transition_matrices
H7 = kf7.observation_matrices
Q7 = kf7.transition_covariance
R7 = kf7.observation_covariance

print("Estimated Transition Matrix (F):")
print(F7)

print("\nEstimated Observation Matrix (H):")
print(H7)

print("\nEstimated Process Noise Covariance Matrix (Q):")
print(Q7)

print("\nEstimated Observation Noise Covariance Matrix (R):")
print(R7)

# Predict using the Kalman Filter
filtered_state_means7, _ = kf7.filter(hacky)
# Randomly select nine time series from the validation data
selected_time_series = np.random.choice(val_np, 9, replace=False)

# Perform prediction for each selected time series
for tm in selected_time_series:
    hidden_state_mean, _ = kf7.filter(tm)
    sample = kf7.sample(1, initial_state=hidden_state_mean[-1])
    print(tm)
    print(sample[1][0])

Estimated Transition Matrix (F):
[[1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1.]]

Estimated Observation Matrix (H):
[[1. 0. 0. 0. 0. 0. 0.]]

Estimated Process Noise Covariance Matrix (Q):
[[41.62418218  0.          0.          0.          0.          0.
   0.        ]
 [ 0.          1.          0.          0.          0.          0.
   0.        ]
 [ 0.          0.          1.          0.          0.          0.
   0.        ]
 [ 0.          0.          0.          1.          0.          0.
   0.        ]
 [ 0.          0.          0.          0.          1.          0.
   0.        ]
 [ 0.          0.          0.          0.          0.          1.
   0.        ]
 [ 0.          0.          0.          0.          0.          0.
   1.        ]]

Estimated Observation Noise Covariance Matrix (R):
[[37.66090425]]
[68. 85. 87. 69.]
[82.10122281]
[38. 35.]
[51.51965706

  selected_time_series = np.random.choice(val_np, 9, replace=False)


In [11]:
# Create a KalmanFilter object
kf10 = KalmanFilter(n_dim_state=10,n_dim_obs=1)


# Fit the Kalman filter to the observations using EM
kf10.em(hacky, n_iter=5)


# Retrieve the estimated matrices
F10 = kf10.transition_matrices
H10 = kf10.observation_matrices
Q10 = kf10.transition_covariance
R10 = kf10.observation_covariance

print("Estimated Transition Matrix (F):")
print(F10)

print("\nEstimated Observation Matrix (H):")
print(H10)

print("\nEstimated Process Noise Covariance Matrix (Q):")
print(Q10)

print("\nEstimated Observation Noise Covariance Matrix (R):")
print(R10)

# Predict using the Kalman Filter
filtered_state_means10, _ = kf10.filter(hacky)
# Randomly select nine time series from the validation data
selected_time_series = np.random.choice(val_np, 9, replace=False)

# Perform prediction for each selected time series
for tm in selected_time_series:
    hidden_state_mean, _ = kf10.filter(tm)
    sample = kf10.sample(1, initial_state=hidden_state_mean[-1])
    print(tm)
    print(sample[1][0])

Estimated Transition Matrix (F):
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]

Estimated Observation Matrix (H):
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

Estimated Process Noise Covariance Matrix (Q):
[[41.62418218  0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          1.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.          1.          0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.          0.          1.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          

  selected_time_series = np.random.choice(val_np, 9, replace=False)


In [12]:
import numpy as np
import pickle

def get_prediction_mse(model, datafile='eval_data.pkl'):
    """
    Given an object (model) that implements a predict function, calculate the MSE using the eval data.
    Predict function must take in a numpy array of previous OQs and guess the next one as a float.
    """
    errors = []
    
    # Load evaluation data
    with open(datafile, 'rb') as file:
        eval_data_df = pickle.load(file)
    
    # Iterate over each row in the evaluation data
    for _, row in eval_data_df.iterrows():
        # Extract previous observations and true next observation
        previous_obs = np.array(row['Previous'])
        true_next_obs = row['Next']
        
        # Get predicted next observation from the model
        predicted_next_obs = model.predict(previous_obs)
        
        # Calculate squared error and append to errors list
        error = (predicted_next_obs - true_next_obs) ** 2
        errors.append(error)
    
    # Calculate the mean squared error
    mse = np.mean(errors)
    
    return mse


In [15]:
class KalmanFilterWrapper:
    def __init__(self, kf):
        self.kf = kf
    
    def predict(self, previous_obs):
        # Use the Kalman filter object to predict the next observation
        hidden_state_mean, _ = self.kf.filter(previous_obs)
        predicted_next_obs = self.kf.sample(1, initial_state=hidden_state_mean[-1])[1][0]
        return predicted_next_obs



# Print the calculated MSE
print("Mean Squared Error 5 dimensions:", get_prediction_mse(KalmanFilterWrapper(kf5)))
print("Mean Squared Error 7 dimensions:", get_prediction_mse(KalmanFilterWrapper(kf7)))
print("Mean Squared Error 10 dimensions:", get_prediction_mse(KalmanFilterWrapper(kf10)))


Mean Squared Error 5 dimensions: 350.0620642268017
Mean Squared Error 7 dimensions: 351.8625629385364
Mean Squared Error 10 dimensions: 348.53526920462696
