In [3]:
import torch
import gpytorch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
from tqdm import tqdm

# Prepare Data

In [4]:
# Directory holding the CSV files for the study window (one array slice per file).
data_dir = "../data/Jan1524_Jan2224/"
# os.listdir returns entries in arbitrary order; sort so that the per-file axis
# of X and Y is laid out identically on every run and machine.
data_files = sorted(file for file in os.listdir(data_dir) if file.endswith(".csv"))
data = []
for file in data_files:
    df = pd.read_csv(os.path.join(data_dir, file), index_col=0)
    data.append(df.loc[:, ['pm25', 'longitude', 'latitude']].to_numpy())
# Stacked as (file, row, column); swap the first two axes -> (row, file, column).
# NOTE(review): stacking assumes every CSV has the same number of rows.
data = np.array(data).transpose(1, 0, 2)
X = data[:, :, 1:]  # columns 'longitude', 'latitude'
Y = data[:, :, 0]   # column 'pm25'
print(X.shape, Y.shape)

(10082, 8, 2) (10082, 8)


# Interpolation Algorithm

In [5]:
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP model using a constant mean and a scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood):
        """Pass the training data and likelihood to ExactGP, then build the mean and kernel modules."""
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the MultivariateNormal formed from the mean and covariance modules evaluated at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

# Leave-One-Out Cross-Validation (LOOCV)

In [6]:
# Sizes of the first two axes of X, used as loop bounds by the cross-validation cell.
n_steps = X.shape[0]
n_stations = X.shape[1]

In [27]:
# Leave-one-out cross-validation: for a random sample of time steps, hold out
# each station in turn, fit a GP on the remaining stations and score the
# prediction at the held-out location.
SEED = 0               # fixes which time steps are sampled so reruns are comparable
N_SAMPLED_STEPS = 120  # number of randomly selected time steps to evaluate
training_iter = 10000  # Adam steps used to fit each GP (same for every fold)

rng = np.random.default_rng(SEED)
RMSEs = []
MAEs = []
failed_folds = []  # (time step, station index, error repr) for folds that raised
for t in tqdm(rng.permutation(n_steps)[:N_SAMPLED_STEPS]):
    for i in range(n_stations):
        try:
            # Leave one out split
            X_train = torch.from_numpy(np.concatenate((X[t, :i], X[t, i+1:]), axis=0))
            X_test = torch.from_numpy(X[t, i:i+1])
            Y_train = torch.from_numpy(np.concatenate((Y[t, :i], Y[t, i+1:]), axis=0))
            Y_test = torch.from_numpy(Y[t, i:i+1])
            # prepare training
            likelihood = gpytorch.likelihoods.GaussianLikelihood()
            model = ExactGPModel(X_train, Y_train, likelihood)
            model.train()
            likelihood.train()
            optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
            mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
            # training: minimise the negative marginal log likelihood
            for _ in range(training_iter):
                optimizer.zero_grad()
                output = model(X_train)
                loss = -mll(output, Y_train)
                loss.backward()
                optimizer.step()
            # evaluation (no gradients are needed for prediction)
            model.eval()
            likelihood.eval()
            with torch.no_grad():
                Y_pred = model(X_test)
                Y_pred_mean = Y_pred.mean.detach()
            # NOTE: the test set is a single point, so this per-fold RMSE is
            # numerically the absolute error, i.e. identical to `mae` below.
            rmse = torch.sqrt(torch.mean((Y_pred_mean - Y_test) ** 2))
            mae = torch.mean(torch.abs(Y_pred_mean - Y_test))
            RMSEs.append(rmse)
            MAEs.append(mae)
        except Exception as err:
            # Best effort: skip folds that fail, but keep a record instead of
            # hiding them (and let KeyboardInterrupt stop the run).
            failed_folds.append((int(t), i, repr(err)))
            continue

if failed_folds:
    print(f"Skipped {len(failed_folds)} fold(s) that raised; first: {failed_folds[0]}")
            

100%|██████████| 120/120 [3:00:40<00:00, 90.34s/it]   


In [29]:
# Average of the per-fold RMSE values collected by the cross-validation loop.
torch.stack(RMSEs).mean()

tensor(2.2488, dtype=torch.float64)