In [2]:
import torch
import gpytorch
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [65]:
data_dir = "../data/Oct0123_Dec3123/"
data_files = [file for file in os.listdir(data_dir) if file.endswith(".csv")]
data = []
for file in data_files:
    df = pd.read_csv(data_dir + file, index_col=0)
    df.loc[df["pm25"] < 0, "pm25"] = 0
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue
    if df.isnull().values.any():
        print("One nan dropped")
        continue
    
    # decompose timestamp
    # decompose timestamp
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["weekday"] = df["timestamp"].dt.weekday
    
    df = df.loc[:, ["longitude", "latitude", "celsius", "humidity", "pressure", "month", "day", "weekday", "hour", "pm25"]]
    df = df.groupby(["month", "day", "hour", "weekday"]).mean().reset_index(drop=False)

    if len(df) < ((31 + 30 + 31) * 24):
        continue
    else:
        data.append(df)
data = np.array(data).transpose(1, 0, 2)
X = data[:, :, :-1]
Y = data[:, :, -1]
print(X.shape, Y.shape)


One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One nan dropped
One outlier dropped
One outlier dropped
One outlier dropped
One nan dropped
One outlier dropped
One outlier dropped
(2208, 35, 9) (2208, 35)


# Construct GP model

In [66]:
class LocalPeriodicKernel(gpytorch.kernels.Kernel):
    is_stationary = True

    def __init__(self, lp_ard=None, **kwargs):
        super().__init__(**kwargs)
        if lp_ard is not None:
            self.periodickernel = gpytorch.kernels.PeriodicKernel(arg_num_dims=lp_ard)
            self.rbfkernel = gpytorch.kernels.RBFKernel(arg_num_dims=lp_ard)
        else:
            self.periodickernel = gpytorch.kernels.PeriodicKernel()
            self.rbfkernel = gpytorch.kernels.RBFKernel()
        self.localperiodickernel = self.periodickernel * self.rbfkernel

    #kernel function
    def forward(self, x1, x2, **params):
        return self.localperiodickernel(x1, x2, **params)
    
class BaseKernel(gpytorch.kernels.Kernel):
    def __init__(self, matern_ard=None, lp_ard=None, **kwargs):
        super().__init__(**kwargs)
        if matern_ard is not None:
            self.maternkernel = gpytorch.kernels.MaternKernel(nu=0.5,ard_num_dims=matern_ard)
        else:
            self.maternkernel = gpytorch.kernels.MaternKernel(nu=0.5)
        if lp_ard is not None:
            self.localperiodickernel = LocalPeriodicKernel(lp_ard=lp_ard)
        else:
            self.localperiodickernel = LocalPeriodicKernel()

    def forward(self, x1, x2, **params):
        # separate input into conutinuous and periodic components
        x1_per = x1[:, :4]
        x1_cont = x1[:, 4:]
        x2_per = x2[:, :4]
        x2_cont = x2[:, 4:]
        return self.maternkernel(x1_cont, x2_cont, **params) * self.localperiodickernel(x1_per, x2_per, **params)

class GlobalKernel(gpytorch.kernels.Kernel):
    is_stationary = True

    def __init__(self, matern_ard=None, lp_ard=None, **kwargs):
        super().__init__(**kwargs)

        # base kernel
        self.basekernel = BaseKernel(matern_ard=matern_ard, lp_ard=lp_ard)

        # scale kernel
        self.scalekernel = gpytorch.kernels.ScaleKernel(self.basekernel)

    
    def forward(self, x1, x2, **params):
        return self.scalekernel(x1, x2, **params)
    

class AirGP(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, matern_ard=None, lp_ard=None,):
        super(AirGP, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = GlobalKernel(matern_ard=matern_ard, lp_ard=lp_ard)

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Training

In [67]:
n_steps, n_stations = X.shape[:2]

In [85]:
Y_pred_all = []
Y_true_all = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Training on", device)
for t in tqdm(range(n_steps)):
    i = np.random.randint(n_stations)
    # Leave one out split
    X_train = torch.from_numpy(np.concatenate((X[t, :i], X[t, i+1:]), axis=0)).to(device)
    X_test = torch.from_numpy(X[t, i:i+1]).to(device)
    Y_train = torch.from_numpy(np.concatenate((Y[t, :i], Y[t, i+1:]), axis=0)).to(device)
    Y_test = torch.from_numpy(Y[t, i:i+1]).to(device)
    # prepare training
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = AirGP(X_train, Y_train, likelihood, matern_ard=5, lp_ard=4).to(device)
    training_iter = 1000
    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    # training
    for iter in range(training_iter):
        model.train()
        likelihood.train()
        optimizer.zero_grad()
        output = model(X_train)
        loss = -mll(output, Y_train)
        loss.backward()
        optimizer.step()
    # evaluation
    model.eval()
    likelihood.eval()
    Y_pred = model(X_test)
    Y_pred_mean = Y_pred.mean.detach()
    Y_pred_all.append(Y_pred_mean.cpu().numpy())
    Y_true_all.append(Y_test.cpu().numpy())

Training on cuda


100%|██████████| 2208/2208 [7:16:41<00:00, 11.87s/it]  


In [86]:
Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
print("RMSE:", RMSE)
print("CVRMSE:", CVRMSE)
print("MAE:", MAE)

RMSE: 4.925101109062394
CVRMSE: 0.3489731707483556
MAE: 3.2811890430160844


In [87]:
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("R2:", R2)

R2: 0.7779202693444107


In [51]:
model.train_inputs[0][29]

tensor([  10.0000,   23.0000,    2.0000,    0.0000, -119.7161,   36.8190,
              nan,       nan,       nan], dtype=torch.float64)

In [64]:
np.isnan(data).sum()

6624

In [63]:
data.shape

(2208, 36, 10)