In [1]:
import torch
import gpytorch
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

# Prepare Data

In [28]:
data_dir = "../../InterpolationBaseline/data/Oct0123_Dec3123/"
data_files = [file for file in os.listdir(data_dir) if file.endswith(".csv")]
data_oct = []
data_nov = []
data_dec = []

for file in data_files:
    df = pd.read_csv(data_dir + file)
    df.loc[df["pm25"] < 0, "pm25"] = 0

    # remove outliers
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue
    
    # decompose timestamp
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="mixed")
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["weekday"] = df["timestamp"].dt.weekday
    
    df = df.loc[:, ["longitude", "latitude", "celsius", "humidity", "pressure",
                    "month", "day", "weekday", "hour", "pm25"]]
    df = df.groupby(["month", "day", "weekday", "hour"]).mean().reset_index(drop=False)

    if df.isnull().values.any():
        print("One nan dropped")
        continue

    if len(df) < 24 * (31 + 30 + 31):
        print("Missing data")
        continue
    else:
        df_oct = df.loc[df["month"] == 10]
        df_nov = df.loc[df["month"] == 11]
        df_dec = df.loc[df["month"] == 12]
        data_oct.append(df_oct.to_numpy())
        data_nov.append(df_nov.to_numpy())
        data_dec.append(df_dec.to_numpy())

data_oct = np.array(data_oct).transpose(1, 0, 2)
data_nov = np.array(data_nov).transpose(1, 0, 2)
data_dec = np.array(data_dec).transpose(1, 0, 2)
print(data_oct.shape, data_nov.shape, data_dec.shape)

One outlier dropped
One outlier dropped
Missing data
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
Missing data
One outlier dropped
Missing data
One outlier dropped
One outlier dropped
Missing data
One nan dropped
One nan dropped
One outlier dropped
(744, 35, 10) (720, 35, 10) (744, 35, 10)


In [40]:
data_dir = "../../InterpolationBaseline/data/Jan0124_Jan2924/"
data_files = [file for file in os.listdir(data_dir) if file.endswith(".csv")]
data_jan = []
for file in data_files:
    df = pd.read_csv(data_dir + file)
    df.loc[df["pm25"] < 0, "pm25"] = 0

    # remove outliers
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue
    
    # decompose timestamp
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="mixed")
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["weekday"] = df["timestamp"].dt.weekday
    
    df = df.loc[:, ["longitude", "latitude", "celsius", "humidity", "pressure",
                    "month", "day", "weekday", "hour", "pm25"]]
    df = df.groupby(["month", "day", "weekday", "hour"]).mean().reset_index(drop=False)

    if df.isnull().values.any():
        print("One nan dropped")
        continue

    if len(df) < 24 * 30:
        print("Missing data")
        continue
    else:
        df_jan = df.loc[df["month"] == 1]
        data_jan.append(df_jan.to_numpy())
data_jan = np.array(data_jan).transpose(1, 0, 2)
print(data_jan.shape)

One outlier dropped
One outlier dropped
One outlier dropped
Missing data
One outlier dropped
One outlier dropped
One outlier dropped
One nan dropped
(722, 44, 10)


# Construct GP Model

In [29]:
class LocalPeriodicKernel(gpytorch.kernels.Kernel):
    is_stationary = True

    def __init__(self, lp_ard=None, **kwargs):
        super().__init__(**kwargs)
        if lp_ard is not None:
            self.periodickernel = gpytorch.kernels.PeriodicKernel(arg_num_dims=lp_ard)
            self.rbfkernel = gpytorch.kernels.RBFKernel(arg_num_dims=lp_ard)
        else:
            self.periodickernel = gpytorch.kernels.PeriodicKernel()
            self.rbfkernel = gpytorch.kernels.RBFKernel()
        self.localperiodickernel = self.periodickernel * self.rbfkernel

    #kernel function
    def forward(self, x1, x2, **params):
        return self.localperiodickernel(x1, x2, **params)
    
class BaseKernel(gpytorch.kernels.Kernel):
    def __init__(self, matern_ard=None, lp_ard=None, **kwargs):
        super().__init__(**kwargs)
        if matern_ard is not None:
            self.maternkernel = gpytorch.kernels.MaternKernel(nu=0.5,ard_num_dims=matern_ard)
        else:
            self.maternkernel = gpytorch.kernels.MaternKernel(nu=0.5)
        if lp_ard is not None:
            self.localperiodickernel = LocalPeriodicKernel(lp_ard=lp_ard)
        else:
            self.localperiodickernel = LocalPeriodicKernel()

    def forward(self, x1, x2, **params):
        # separate the input into continuous and periodic components
        x1_per = x1[:, :4]
        x1_cont = x1[:, 4:]
        x2_per = x2[:, :4]
        x2_cont = x2[:, 4:]
        return self.maternkernel(x1_cont, x2_cont, **params) * self.localperiodickernel(x1_per, x2_per, **params)

class GlobalKernel(gpytorch.kernels.Kernel):
    is_stationary = True

    def __init__(self, matern_ard=None, lp_ard=None, **kwargs):
        super().__init__(**kwargs)

        # base kernel
        self.basekernel = BaseKernel(matern_ard=matern_ard, lp_ard=lp_ard)

        # scale kernel
        self.scalekernel = gpytorch.kernels.ScaleKernel(self.basekernel)

    
    def forward(self, x1, x2, **params):
        return self.scalekernel(x1, x2, **params)
    
class AirGP(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, matern_ard=None, lp_ard=None):
        super(AirGP, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = GlobalKernel(matern_ard=matern_ard, lp_ard=lp_ard)

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Training

In [31]:
df_oct.head()

Unnamed: 0,month,day,weekday,hour,longitude,latitude,celsius,humidity,pressure,pm25
0,10,1,6,0,-119.77299,36.785336,25.383333,32.533333,996.053,4.099
1,10,1,6,1,-119.77299,36.785336,24.18,35.7,996.045667,4.245667
2,10,1,6,2,-119.77299,36.785336,22.35,40.2,996.142,4.696667
3,10,1,6,3,-119.77299,36.785336,20.76,43.2,996.502667,5.107667
4,10,1,6,4,-119.77299,36.785336,19.34,47.533333,996.841333,6.0375


## October

In [32]:
X_oct = data_oct[:, :, :-1]
Y_oct = data_oct[:, :, -1]
n_steps, n_stations = X_oct.shape[:2]
print(X_oct.shape, Y_oct.shape)

(744, 35, 9) (744, 35)


In [34]:
Y_pred_all = []
Y_true_all = []
for t in tqdm(range(n_steps)):
    # leave one out split
    i = np.random.randint(n_stations)
    X_train = torch.from_numpy(np.concatenate((X_oct[t, :i], X_oct[t, i+1:]), axis=0)).float()
    X_test = torch.from_numpy(X_oct[t, i:i+1]).float()
    Y_train = torch.from_numpy(np.concatenate((Y_oct[t, :i], Y_oct[t, i+1:]), axis=0)).float()
    Y_test = torch.from_numpy(Y_oct[t, i:i+1]).float()

    # prepare training
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = AirGP(X_train, Y_train, likelihood, matern_ard=5, lp_ard=4)
    training_iter = 1000
    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # training
    for iter in range(training_iter):
        model.train()
        likelihood.train()
        optimizer.zero_grad()
        output = model(X_train)
        loss = -mll(output, Y_train)
        loss.backward()
        optimizer.step()

    # evaluation
    model.eval()
    likelihood.eval()
    Y_pred = model(X_test)
    Y_pred_mean = Y_pred.mean.detach()
    Y_pred_all.append(Y_pred_mean.cpu().numpy())
    Y_true_all.append(Y_test.cpu().numpy())

  0%|          | 0/744 [00:00<?, ?it/s]

100%|██████████| 744/744 [24:31<00:00,  1.98s/it]


In [35]:
Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)
print("R2: ", R2)

RMSE:  3.2496684
CVRMSE:  0.29804638
MAE:  2.3173354
R2:  0.7274735271930695


## November

In [36]:
X_nov = data_nov[:, :, :-1]
Y_nov = data_nov[:, :, -1]
n_steps, n_stations = X_nov.shape[:2]
print(X_nov.shape, Y_nov.shape)

(720, 35, 9) (720, 35)


In [37]:
Y_pred_all = []
Y_true_all = []
for t in tqdm(range(n_steps)):
    # leave one out split
    i = np.random.randint(n_stations)
    X_train = torch.from_numpy(np.concatenate((X_nov[t, :i], X_nov[t, i+1:]), axis=0)).float()
    X_test = torch.from_numpy(X_nov[t, i:i+1]).float()
    Y_train = torch.from_numpy(np.concatenate((Y_nov[t, :i], Y_nov[t, i+1:]), axis=0)).float()
    Y_test = torch.from_numpy(Y_nov[t, i:i+1]).float()

    # prepare training
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = AirGP(X_train, Y_train, likelihood, matern_ard=5, lp_ard=4)
    training_iter = 1000
    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # training
    for iter in range(training_iter):
        model.train()
        likelihood.train()
        optimizer.zero_grad()
        output = model(X_train)
        loss = -mll(output, Y_train)
        loss.backward()
        optimizer.step()

    # evaluation
    model.eval()
    likelihood.eval()
    Y_pred = model(X_test)
    Y_pred_mean = Y_pred.mean.detach()
    Y_pred_all.append(Y_pred_mean.cpu().numpy())
    Y_true_all.append(Y_test.cpu().numpy())

Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)
print("R2: ", R2)

100%|██████████| 720/720 [23:41<00:00,  1.97s/it]

RMSE:  4.1415052
CVRMSE:  0.31843826
MAE:  2.8648872
R2:  0.745926171541214





# December

In [38]:
X_dec = data_dec[:, :, :-1]
Y_dec = data_dec[:, :, -1]
n_steps, n_stations = X_dec.shape[:2]
print(X_dec.shape, Y_dec.shape)

(744, 35, 9) (744, 35)


In [42]:
Y_pred_all = []
Y_true_all = []
for t in tqdm(range(n_steps)):
    try:
        # leave one out split
        i = np.random.randint(n_stations)
        X_train = torch.from_numpy(np.concatenate((X_dec[t, :i], X_dec[t, i+1:]), axis=0)).float()
        X_test = torch.from_numpy(X_dec[t, i:i+1]).float()
        Y_train = torch.from_numpy(np.concatenate((Y_dec[t, :i], Y_dec[t, i+1:]), axis=0)).float()
        Y_test = torch.from_numpy(Y_dec[t, i:i+1]).float()

        # prepare training
        likelihood = gpytorch.likelihoods.GaussianLikelihood()
        model = AirGP(X_train, Y_train, likelihood, matern_ard=5, lp_ard=4)
        training_iter = 1000
        model.train()
        likelihood.train()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        # training
        for iter in range(training_iter):
            model.train()
            likelihood.train()
            optimizer.zero_grad()
            output = model(X_train)
            loss = -mll(output, Y_train)
            loss.backward()
            optimizer.step()

        # evaluation
        model.eval()
        likelihood.eval()
        Y_pred = model(X_test)
        Y_pred_mean = Y_pred.mean.detach()
        Y_pred_all.append(Y_pred_mean.cpu().numpy())
        Y_true_all.append(Y_test.cpu().numpy())
    except:
        continue

Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)
print("R2: ", R2)

100%|██████████| 744/744 [24:37<00:00,  1.99s/it]

RMSE:  7.198174
CVRMSE:  0.40572974
MAE:  4.5322123
R2:  0.7331408858299255





## January

In [43]:
X_jan = data_jan[:, :, :-1]
Y_jan = data_jan[:, :, -1]
n_steps, n_stations = X_jan.shape[:2]
print(X_jan.shape, Y_jan.shape)

(722, 44, 9) (722, 44)


In [44]:
Y_pred_all = []
Y_true_all = []
for t in tqdm(range(n_steps)):
    try:
        # leave one out split
        i = np.random.randint(n_stations)
        X_train = torch.from_numpy(np.concatenate((X_jan[t, :i], X_jan[t, i+1:]), axis=0)).float()
        X_test = torch.from_numpy(X_jan[t, i:i+1]).float()
        Y_train = torch.from_numpy(np.concatenate((Y_jan[t, :i], Y_jan[t, i+1:]), axis=0)).float()
        Y_test = torch.from_numpy(Y_jan[t, i:i+1]).float()

        # prepare training
        likelihood = gpytorch.likelihoods.GaussianLikelihood()
        model = AirGP(X_train, Y_train, likelihood, matern_ard=5, lp_ard=4)
        training_iter = 1000
        model.train()
        likelihood.train()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        # training
        for iter in range(training_iter):
            model.train()
            likelihood.train()
            optimizer.zero_grad()
            output = model(X_train)
            loss = -mll(output, Y_train)
            loss.backward()
            optimizer.step()

        # evaluation
        model.eval()
        likelihood.eval()
        Y_pred = model(X_test)
        Y_pred_mean = Y_pred.mean.detach()
        Y_pred_all.append(Y_pred_mean.cpu().numpy())
        Y_true_all.append(Y_test.cpu().numpy())
    except:
        continue

Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)
print("R2: ", R2)

100%|██████████| 722/722 [24:49<00:00,  2.06s/it]

RMSE:  4.826853
CVRMSE:  0.40980184
MAE:  2.9290924
R2:  0.680124431848526



