In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
from pykrige.ok import OrdinaryKriging
from pykrige.uk import UniversalKriging
from tqdm import tqdm

# Prepare Data

In [30]:
# Load the Oct-Dec 2023 CSV files (one file is treated as one station) and build,
# for each month, an array of shape (hours, stations, 3) whose last axis is
# [pm25, longitude, latitude].
data_dir = "../../InterpolationBaseline/data/Oct0123_Dec3123/"
# os.listdir returns entries in arbitrary order; sorting keeps the station axis
# identical from run to run and from machine to machine.
data_files = sorted(file for file in os.listdir(data_dir) if file.endswith(".csv"))
feature_cols = ["pm25", "longitude", "latitude"]
# Number of hourly rows a file must contribute to each month (month -> 24 * days).
hours_per_month = {10: 24 * 31, 11: 24 * 30, 12: 24 * 31}
data_oct = []
data_nov = []
data_dec = []

for file in data_files:
    df = pd.read_csv(os.path.join(data_dir, file))
    # Negative pm25 readings are clamped to zero.
    df.loc[df["pm25"] < 0, "pm25"] = 0
    # A single reading above 500 discards the whole file.
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue

    # Decompose the timestamp, then average all rows that fall in the same hour.
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="mixed")
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["year"] = df["timestamp"].dt.year
    df = df.loc[:, ["year", "month", "day", "hour", "pm25", "longitude", "latitude"]]
    df = df.groupby(["year", "month", "day", "hour"]).mean().reset_index(drop=False)

    monthly = {month: df.loc[df["month"] == month, feature_cols] for month in hours_per_month}
    # Every month must be complete on its own. Checking only the total row count
    # would let a file with extra rows in one month and gaps in another through,
    # and the per-month lists could then no longer be stacked into a regular array.
    if any(len(monthly[month]) != n_hours for month, n_hours in hours_per_month.items()):
        continue

    data_oct.append(monthly[10].to_numpy())
    data_nov.append(monthly[11].to_numpy())
    data_dec.append(monthly[12].to_numpy())

# Stacked lists are (station, hour, feature); reorder to (hour, station, feature).
data_oct = np.array(data_oct).transpose(1, 0, 2)
data_nov = np.array(data_nov).transpose(1, 0, 2)
data_dec = np.array(data_dec).transpose(1, 0, 2)
print(data_oct.shape, data_nov.shape, data_dec.shape)

One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
(744, 36, 3) (720, 36, 3) (744, 36, 3)


In [31]:
# Load the January 2024 CSV files (one file is treated as one station) and build
# an array of shape (hours, stations, 3) whose last axis is [pm25, longitude, latitude].
data_dir = "../../InterpolationBaseline/data/Jan0124_Jan2924/"
# os.listdir returns entries in arbitrary order; sorting keeps the station axis
# identical from run to run and from machine to machine.
data_files = sorted(file for file in os.listdir(data_dir) if file.endswith(".csv"))
# Only days 1..n_days are kept, and a file needs one row for each of their hours.
# NOTE(review): the directory name suggests data through Jan 29, while this keeps
# days up to 30 and the recorded output has 720 hourly steps — confirm the range.
n_days = 30
required_hours = 24 * n_days
data_jan = []
for file in data_files:
    df = pd.read_csv(os.path.join(data_dir, file))
    # Negative pm25 readings are clamped to zero.
    df.loc[df["pm25"] < 0, "pm25"] = 0
    # A single reading above 500 discards the whole file.
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue

    # Decompose the timestamp, then average all rows that fall in the same hour.
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="mixed")
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["year"] = df["timestamp"].dt.year
    df = df.loc[:, ["year", "month", "day", "hour", "pm25", "longitude", "latitude"]]
    df = df.groupby(["year", "month", "day", "hour"]).mean().reset_index(drop=False)
    df = df.loc[df["day"] <= n_days]

    # Files with missing hours are skipped so that every station has the same length.
    if len(df) < required_hours:
        continue
    data_jan.append(df.loc[:, ["pm25", "longitude", "latitude"]].to_numpy())

# Stacked list is (station, hour, feature); reorder to (hour, station, feature).
data_jan = np.array(data_jan).transpose(1, 0, 2)
print(data_jan.shape)


One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
(720, 45, 3)


# Interpolation Algorithm

In [32]:
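# OK_model = OrdinaryKriging()  # unused placeholder: a fresh OrdinaryKriging model is fit for every held-out station inside the LOOCV loops below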

# Leave-One-Out Cross-Validation (LOOCV)

## October

In [33]:
# data_oct is (hour, station, feature) with features [pm25, longitude, latitude]:
# the target is feature 0, the coordinates are the remaining two features.
Y_oct = data_oct[..., 0]
X_oct = data_oct[..., 1:]
n_steps, n_stations = Y_oct.shape
print(X_oct.shape, Y_oct.shape)

(744, 36, 2) (744, 36)


In [34]:
# Leave-one-out cross-validation of ordinary kriging on the October data: at each
# time step every station is held out in turn and predicted from all the others.
# Dimensions are derived here so a stale n_steps/n_stations left behind by another
# month's cell cannot silently be used.
n_steps, n_stations = Y_oct.shape
Y_true_all = []
Y_pred_all = []
n_failed = 0
first_error = None
for t in tqdm(range(n_steps)):
    for i in range(n_stations):
        # Train on every station except i; test on station i alone.
        X_train = np.delete(X_oct[t], i, axis=0)
        Y_train = np.delete(Y_oct[t], i)
        X_test = X_oct[t, i:i+1]
        Y_test = Y_oct[t, i:i+1]
        try:
            # Columns 0/1 of X are longitude/latitude.
            # NOTE(review): these are passed with pykrige's default coordinate
            # handling — confirm treating degrees as planar coordinates is intended.
            OK_model = OrdinaryKriging(X_train[:, 0], X_train[:, 1], Y_train,
                                       variogram_model="linear",
                                       enable_plotting=False)
            Y_pred, _ = OK_model.execute("points", X_test[:, 0], X_test[:, 1])
        except Exception as err:
            # Best effort: a fold whose fit or prediction fails is skipped, but it
            # is counted and reported instead of vanishing silently. `Exception`
            # (rather than a bare except) lets KeyboardInterrupt stop the loop.
            n_failed += 1
            if first_error is None:
                first_error = repr(err)
            continue
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
if n_failed:
    print(f"Skipped {n_failed} failed folds; first error: {first_error}")
Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)  # RMSE normalised by the mean observed value
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)
print("R2: ", R2)

100%|██████████| 744/744 [00:35<00:00, 21.16it/s]

RMSE:  3.6002536012484843
CVRMSE:  0.3267521435984961
MAE:  2.3871897606570833
R2:  0.6706947838540704





## November

In [35]:
# data_nov is (hour, station, feature) with features [pm25, longitude, latitude]:
# the target is feature 0, the coordinates are the remaining two features.
Y_nov = data_nov[..., 0]
X_nov = data_nov[..., 1:]
n_steps, n_stations = Y_nov.shape
print(X_nov.shape, Y_nov.shape)

(720, 36, 2) (720, 36)


In [36]:
# Leave-one-out cross-validation of ordinary kriging on the November data: at each
# time step every station is held out in turn and predicted from all the others.
# Dimensions are derived here so a stale n_steps/n_stations left behind by another
# month's cell cannot silently be used.
n_steps, n_stations = Y_nov.shape
Y_true_all = []
Y_pred_all = []
n_failed = 0
first_error = None
for t in tqdm(range(n_steps)):
    for i in range(n_stations):
        # Train on every station except i; test on station i alone.
        X_train = np.delete(X_nov[t], i, axis=0)
        Y_train = np.delete(Y_nov[t], i)
        X_test = X_nov[t, i:i+1]
        Y_test = Y_nov[t, i:i+1]
        try:
            # Columns 0/1 of X are longitude/latitude.
            # NOTE(review): these are passed with pykrige's default coordinate
            # handling — confirm treating degrees as planar coordinates is intended.
            OK_model = OrdinaryKriging(X_train[:, 0], X_train[:, 1], Y_train,
                                       variogram_model="linear",
                                       enable_plotting=False)
            Y_pred, _ = OK_model.execute("points", X_test[:, 0], X_test[:, 1])
        except Exception as err:
            # Best effort: a fold whose fit or prediction fails is skipped, but it
            # is counted and reported instead of vanishing silently. `Exception`
            # (rather than a bare except) lets KeyboardInterrupt stop the loop.
            n_failed += 1
            if first_error is None:
                first_error = repr(err)
            continue
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
if n_failed:
    print(f"Skipped {n_failed} failed folds; first error: {first_error}")
Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)  # RMSE normalised by the mean observed value
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)
print("R2: ", R2)

100%|██████████| 720/720 [00:39<00:00, 18.11it/s]

RMSE:  4.701286699479777
CVRMSE:  0.3596924395985581
MAE:  3.1333931108279893
R2:  0.6952090145515191





## December

In [37]:
# data_dec is (hour, station, feature) with features [pm25, longitude, latitude]:
# the target is feature 0, the coordinates are the remaining two features.
Y_dec = data_dec[..., 0]
X_dec = data_dec[..., 1:]
n_steps, n_stations = Y_dec.shape
print(X_dec.shape, Y_dec.shape)

(744, 36, 2) (744, 36)


In [38]:
# Leave-one-out cross-validation of ordinary kriging on the December data: at each
# time step every station is held out in turn and predicted from all the others.
# Dimensions are derived here so a stale n_steps/n_stations left behind by another
# month's cell cannot silently be used.
n_steps, n_stations = Y_dec.shape
Y_true_all = []
Y_pred_all = []
n_failed = 0
first_error = None
for t in tqdm(range(n_steps)):
    for i in range(n_stations):
        # Train on every station except i; test on station i alone.
        X_train = np.delete(X_dec[t], i, axis=0)
        Y_train = np.delete(Y_dec[t], i)
        X_test = X_dec[t, i:i+1]
        Y_test = Y_dec[t, i:i+1]
        try:
            # Columns 0/1 of X are longitude/latitude.
            # NOTE(review): these are passed with pykrige's default coordinate
            # handling — confirm treating degrees as planar coordinates is intended.
            OK_model = OrdinaryKriging(X_train[:, 0], X_train[:, 1], Y_train,
                                       variogram_model="linear",
                                       enable_plotting=False)
            Y_pred, _ = OK_model.execute("points", X_test[:, 0], X_test[:, 1])
        except Exception as err:
            # Best effort: a fold whose fit or prediction fails is skipped, but it
            # is counted and reported instead of vanishing silently. `Exception`
            # (rather than a bare except) lets KeyboardInterrupt stop the loop.
            n_failed += 1
            if first_error is None:
                first_error = repr(err)
            continue
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
if n_failed:
    print(f"Skipped {n_failed} failed folds; first error: {first_error}")
Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)  # RMSE normalised by the mean observed value
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)
print("R2: ", R2)

100%|██████████| 744/744 [00:47<00:00, 15.52it/s]

RMSE:  7.905469588444718
CVRMSE:  0.4529370169652949
MAE:  5.1627767620650955
R2:  0.6720821295649505





## January

In [39]:
# data_jan is (hour, station, feature) with features [pm25, longitude, latitude]:
# the target is feature 0, the coordinates are the remaining two features.
Y_jan = data_jan[..., 0]
X_jan = data_jan[..., 1:]
n_steps, n_stations = Y_jan.shape
print(X_jan.shape, Y_jan.shape)

(720, 45, 2) (720, 45)


In [40]:
# Leave-one-out cross-validation of ordinary kriging on the January data: at each
# time step every station is held out in turn and predicted from all the others.
# Dimensions are derived here so a stale n_steps/n_stations left behind by another
# month's cell cannot silently be used.
n_steps, n_stations = Y_jan.shape
Y_true_all = []
Y_pred_all = []
n_failed = 0
first_error = None
for t in tqdm(range(n_steps)):
    for i in range(n_stations):
        # Train on every station except i; test on station i alone.
        X_train = np.delete(X_jan[t], i, axis=0)
        Y_train = np.delete(Y_jan[t], i)
        X_test = X_jan[t, i:i+1]
        Y_test = Y_jan[t, i:i+1]
        try:
            # Columns 0/1 of X are longitude/latitude.
            # NOTE(review): these are passed with pykrige's default coordinate
            # handling — confirm treating degrees as planar coordinates is intended.
            OK_model = OrdinaryKriging(X_train[:, 0], X_train[:, 1], Y_train,
                                       variogram_model="linear",
                                       enable_plotting=False)
            Y_pred, _ = OK_model.execute("points", X_test[:, 0], X_test[:, 1])
        except Exception as err:
            # Best effort: a fold whose fit or prediction fails is skipped, but it
            # is counted and reported instead of vanishing silently. `Exception`
            # (rather than a bare except) lets KeyboardInterrupt stop the loop.
            n_failed += 1
            if first_error is None:
                first_error = repr(err)
            continue
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
if n_failed:
    print(f"Skipped {n_failed} failed folds; first error: {first_error}")
Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)  # RMSE normalised by the mean observed value
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)
print("R2: ", R2)

100%|██████████| 720/720 [00:46<00:00, 15.46it/s]

RMSE:  4.260484506414374
CVRMSE:  0.3738966201965613
MAE:  2.8058536357839152
R2:  0.7292893060294519



