In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
from pykrige.ok import OrdinaryKriging
from pykrige.uk import UniversalKriging
from tqdm import tqdm

# Prepare Data

In [2]:
# Folder of CSV files; each file that passes the checks below becomes one entry
# on the station axis of X / Y.
data_dir = "../data/Oct0123_Dec3123/"
# os.listdir() returns names in arbitrary order; sort so that the station axis
# is identical on every run and on every machine.
data_files = sorted(file for file in os.listdir(data_dir) if file.endswith(".csv"))
# Hourly rows required per file: 31 + 30 + 31 days of 24 hours
# (presumably October-December, going by the folder name -- confirm).
expected_hours = (31 + 30 + 31) * 24
station_arrays = []
for file in data_files:
    df = pd.read_csv(os.path.join(data_dir, file), index_col=0)
    # Clamp negative PM2.5 readings to zero.
    df.loc[df["pm25"] < 0, "pm25"] = 0
    # A single reading above 500 discards the whole file, not just that row.
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue

    # Decompose the timestamp so readings can be aggregated per calendar hour.
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["year"] = df["timestamp"].dt.year
    df = df.loc[:, ["year", "month", "day", "hour", "pm25", "longitude", "latitude"]]
    # Median per hour; groupby sorts by its keys, so rows come out chronological.
    df = df.groupby(["year", "month", "day", "hour"]).median().reset_index(drop=False)
    # Require exactly one row per expected hour: fewer rows means gaps in the
    # series, more rows would make the per-file arrays ragged and break the
    # stacking below.
    if len(df) != expected_hours:
        continue
    # Skip files that still contain missing values after aggregation.
    if df.isnull().values.any():
        continue
    station_arrays.append(df.loc[:, ["pm25", "longitude", "latitude"]].to_numpy())
if not station_arrays:
    raise ValueError(f"No usable station files found in {data_dir!r}")
# (stations, hours, 3) -> (hours, stations, 3)
data = np.array(station_arrays).transpose(1, 0, 2)
X = data[:, :, 1:]  # longitude, latitude
Y = data[:, :, 0]   # pm25
print(X.shape, Y.shape)


One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
(2208, 36, 2) (2208, 36)


# Interpolation Algorithm

In [3]:
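# Placeholder cell: no model is created here. The kriging model is built per
# fold inside the LOOCV loop further down.
# OK_model = OrdinaryKriging()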

# Leave-One-Out Cross-Validation (LOOCV)

In [4]:
# Leading two axes of X: time steps first, stations second.
n_steps, n_stations = X.shape[:2]

In [5]:
# Leave-one-out cross-validation: at every time step, hold out each station in
# turn, fit a kriging model on the remaining stations and predict the held-out
# location. True values and predictions are pooled over all folds.
Y_true_all = []
Y_pred_all = []
n_failed = 0  # folds where kriging raised and no prediction was recorded
for t in tqdm(range(n_steps)):
    for i in range(n_stations):
        # Split this time step into "all stations but i" and "station i".
        X_train = np.concatenate((X[t, :i], X[t, i+1:]), axis=0)
        X_test = X[t, i:i+1]
        Y_train = np.concatenate((Y[t, :i], Y[t, i+1:]), axis=0)
        Y_test = Y[t, i:i+1]
        try:
            # NOTE: despite its name, OK_model holds a UniversalKriging
            # instance; the name is kept so other cells that use it still work.
            OK_model = UniversalKriging(X_train[:, 0], X_train[:, 1], Y_train,
                                        variogram_model="linear",
                                        enable_plotting=False)
            # Y_var (kriging variance) is returned but not used in the metrics.
            Y_pred, Y_var = OK_model.execute("points", X_test[:, 0], X_test[:, 1])
        except Exception:
            # Best effort: skip folds where kriging fails. Catching Exception
            # (not a bare except) lets KeyboardInterrupt / SystemExit through,
            # so this long loop can still be interrupted.
            n_failed += 1
            continue
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
if n_failed:
    # Make skipped folds visible; the metrics below cover only the rest.
    print(f"Skipped {n_failed} of {n_steps * n_stations} folds because kriging failed")
Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
errors = Y_true_all - Y_pred_all
RMSE = np.sqrt(np.mean(errors ** 2))
# Coefficient of variation of the RMSE: RMSE normalised by the mean observation.
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(errors))
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)

100%|██████████| 2208/2208 [09:29<00:00,  3.87it/s]

RMSE:  5.5056138005880175
CVRMSE:  0.4025935553742296
MAE:  3.4770695768816977





In [6]:
# Coefficient of determination over the pooled predictions:
# R^2 = 1 - SS_res / SS_tot.
ss_res = np.sum((Y_true_all - Y_pred_all) ** 2)
ss_tot = np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
R2 = 1 - ss_res / ss_tot
print("R2: ", R2)

R2:  0.7130600686092154
