In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

# Prepare Data

In [20]:
# Load the Oct-Dec 2023 station files and build one array per month with
# shape (hours, stations, 3); the last axis is (pm25, longitude, latitude).
data_dir = "../../InterpolationBaseline/data/Oct0123_Dec3123/"
# os.listdir returns entries in arbitrary order; sorting keeps the station
# axis of the resulting arrays reproducible across machines and runs.
data_files = sorted(file for file in os.listdir(data_dir) if file.endswith(".csv"))
feature_cols = ["pm25", "longitude", "latitude"]
# Hourly rows a station must have in October, November and December.
expected_hours = (24 * 31, 24 * 30, 24 * 31)
data_oct = []
data_nov = []
data_dec = []

for file in data_files:
    df = pd.read_csv(os.path.join(data_dir, file))
    # Negative readings are clamped to zero.
    df.loc[df["pm25"] < 0, "pm25"] = 0
    # A single reading above 500 discards the whole station file.
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue

    # decompose timestamp
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="mixed")
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["year"] = df["timestamp"].dt.year
    df = df.loc[:, ["year", "month", "day", "hour", "pm25", "longitude", "latitude"]]
    # Average all readings that fall into the same calendar hour; groupby
    # also sorts the rows chronologically by (year, month, day, hour).
    df = df.groupby(["year", "month", "day", "hour"]).mean().reset_index(drop=False)

    df_oct = df.loc[df["month"] == 10]
    df_nov = df.loc[df["month"] == 11]
    df_dec = df.loc[df["month"] == 12]

    # Require every month to be complete on its own. Checking only the total
    # row count lets a station with extra rows elsewhere and a short month
    # through, which yields ragged per-month arrays that cannot be stacked.
    if (len(df_oct), len(df_nov), len(df_dec)) != expected_hours:
        continue

    data_oct.append(df_oct.loc[:, feature_cols].to_numpy())
    data_nov.append(df_nov.loc[:, feature_cols].to_numpy())
    data_dec.append(df_dec.loc[:, feature_cols].to_numpy())

# Stack the per-station (hours, 3) arrays along a new station axis.
data_oct = np.stack(data_oct, axis=1)
data_nov = np.stack(data_nov, axis=1)
data_dec = np.stack(data_dec, axis=1)
print(data_oct.shape, data_nov.shape, data_dec.shape)

One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
(744, 36, 3) (720, 36, 3) (744, 36, 3)


In [21]:
# Load the January station files and build an array of shape
# (hours, stations, 3); the last axis is (pm25, longitude, latitude).
data_dir = "../../InterpolationBaseline/data/Jan0124_Jan2924/"
# os.listdir returns entries in arbitrary order; sorting keeps the station
# axis of the resulting array reproducible across machines and runs.
data_files = sorted(file for file in os.listdir(data_dir) if file.endswith(".csv"))
data_jan = []
for file in data_files:
    df = pd.read_csv(os.path.join(data_dir, file))
    # Negative readings are clamped to zero.
    df.loc[df["pm25"] < 0, "pm25"] = 0
    # A single reading above 500 discards the whole station file.
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue

    # decompose timestamp
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="mixed")
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["year"] = df["timestamp"].dt.year
    df = df.loc[:, ["year", "month", "day", "hour", "pm25", "longitude", "latitude"]]
    # Average all readings that fall into the same calendar hour; groupby
    # also sorts the rows chronologically by (year, month, day, hour).
    df = df.groupby(["year", "month", "day", "hour"]).mean().reset_index(drop=False)
    # Keep days 1-30 only.
    # NOTE(review): the directory name suggests Jan 1-29 while this filter
    # keeps day <= 30 and no month filter is applied - confirm the date range.
    df = df.loc[df["day"] <= 30]

    # Require exactly 30 full days. A lower bound alone would let a station
    # with more than 720 rows through and the stations could not be stacked.
    if len(df) != 24 * 30:
        continue
    data_jan.append(df.loc[:, ["pm25", "longitude", "latitude"]].to_numpy())

# Stack the per-station (hours, 3) arrays along a new station axis.
data_jan = np.stack(data_jan, axis=1)
print(data_jan.shape)


One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
(720, 45, 3)


# Interpolation Algorithm

In [22]:
class IDW:
    """
    Inverse-distance-weighted interpolator.

    A prediction is the weighted mean of the training targets, where each
    training sample is weighted by 1 / distance**p to the query point
    (Euclidean distance in feature space).
    """

    def __init__(self, X, Y):
        """
        X: (n, d), n is the number of samples, d is the dimension of feature vectors
        Y: (n, ), n is the number of samples
        """
        # asarray leaves ndarrays untouched and lets callers pass nested lists.
        self.X = np.asarray(X)
        self.Y = np.asarray(Y)

    def predict(self, X_test, p=2):
        """
        X_test: (m, d), m is the number of test samples, d is the dimension of feature vectors
        p: the power of distance

        Returns the (m, ) array of interpolated values. The normalized (m, n)
        weights of the last call are kept in self.weight_matrix.

        A test point that coincides with one or more training points gets the
        mean target of those points instead of NaN from an inf / inf division.
        """
        X_test = np.asarray(X_test)

        # construct distance matrix: broadcast (m, 1, d) - (1, n, d) -> (m, n, d)
        diffs = X_test[:, None, :] - self.X[None, :, :]
        dist_matrix = np.linalg.norm(diffs, axis=-1)

        # construct weight matrix
        coincident = dist_matrix == 0
        has_coincident = coincident.any(axis=1, keepdims=True)
        # Zero distances give inf here; those rows are replaced just below,
        # so the divide warning is suppressed on purpose.
        with np.errstate(divide="ignore"):
            inverse_dist = 1 / np.power(dist_matrix, p)
        # Rows with an exact hit put all weight on the coincident samples.
        weight_matrix = np.where(has_coincident, coincident.astype(float), inverse_dist)

        # normalize weight matrix so every row sums to one
        weight_matrix = weight_matrix / np.sum(weight_matrix, axis=1, keepdims=True)
        self.weight_matrix = weight_matrix

        # predict
        Y_pred = np.matmul(weight_matrix, self.Y)
        return Y_pred


# Leave-One-Out Cross-Validation (LOOCV)

## October

In [31]:
# Split the October array: column 0 is the pm25 target, the remaining
# columns (longitude, latitude) are the interpolation inputs.
Y_oct = data_oct[..., 0]
X_oct = data_oct[..., 1:]
n_steps = X_oct.shape[0]
n_stations = X_oct.shape[1]
print(X_oct.shape, Y_oct.shape)

(744, 36, 2) (744, 36)


In [32]:
# Leave-one-out pass over October: at every time step each station is held
# out in turn and estimated from all remaining stations with IDW.
Y_true_all, Y_pred_all = [], []
station_ids = np.arange(n_stations)
for t in tqdm(range(n_steps)):
    coords_t = X_oct[t]
    values_t = Y_oct[t]
    for i in range(n_stations):
        # Boolean mask selecting every station except the held-out one.
        others = station_ids != i
        X_train, Y_train = coords_t[others], values_t[others]
        X_test, Y_test = coords_t[i:i+1], values_t[i:i+1]
        idw = IDW(X_train, Y_train)
        Y_pred = idw.predict(X_test)
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
Y_true_all = np.array(Y_true_all).reshape(-1)
Y_pred_all = np.array(Y_pred_all).reshape(-1)

# Error metrics pooled over all time steps and stations.
residuals = Y_true_all - Y_pred_all
RMSE = np.sqrt(np.mean(residuals ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(residuals))
R2 = 1 - np.sum(residuals ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
for label, value in (("RMSE: ", RMSE), ("CVRMSE: ", CVRMSE), ("MAE: ", MAE), ("R2: ", R2)):
    print(label, value)

100%|██████████| 744/744 [00:01<00:00, 394.49it/s]


RMSE:  3.3335418163432133
CVRMSE:  0.3025458911804561
MAE:  2.0653108365228388
R2:  0.717678314828415


## November

In [25]:
# Split the November array: column 0 is the pm25 target, the remaining
# columns (longitude, latitude) are the interpolation inputs.
Y_nov = data_nov[..., 0]
X_nov = data_nov[..., 1:]
n_steps = X_nov.shape[0]
n_stations = X_nov.shape[1]
print(X_nov.shape, Y_nov.shape)

(720, 36, 2) (720, 36)


In [26]:
# Leave-one-out pass over November: at every time step each station is held
# out in turn and estimated from all remaining stations with IDW.
Y_true_all, Y_pred_all = [], []
station_ids = np.arange(n_stations)
for t in tqdm(range(n_steps)):
    coords_t = X_nov[t]
    values_t = Y_nov[t]
    for i in range(n_stations):
        # Boolean mask selecting every station except the held-out one.
        others = station_ids != i
        X_train, Y_train = coords_t[others], values_t[others]
        X_test, Y_test = coords_t[i:i+1], values_t[i:i+1]
        idw = IDW(X_train, Y_train)
        Y_pred = idw.predict(X_test)
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
Y_true_all = np.array(Y_true_all).reshape(-1)
Y_pred_all = np.array(Y_pred_all).reshape(-1)

# Error metrics pooled over all time steps and stations.
residuals = Y_true_all - Y_pred_all
RMSE = np.sqrt(np.mean(residuals ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(residuals))
R2 = 1 - np.sum(residuals ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
for label, value in (("RMSE: ", RMSE), ("CVRMSE: ", CVRMSE), ("MAE: ", MAE), ("R2: ", R2)):
    print(label, value)

100%|██████████| 720/720 [00:02<00:00, 359.62it/s]

RMSE:  4.261256396753973
CVRMSE:  0.32602600247140945
MAE:  2.761774241755152
R2:  0.7495944345765306





## December

In [27]:
# Split the December array: column 0 is the pm25 target, the remaining
# columns (longitude, latitude) are the interpolation inputs.
Y_dec = data_dec[..., 0]
X_dec = data_dec[..., 1:]
n_steps = X_dec.shape[0]
n_stations = X_dec.shape[1]
print(X_dec.shape, Y_dec.shape)

(744, 36, 2) (744, 36)


In [28]:
# Leave-one-out pass over December: at every time step each station is held
# out in turn and estimated from all remaining stations with IDW.
Y_true_all, Y_pred_all = [], []
station_ids = np.arange(n_stations)
for t in tqdm(range(n_steps)):
    coords_t = X_dec[t]
    values_t = Y_dec[t]
    for i in range(n_stations):
        # Boolean mask selecting every station except the held-out one.
        others = station_ids != i
        X_train, Y_train = coords_t[others], values_t[others]
        X_test, Y_test = coords_t[i:i+1], values_t[i:i+1]
        idw = IDW(X_train, Y_train)
        Y_pred = idw.predict(X_test)
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
Y_true_all = np.array(Y_true_all).reshape(-1)
Y_pred_all = np.array(Y_pred_all).reshape(-1)

# Error metrics pooled over all time steps and stations.
residuals = Y_true_all - Y_pred_all
RMSE = np.sqrt(np.mean(residuals ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(residuals))
R2 = 1 - np.sum(residuals ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
for label, value in (("RMSE: ", RMSE), ("CVRMSE: ", CVRMSE), ("MAE: ", MAE), ("R2: ", R2)):
    print(label, value)

100%|██████████| 744/744 [00:02<00:00, 347.31it/s]

RMSE:  6.856705585857254
CVRMSE:  0.39284899391769723
MAE:  4.33583326134312
R2:  0.7533161349080264





## January

In [29]:
# Split the January array: column 0 is the pm25 target, the remaining
# columns (longitude, latitude) are the interpolation inputs.
Y_jan = data_jan[..., 0]
X_jan = data_jan[..., 1:]
n_steps = X_jan.shape[0]
n_stations = X_jan.shape[1]
print(X_jan.shape, Y_jan.shape)

(720, 45, 2) (720, 45)


In [30]:
# Leave-one-out pass over January: at every time step each station is held
# out in turn and estimated from all remaining stations with IDW.
Y_true_all, Y_pred_all = [], []
station_ids = np.arange(n_stations)
for t in tqdm(range(n_steps)):
    coords_t = X_jan[t]
    values_t = Y_jan[t]
    for i in range(n_stations):
        # Boolean mask selecting every station except the held-out one.
        others = station_ids != i
        X_train, Y_train = coords_t[others], values_t[others]
        X_test, Y_test = coords_t[i:i+1], values_t[i:i+1]
        idw = IDW(X_train, Y_train)
        Y_pred = idw.predict(X_test)
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
Y_true_all = np.array(Y_true_all).reshape(-1)
Y_pred_all = np.array(Y_pred_all).reshape(-1)

# Error metrics pooled over all time steps and stations.
residuals = Y_true_all - Y_pred_all
RMSE = np.sqrt(np.mean(residuals ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(residuals))
R2 = 1 - np.sum(residuals ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
for label, value in (("RMSE: ", RMSE), ("CVRMSE: ", CVRMSE), ("MAE: ", MAE), ("R2: ", R2)):
    print(label, value)

100%|██████████| 720/720 [00:03<00:00, 235.69it/s]

RMSE:  4.0195062824579715
CVRMSE:  0.3527485692313207
MAE:  2.570933438924726
R2:  0.7590467129948636



