In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
from tqdm import tqdm

# Prepare Data

In [3]:
data_dir = "../data/Oct0123_Dec3123/"
data_files = [file for file in os.listdir(data_dir) if file.endswith(".csv")]
data = []
for file in data_files:
    df = pd.read_csv(data_dir + file, index_col=0)
    df.loc[df["pm25"] < 0, "pm25"] = 0
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue
    
    # decompose timestamp
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["year"] = df["timestamp"].dt.year
    df = df.loc[:, ["year", "month", "day", "hour", "pm25", "longitude", "latitude"]]
    df = df.groupby(["year", "month", "day", "hour"]).median().reset_index(drop=False)
    if len(df) < ((31 + 30 + 31) * 24):
        continue
    else:
        data.append(df.loc[:, ["pm25", "longitude", "latitude"]].to_numpy())
data = np.array(data).transpose(1, 0, 2)
X = data[:, :, 1:]
Y = data[:, :, 0]
print(X.shape, Y.shape)


One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
(2208, 36, 2) (2208, 36)


In [4]:
def pm25_aqi(pm25):
    pm25 = np.clip(pm25, 0, 500)
    breakpoints = np.array([
        (0.0, 12.0, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 350.4, 301, 400),
        (350.5, 500.4, 401, 500)
    ])

    aqi = np.zeros_like(pm25)

    for i, (bp_lo, bp_hi, aqi_lo, aqi_hi) in enumerate(breakpoints):
        mask = (pm25 >= bp_lo) & (pm25 <= bp_hi)
        aqi[mask] = ((aqi_hi - aqi_lo) / (bp_hi - bp_lo)) * (pm25[mask] - bp_lo) + aqi_lo

    return aqi

Y = pm25_aqi(Y)

# Interpolation Algorithm

In [5]:
class IDW:
    def __init__(self, X, Y):
        """
        X: (n, d), n is the number of samples, d is the dimension of feature vectors
        Y: (n, ), n is the number of samples
        """
        self.X = X
        self.Y = Y

    def predict(self, X_test, p=2):
        """
        X_test: (m, d), m is the number of test samples, d is the dimension of feature vectors
        p: the power of distance
        """
        # construct distance matrix
        dist_matrix = np.zeros((X_test.shape[0], self.X.shape[0]))
        for i in range(X_test.shape[0]):
            for j in range(self.X.shape[0]):
                dist = np.linalg.norm(X_test[i] - self.X[j])
                dist_matrix[i, j] = dist
        
        # construct weight matrix
        weight_matrix = 1 / np.power(dist_matrix, p)

        # normalize weight matrix
        weight_matrix = weight_matrix / np.sum(weight_matrix, axis=1, keepdims=True)
        self.weight_matrix = weight_matrix

        # predict
        Y_pred = np.matmul(weight_matrix, self.Y)
        return Y_pred


# LOOCV

In [6]:
n_steps = X.shape[0]
n_stations = X.shape[1]

In [49]:
# concentration
Y_true_all = []
Y_pred_all = []
for t in tqdm(range(n_steps)):
    for i in range(n_stations):
        X_train = np.concatenate((X[t, :i], X[t, i+1:]), axis=0)
        X_test = X[t, i:i+1]
        Y_train = np.concatenate((Y[t, :i], Y[t, i+1:]), axis=0)
        Y_test = Y[t, i:i+1]
        idw = IDW(X_train, Y_train)
        Y_pred = idw.predict(X_test)
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)

100%|██████████| 2208/2208 [00:05<00:00, 377.71it/s]

RMSE:  4.853051776119763
CVRMSE:  0.35487548522829593
MAE:  2.991607173178393





In [7]:
# aqi
Y_true_all = []
Y_pred_all = []
for t in tqdm(range(n_steps)):
    for i in range(n_stations):
        X_train = np.concatenate((X[t, :i], X[t, i+1:]), axis=0)
        X_test = X[t, i:i+1]
        Y_train = np.concatenate((Y[t, :i], Y[t, i+1:]), axis=0)
        Y_test = Y[t, i:i+1]
        idw = IDW(X_train, Y_train)
        Y_pred = idw.predict(X_test)
        Y_true_all.append(Y_test)
        Y_pred_all.append(Y_pred)
Y_true_all = np.array(Y_true_all).flatten()
Y_pred_all = np.array(Y_pred_all).flatten()
RMSE = np.sqrt(np.mean((Y_true_all - Y_pred_all) ** 2))
CVRMSE = RMSE / np.mean(Y_true_all)
MAE = np.mean(np.abs(Y_true_all - Y_pred_all))
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)

100%|██████████| 2208/2208 [00:11<00:00, 184.47it/s]

RMSE:  13.704450336492812
CVRMSE:  0.2884340075370616
MAE:  8.905414827367949





In [8]:
R2 = 1 - np.sum((Y_true_all - Y_pred_all) ** 2) / np.sum((Y_true_all - np.mean(Y_true_all)) ** 2)
print("R2: ", R2)

R2:  0.7697998854678998


In [38]:
Y_true_all

array([5.68516667, 2.67      , 3.84666667, ..., 0.21166667, 3.27816667,
       3.22383333])

In [32]:
Y.mean(axis=0)

array([ 16.6798193 , 902.18115025,   7.64678734,  11.61844712,
        22.77791814,  18.93206912,  16.26923575,  17.32546335,
         9.42348394,  10.76318911,  19.36816875,   4.14625241,
        17.90672532,  12.91898548,   7.38015037,   9.3278031 ,
        82.37245287,  11.30184514, 590.73535732,   9.67561553,
        25.57678336,  16.17250235,  10.91730206,  17.62321675,
        11.80512267,  17.55065199,  21.68182205,   9.7346338 ,
        15.84781823,  13.98323052,   8.17775115,   8.64057832,
        18.02855063,  16.04044267,  13.87518255,  16.96457645,
        19.56981861,  14.28303496,   9.2633809 ,  21.17678151,
        11.60555379,   6.60548552,  19.34883452,  10.00523058,
        16.74047251,  33.71669461])

In [26]:
data

array([[[ 5.68516667e+00, -1.19792320e+02,  3.67533530e+01],
        [ 8.63651500e+02, -1.19727820e+02,  3.68580670e+01],
        [ 2.67000000e+00, -1.19775700e+02,  3.68681640e+01],
        ...,
        [ 3.30100000e+00, -1.19716060e+02,  3.68189500e+01],
        [ 4.09900000e+00, -1.19772990e+02,  3.67853360e+01],
        [ 3.89216667e+00, -1.19751150e+02,  3.67928280e+01]],

       [[ 4.31883333e+00, -1.19792320e+02,  3.67533530e+01],
        [ 8.63399167e+02, -1.19727820e+02,  3.68580670e+01],
        [ 2.67000000e+00, -1.19775700e+02,  3.68681640e+01],
        ...,
        [ 5.87516667e+00, -1.19716060e+02,  3.68189500e+01],
        [ 4.24566667e+00, -1.19772990e+02,  3.67853360e+01],
        [ 4.09850000e+00, -1.19751150e+02,  3.67928280e+01]],

       [[ 8.63983333e+00, -1.19792320e+02,  3.67533530e+01],
        [ 8.63041000e+02, -1.19727820e+02,  3.68580670e+01],
        [ 2.67083333e+00, -1.19775700e+02,  3.68681640e+01],
        ...,
        [ 4.02900000e+00, -1.19716060e+02,

In [29]:
df

Unnamed: 0,year,month,day,hour,pm25,longitude,latitude
0,2023,10,1,0,3.892167,-119.75115,36.792828
1,2023,10,1,1,4.098500,-119.75115,36.792828
2,2023,10,1,2,4.566167,-119.75115,36.792828
3,2023,10,1,3,5.230167,-119.75115,36.792828
4,2023,10,1,4,512.901333,-119.75115,36.792828
...,...,...,...,...,...,...,...
2203,2023,12,31,19,2.034667,-119.75115,36.792828
2204,2023,12,31,20,1.690833,-119.75115,36.792828
2205,2023,12,31,21,2.222167,-119.75115,36.792828
2206,2023,12,31,22,2.699833,-119.75115,36.792828


In [30]:
data[0,0,:]

array([   5.68516667, -119.79232   ,   36.753353  ])