In [147]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
import os

# Prepare Data

In [11]:
def average_hour(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = df['timestamp'].dt.year
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    df['hour'] = df['timestamp'].dt.hour
    df = df[['year', 'month', 'day', 'hour', 'pm25', 'latitude', 'longitude']]
    df = df.groupby(['year', 'month', 'day', 'hour']).mean()
    df = df.reset_index(drop=True)
    return df

In [16]:
data_dir = "../data/Jan1524_Jan2224/"
data_files = [file for file in os.listdir(data_dir) if file.endswith(".csv")]
data = []
for file in data_files:
    df = pd.read_csv(data_dir + file, index_col=0)
    df.loc[df["pm25"] < 0, 'pm25'] = 0
    df = average_hour(df)
    data.append(df.to_numpy())
data = np.array(data)
print(data.shape)

(8, 168, 3)


In [40]:
class Geo_LSTM_Dataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
            
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.Y[index]
    
def construct_dataloader(data, test_index, window_size=24, batch_size=16):
    data_train = np.concatenate([data[:test_index], data[test_index+1:]], axis=0)
    data_test = data[test_index:test_index+1]
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []

    # construct training dataset
    for label_index in range(len(data_train)):
        X = np.concatenate([data_train[:label_index], data[label_index+1:]], axis=0)
        Y = data_train[label_index:label_index+1]
        RLat = torch.from_numpy(Y[0, 0, 1] - X[:, 0, 1])
        RLon = torch.from_numpy(Y[0, 0, 2] - X[:, 0, 2])
        for t in range(window_size-1, X.shape[1]):
            history_readings = torch.from_numpy(X[:, t-window_size+1:t+1, 0])
            target_reading = Y[0, t, 0]
            X_train.append((history_readings, RLat, RLon))
            Y_train.append(target_reading)
    Y_train = torch.tensor(Y_train)
    train_dataset = Geo_LSTM_Dataset(X_train, Y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Construct testing dataset
    RLat = torch.from_numpy(data_test[0, 0, 1] - data_train[:, 0, 1])
    RLon = torch.from_numpy(data_test[0, 0, 2] - data_train[:, 0, 2])
    for t in range(window_size-1, data_test.shape[1]):
        history_readings = torch.from_numpy(data_train[:, t-window_size+1:t+1, 0])
        target_reading = data_test[0, t, 0]
        X_test.append((history_readings, RLat, RLon))
        Y_test.append(target_reading)
    Y_test = torch.tensor(Y_test)
    test_dataset = Geo_LSTM_Dataset(X_test, Y_test)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

In [125]:
train_loader, test_loader = construct_dataloader(data, 0)

# Construct Model

In [143]:
class Geo_Layer(nn.Module):
    def __init__(self, K=4):
        super(Geo_Layer, self).__init__()
        self.K = K

    def forward(self, X):
        # history_readings: (batch_size, n_stations, window_size)
        # RLat: (batch_size, n_stations)
        # RLon: (batch_size, n_stations)
        history_readings, RLat, RLon = X
        batch_size, n_stations, window_size = history_readings.shape

        # RDist, Rank, R_A: (batch_size, n_stations)
        RDist = torch.sqrt(RLat**2 + RLon**2)
        indice = torch.argsort(RDist)[:, :self.K]   # (batch_size, K)
        nearby_readings = history_readings[torch.arange(batch_size)[:, None], indice]

        return nearby_readings
    
class Geo_LSTM(nn.Module):
    def __init__(self, K=4, num_layers=4, hidden_size=128, fc_hidden=1024):
        super(Geo_LSTM, self).__init__()
        self.geo_layer = Geo_Layer(K)
        self.lstm = nn.LSTM(input_size=K, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Sequential(*[
            nn.Linear(hidden_size, fc_hidden),
            nn.ReLU(),
            nn.Linear(fc_hidden, 1)
        ])

    def forward(self, X):
        # nearby_readings: (batch_size, window_size, K)
        nearby_readings = self.geo_layer(X).permute(0, 2, 1).float()
        # output: (batch_size, window_size, hidden_size) -> (batch_size, 1)
        output, _ = self.lstm(nearby_readings)
        output = self.fc(output[:, -1, :]).squeeze()
        return output


# LOOCV

In [148]:
batch_size = 64
epochs = 1000
lr = 1e-3
for test_idx in range(len(data_files)):
    geo_lstm = Geo_LSTM()
    criterion = nn.MSELoss()
    optimizer = torch.optim.RMSprop(geo_lstm.parameters(), lr=lr)
    train_loader, test_loader = construct_dataloader(data, test_idx, batch_size=batch_size)
    best_loss = 1e10
    save_path = f"./model_weights/geolstm_{test_idx}.pt"
    for epoch in tqdm(range(epochs)):
        for X, Y in train_loader:
            optimizer.zero_grad()
            Y_pred = geo_lstm(X)
            loss = criterion(Y_pred, Y.float())
            loss.backward()
            optimizer.step()
    
        # Evaluate on train and test set
        with torch.no_grad():
            train_loss = 0
            for X, Y in train_loader:
                Y_pred = geo_lstm(X)
                loss = criterion(Y_pred, Y.float())
                train_loss += loss.item()
            train_loss /= len(train_loader)

        # save if the model is the best
        if train_loss < best_loss:
            best_loss = train_loss
            torch.save(geo_lstm.state_dict(), save_path)


100%|██████████| 1000/1000 [16:48<00:00,  1.01s/it]
100%|██████████| 1000/1000 [16:53<00:00,  1.01s/it]
100%|██████████| 1000/1000 [16:52<00:00,  1.01s/it]
100%|██████████| 1000/1000 [17:14<00:00,  1.03s/it]
100%|██████████| 1000/1000 [17:24<00:00,  1.04s/it]
100%|██████████| 1000/1000 [16:57<00:00,  1.02s/it]
100%|██████████| 1000/1000 [17:40<00:00,  1.06s/it]
100%|██████████| 1000/1000 [17:10<00:00,  1.03s/it]


In [149]:
RMSEs = []
for test_idx in range(len(data_files)):
    geo_lstm = Geo_LSTM()
    geo_lstm.load_state_dict(torch.load(f"./model_weights/geolstm_{test_idx}.pt"))
    geo_lstm.eval()
    residuals = []
    for X, Y in test_loader:
        Y_pred = geo_lstm(X)
        residuals.append(Y_pred - Y)
    residuals = torch.cat(residuals, dim=0)
    RMSE = torch.sqrt(torch.mean(residuals**2)).item()
    RMSEs.append(RMSE)
print(np.mean(RMSEs))

2.2678692654240002
