In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import mean_squared_error

from model import AttentionModel

In [5]:
dataset = pd.read_csv('Train_Cleaned_KNN_Filtered.csv')

In [6]:
class NO2Dataset(Dataset):
    def __init__(self, df, max_days=15):
        self.max_days = max_days
        self.data = df.sort_values(by=['LAT', 'LON', 'Date']).reset_index(drop=True)
        self.data['Date'] = pd.to_datetime(self.data['Date'])
        self.locations = self.data.groupby(['LAT', 'LON']).groups
        self.location_keys = list(self.locations.keys())
        self.samples = [(loc, idx) for loc in self.location_keys for idx in self.locations[loc]]

    def __len__(self):
        return len(self.samples)


    def __getitem__(self, idx):
        location, index = self.samples[idx]
        location_data = self.data.loc[self.locations[location]].reset_index(drop=True)

        location_data['Date'] = pd.to_datetime(location_data['Date'], errors='coerce')
        location_data = location_data.dropna(subset=['Date'])

        for i in range(len(location_data)):
            current_date = location_data.loc[i, 'Date']
            start_date = current_date - pd.DateOffset(days=self.max_days)
            end_date = current_date

            past_data = location_data[(location_data['Date'] >= start_date) & (location_data['Date'] < end_date)]

            if len(past_data) < self.max_days:
                num_padding_days = self.max_days - len(past_data)
                last_date = past_data['Date'].min() if not past_data.empty else pd.Timestamp.now()

                padding_data = pd.DataFrame({
                    'Date': [last_date - pd.DateOffset(days=i+1) for i in range(num_padding_days)]
                }).reindex(columns=past_data.columns, fill_value=0)

                past_data = pd.concat([padding_data, past_data], ignore_index=True)

            past_data = past_data.sort_values(by='Date').reset_index(drop=True)

            # Extract features
            features_tensor = torch.tensor( past_data[['LST', 'AAI', 'CloudFraction', 'Precipitation', 'NO2_strat', 'NO2_total', 'NO2_trop', 'TropopausePressure']].values, dtype=torch.float32)
            lat = torch.tensor(past_data[['LAT']].values, dtype=torch.float32)
            lon = torch.tensor(past_data[['LON']].values, dtype=torch.float32)

            gt = torch.tensor(past_data[['GT_NO2']].values, dtype=torch.float32)

            # Return tensors for this sample
            return features_tensor, lat, lon, gt


def collate_fn(batch):
    features_list, lat_list, lon_list, gt_list = zip(*batch)

    # Stack tensors to create batch
    features_tensor_batch = torch.stack(features_list)
    lat_tensor_batch = torch.stack(lat_list)
    lon_tensor_batch = torch.stack(lon_list)
    gt_batch = torch.stack(gt_list)

    return features_tensor_batch, lat_tensor_batch, lon_tensor_batch, gt_batch

In [7]:
batch_size = 8

no2_dataset = NO2Dataset(dataset)
train_size = int(0.8 * len(no2_dataset))
val_size = len(no2_dataset) - train_size

train_dataset, val_dataset = random_split(no2_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [8]:
for features, lat, lon, gt in train_loader:
    print(features.shape, lat.shape, lon.shape, gt.shape)
    break

torch.Size([8, 15, 8]) torch.Size([8, 15, 1]) torch.Size([8, 15, 1]) torch.Size([8, 15, 1])


In [5]:
class RMSLoss(nn.Module):
    def __init__(self):
        super(RMSLoss, self).__init__()

    def forward(self, predictions, targets):
        # Compute Mean Squared Error (MSE)
        mse = torch.mean((predictions - targets) ** 2)
        # Return the square root of MSE
        rms = torch.sqrt(mse)
        return rms

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [7]:
def train_one_epoch(epoch_index, tb_writer, model, criterion, optimizer):
    model.train()  # Set the model to training mode
    model.to(device)  # Move model to the GPU
    running_loss = 0.0
    last_loss = 0.0

    for i, (features_seq, lat, lon, gt) in enumerate(train_loader):
        # Move data to GPU
        features_seq, lat, lon, gt = features_seq.to(device), lat.to(device), lon.to(device), gt.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(features_seq, lat, lon)
        loss = criterion(outputs.squeeze(), gt)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 99 == 0:
            last_loss = running_loss / 100  # Average loss over 100 batches
            print(f'Epoch {epoch_index}, Batch {i+1}, Loss: {last_loss}')

            # TensorBoard logging
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)

            running_loss = 0.0

    return last_loss

def validate_one_epoch(epoch_index, tb_writer, model, criterion):
    model.eval()  # Set the model to evaluation mode
    model.to(device)  # Move model to the GPU
    val_loss = 0.0
    all_preds = []
    all_gts = []

    with torch.no_grad():
        for _, (features_seq, lat, lon, gt) in enumerate(val_loader):
            # Move data to GPU
            features_seq, lat, lon, gt = features_seq.to(device), lat.to(device), lon.to(device), gt.to(device)

            val_outputs = model(features_seq, lat, lon)
            loss = criterion(val_outputs.squeeze(), gt)
            val_loss += loss.item()

            # Collect predictions and ground truths for RMSE calculation
            all_preds.append(val_outputs.squeeze().cpu().numpy())
            all_gts.append(gt.cpu().numpy())

    all_preds = np.concatenate(all_preds)
    all_gts = np.concatenate(all_gts)
    val_rmse = np.sqrt(np.mean((all_preds - all_gts) ** 2))

    print(f'Epoch {epoch_index}, Validation RMSE: {val_rmse:.4f}')
    tb_writer.add_scalar('RMSE/validation', val_rmse, epoch_index)

    return val_loss

In [8]:
model = AttentionModel()

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total number of parameters in the model: {total_params}')

Total number of parameters in the model: 146192


In [10]:
model = AttentionModel()
criterion = RMSLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

writer = SummaryWriter(log_dir="./trained_models/train-runs")

num_epochs = 10
best_val_loss = float('inf')

for epoch in tqdm(range(num_epochs)):

    # Train for one epoch
    avg_train_loss = train_one_epoch(epoch, writer, model, criterion, optimizer)
    print(f'Epoch {epoch+1} training completed, Avg Loss: {avg_train_loss:.4f}')
    torch.save(model.state_dict(), './trained_models/latest_Att-CNN-LSTM_model.pt')
    print(f'Latest Model Saved{epoch+1}')

    # Validate after each epoch
    avg_val_loss = validate_one_epoch(epoch, writer, model, criterion)

    # Save the model if it has the best validation loss so far
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), './trained_models/best_Att-CNN-LSTM_model.pt')
        print(f'Model saved at epoch {epoch+1} with validation RMSE {best_val_loss:.4f}')

# Close the TensorBoard writer
writer.close()

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0, Batch 1, Loss: 0.0031827571988105774
Epoch 0, Batch 100, Loss: 0.17591502249240876
Epoch 0, Batch 199, Loss: 0.08625438086688518
Epoch 0, Batch 298, Loss: 0.05008761234581471
Epoch 0, Batch 397, Loss: 0.03302999973297119
Epoch 0, Batch 496, Loss: 0.02376102942973375
Epoch 0, Batch 595, Loss: 0.01804205322638154
Epoch 0, Batch 694, Loss: 0.014225746178999544
Epoch 0, Batch 793, Loss: 0.011534760892391204
Epoch 0, Batch 892, Loss: 0.009557036804035307
Epoch 0, Batch 991, Loss: 0.008055639597587287
Epoch 0, Batch 1090, Loss: 0.00688572645187378
Epoch 0, Batch 1189, Loss: 0.005954342833720147
Epoch 0, Batch 1288, Loss: 0.0051994227292016144
Epoch 0, Batch 1387, Loss: 0.004578120252117515
Epoch 0, Batch 1486, Loss: 0.004060045680962503
Epoch 0, Batch 1585, Loss: 0.0036231012875214217
Epoch 0, Batch 1684, Loss: 0.003250891463831067
Epoch 0, Batch 1783, Loss: 0.002931013717316091
Epoch 0, Batch 1882, Loss: 0.002653953970875591
Epoch 0, Batch 1981, Loss: 0.0024122981494292617
Epoch 0,

 10%|█         | 1/10 [03:39<32:53, 219.28s/it]

Epoch 0, Validation RMSE: 0.0001
Model saved at epoch 1 with validation RMSE 0.1131
Epoch 1, Batch 1, Loss: 5.554382369155065e-07
Epoch 1, Batch 100, Loss: 5.3607349618687296e-05
Epoch 1, Batch 199, Loss: 5.0966025264642666e-05
Epoch 1, Batch 298, Loss: 4.8457377088197975e-05
Epoch 1, Batch 397, Loss: 4.6075235986791084e-05
Epoch 1, Batch 496, Loss: 4.381217226182344e-05
Epoch 1, Batch 595, Loss: 4.1662848234409465e-05
Epoch 1, Batch 694, Loss: 3.962047445384087e-05
Epoch 1, Batch 793, Loss: 3.768037069676211e-05
Epoch 1, Batch 892, Loss: 3.583659410651308e-05
Epoch 1, Batch 991, Loss: 3.408459106140072e-05
Epoch 1, Batch 1090, Loss: 3.2419507297163365e-05
Epoch 1, Batch 1189, Loss: 3.08369290723931e-05
Epoch 1, Batch 1288, Loss: 2.933271853180486e-05
Epoch 1, Batch 1387, Loss: 2.7902787314815215e-05
Epoch 1, Batch 1486, Loss: 2.6543442527326987e-05
Epoch 1, Batch 1585, Loss: 2.52510830614483e-05
Epoch 1, Batch 1684, Loss: 2.402246807832853e-05
Epoch 1, Batch 1783, Loss: 2.285423057401

### Testing

In [None]:
test = test.drop(['ID_Zindi', 'ID'], axis=1)
test = test.fillna(0.0)

In [None]:
test.head()

In [None]:
# Load the trained model
model = NO2Model(embedding_dim=16, hidden_dim=64, num_layers=2, fc_out_dim=32)
model.load_state_dict(torch.load('best_NO2_model.pth'))
model.eval()  # Set the model to evaluation mode

In [None]:
class NO2Test(Dataset):
    def __init__(self, df):
        self.data = df.sort_values(by=['LAT', 'LON', 'Date']).reset_index(drop=True)
        self.locations = self.data.groupby(['LAT', 'LON']).groups
        self.location_keys = list(self.locations.keys())
    
    def __len__(self):
        return len(self.location_keys)
    
    def __getitem__(self, idx):
        location = self.location_keys[idx]
        indices = self.locations[location]
        
        # Extract each feature as a separate tensor
        lst_seq = torch.tensor(self.data.loc[indices, 'LST'].values, dtype=torch.float32).unsqueeze(1)
        aai_seq = torch.tensor(self.data.loc[indices, 'AAI'].values, dtype=torch.float32).unsqueeze(1)
        cloud_fraction_seq = torch.tensor(self.data.loc[indices, 'CloudFraction'].values, dtype=torch.float32).unsqueeze(1)
        precipitation_seq = torch.tensor(self.data.loc[indices, 'Precipitation'].values, dtype=torch.float32).unsqueeze(1)
        tropopause_pressure_seq = torch.tensor(self.data.loc[indices, 'TropopausePressure'].values, dtype=torch.float32).unsqueeze(1)
        
        
        # Extract LAT and LON
        lat = torch.tensor(location[0], dtype=torch.float32)
        lon = torch.tensor(location[1], dtype=torch.float32)
        
        
        return lst_seq, aai_seq, cloud_fraction_seq, precipitation_seq, tropopause_pressure_seq, lat, lon

In [None]:
# Create test dataset and DataLoader
test_dataset = NO2Test(test)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# Run the model on the test data
with torch.no_grad():
    for idx, (lst_seq, aai_seq, cloud_fraction_seq, precipitation_seq, tropopause_pressure_seq, lat, lon) in enumerate(test_loader):
        # Forward pass
        output = model(lst_seq, aai_seq, cloud_fraction_seq, precipitation_seq, tropopause_pressure_seq, lat, lon)
        
        # Store the prediction (converting output to a Python float)
        test.loc[test_dataset.locations[test_dataset.location_keys[idx]], 'Predicted_NO2'] = output.item()

In [None]:
test = test.sort_values(by=['LAT', 'LON', 'Date']).reset_index(drop=True)
test.head(100)

In [2]:
model = AttentionModel()

In [10]:
batch = 128

features = torch.randn(batch, 15, 8)
lat = torch.randn(batch, 15, 1)
lon = torch.randn(batch, 15, 1)

output = model(features,lat,lon)

print(output)

tensor([[-0.4141],
        [-0.4849],
        [-0.5953],
        [-0.4074],
        [-0.6392],
        [-0.5918],
        [-0.6847],
        [-0.4437],
        [-0.5906],
        [-0.4310],
        [-0.5093],
        [-0.4230],
        [-0.5049],
        [-0.4352],
        [-0.4845],
        [-0.4977],
        [-0.7832],
        [-0.5389],
        [-0.3613],
        [-0.5073],
        [-0.4512],
        [-0.2741],
        [-0.6021],
        [-0.3383],
        [-0.6961],
        [-0.4471],
        [-0.5489],
        [-0.7116],
        [-0.2524],
        [-0.2971],
        [-0.4675],
        [-0.5110],
        [-0.6584],
        [-0.5725],
        [-0.5204],
        [-0.4848],
        [-0.4254],
        [-0.1992],
        [-0.4102],
        [-0.4971],
        [-0.2304],
        [-0.3034],
        [-0.3974],
        [-0.6545],
        [-0.4609],
        [-0.3045],
        [-0.6572],
        [-0.3383],
        [-0.3107],
        [-0.5964],
        [-0.7177],
        [-0.6477],
        [-0.