In [3]:
# %load_ext cudf.pandas
%pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl (40.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-18.1.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import pandas as pd
import numpy as np
import glob
import datetime
from datetime import datetime
import bz2
import re
import glob
from tqdm import tqdm
import random
from datetime import timedelta

# This mutes useless warnings:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [4]:
#first preprocess the data with:
# This will take a VERY long time, I recommend running in the terminal
# !python dataloader.py
# %pip install metar-taf-parser-mivek

Collecting metar-taf-parser-mivek
  Downloading metar_taf_parser_mivek-1.9.0-py3-none-any.whl.metadata (9.5 kB)
Downloading metar_taf_parser_mivek-1.9.0-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.1/107.1 kB[0m [31m730.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: metar-taf-parser-mivek
Successfully installed metar-taf-parser-mivek-1.9.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
from data_loader import NASAAirportDataset

# this will take a while
train_ds = NASAAirportDataset('KCLT', 'data/preprocess/train', n_presamples=1000, scale_min=0, scale_max=1)



loading first position
loading lamp
loading runways
loading mfs
loading tbfm
load tfm
normalizing numeric and categorical data
gathering presamples ...


100%|██████████| 1000/1000 [52:29<00:00,  3.15s/it]


In [3]:
from data_loader import NASAAirportDataset
test_ds = NASAAirportDataset('KCLT', 'data/preprocess/train', n_presamples=10, scale_min=0, scale_max=1)


loading first position
loading lamp
loading runways
loading mfs
loading tbfm
load tfm
normalizing numeric and categorical data


KeyboardInterrupt: 

In [6]:
import pickle

pickle.dump(train_ds, open('train_ds.pkl', 'wb'))
pickle.dump(test_ds, open('test_ds.pkl', 'wb'))

In [1]:
import pickle

train_ds = pickle.load(open('train_ds.pkl', 'rb'))


In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AirportThroughputModel(nn.Module):
    def __init__(self, input_dim: int, hidden_dims=[256, 128, 64], output_dim=12, dropout=0.3):
        super(AirportThroughputModel, self).__init__()

        # Define a simple feed-forward network with batch norm and dropout
        layers = []
        prev_dim = input_dim
        for hdim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hdim))
            layers.append(nn.BatchNorm1d(hdim))
            layers.append(nn.ReLU(inplace=True))
            layers.append(nn.Dropout(dropout))
            prev_dim = hdim
        # Final output layer
        layers.append(nn.Linear(prev_dim, output_dim))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        # x is expected to have shape [batch_size, input_dim]
        logits = self.network(x)  # [batch_size, output_dim]
        # Do not apply softmax here if using CrossEntropyLoss.
        return logits


# Example usage:
# Suppose you have a single batch from the dataset:
# x, y = next(iter(data_loader))
# input_dim = x.shape[1]  # number of features
# model = AirportThroughputModel(input_dim=input_dim)
# model = model.to('cuda')  # or 'cpu', depending on your setup
# output = model(x)
# loss = criterion(output, y.argmax(dim=1))  # if y is one-hot encoded


In [23]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Assuming you have created these datasets already
# train_dataset = NASAAirportDataset(...)
# val_dataset = NASAAirportDataset(...)

# Example DataLoaders
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, drop_last=True)
val_loader = DataLoader(test_ds, batch_size=64, shuffle=False, drop_last=False)

# Example model instantiation
# First, let's fetch a single batch to determine input_dim (if not known ahead of time)
x_sample, y_sample = next(iter(train_loader))
input_dim = x_sample.shape[1]
model = AirportThroughputModel(input_dim=input_dim, hidden_dims=[256, 128, 64], output_dim=13, dropout=0.3)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for x, y in loader:
        x = x.to(device)
        y = y.to(device)
        # Convert one-hot target to class indices
        y_indices = y.argmax(dim=1)

        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y_indices)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * x.size(0)
        # Compute accuracy
        _, preds = torch.max(logits, 1)
        correct += (preds == y_indices).sum().item()
        total += x.size(0)

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc

def validate(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            y_indices = y.argmax(dim=1)

            logits = model(x)
            loss = criterion(logits, y_indices)
            running_loss += loss.item() * x.size(0)
            _, preds = torch.max(logits, 1)
            correct += (preds == y_indices).sum().item()
            total += x.size(0)

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")

# After training, you can save the model:
torch.save(model.state_dict(), "airport_throughput_model.pth")



KeyboardInterrupt: 

In [18]:
import torch

train_ds[0][0].shape

x_sample, y_sample = next(iter(train_loader))
input_dim = x_sample.shape
input_dim
y_sample.shape




torch.Size([64, 13])

In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# dataset = NASAAirportDataset(airport_code='KJFK', data_dir='/path/to/data', lamp_df=lamp_df, to_tensor=True)


input_dim = 1326
output_dim = 4 * 3 + 1  # 15 min intervals across 3 hours
hidden_dim = 64

class SimpleFeedForward(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleFeedForward, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            nn.Softmax(dim=1),
        )

    def forward(self, x):
        return self.layers(x)

model = SimpleFeedForward(input_dim, hidden_dim, output_dim)
model.to('cuda')

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 1000
model.train()
losses = []
# for epoch, time in (pbar := tqdm(enumerate(time_sampler(num_epochs)))):
for epoch in range(num_epochs):
    running_loss = 0.0
    epoch_len = 0
    n_steps = len(train_ds)

    for x_batch, y_batch in (pbar := tqdm(data_itr(n_steps), total=n_steps)):
        if(len(x_batch) == 0):
            pbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], running_loss: {running_loss :.4f}, Loss: (no flights at timestep)")
            continue
        epoch_len = len(x_batch)
        optimizer.zero_grad()
        outputs = model(x_batch)  # forward pass
        loss = criterion(outputs, y_batch)
        if loss.isnan():
            print("Error: nan loss.... WHY?!?!?!?")
            continue
        loss.backward()  # backpropagate
        optimizer.step()  # update weights

        running_loss += loss.item() * x_batch.size(0)
        losses.append(loss)
        pbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], running_loss: {running_loss :.4f}, Loss: {loss :.4f}")

    epoch_loss = running_loss / epoch_len
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss :.4f}")

print("Training completed.")




Epoch [1/1000], running_loss: 9.9633, Loss: 0.0766:   0%|          | 2/225521 [00:06<207:43:28,  3.32s/it]

Error: nan loss.... WHY?!?!?!?


Epoch [1/1000], running_loss: 10.0417, Loss: 0.0784:   0%|          | 4/225521 [00:12<190:44:43,  3.04s/it]

Error: nan loss.... WHY?!?!?!?


Epoch [1/1000], running_loss: 10.0417, Loss: 0.0784:   0%|          | 5/225521 [00:15<186:33:13,  2.98s/it]

Error: nan loss.... WHY?!?!?!?


Epoch [1/1000], running_loss: 10.1826, Loss: 0.0705:   0%|          | 6/225521 [00:17<173:39:42,  2.77s/it]

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu11==24.10.*
  Downloading https://pypi.nvidia.com/cudf-cu11/cudf_cu11-24.10.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting dask-cudf-cu11==24.10.*
  Downloading https://pypi.nvidia.com/dask-cudf-cu11/dask_cudf_cu11-24.10.1-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 kB[0m [31m856.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting cuml-cu11==24.10.*
  Downloading https://pypi.nvidia.com/cuml-cu11/cuml_cu11-24.10.0-cp312-cp312-manylinux_2_28_x86_64.whl (1372.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 GB[0m [31m690.6 kB/s[0m eta [36m0:00:00[0m0:01[0m00:01[0mm
[?25hCollecting cugraph-cu11==24.10.*
  Dow

In [5]:

import matplotlib.pyplot as plt
losses = [x.item() for x in losses]
print(losses)
plt.figure(figsize=(10, 6))
plt.plot(losses, marker='o', linestyle='-', label="Training Loss")
# plt.title(title, fontsize=16)
# plt.xlabel(xlabel, fontsize=12)
# plt.ylabel(ylabel, fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.xticks(range(len(losses)))  # Show ticks for each epoch
plt.legend(fontsize=12)
plt.tight_layout()
plt.show()

NameError: name 'losses' is not defined