In [1]:
%cd /home/ec2-user
import glob

# Sanity check: collect parquet files sitting exactly three directory levels
# below the dataset root, then show at most the first ten of them.
matches = glob.glob("data/cleaned_partitioned_ais/*/*/*/*.parquet")
print(f"Found {len(matches)} parquet files:")
preview = matches[:10]  # cap the listing so a large dataset does not flood the output
for parquet_path in preview:
    print(" ", parquet_path)

/home/ec2-user
Found 2 parquet files:
  data/cleaned_partitioned_ais/year=2025/month=2/day=2/part-0.parquet
  data/cleaned_partitioned_ais/year=2025/month=2/day=1/part-0.parquet


In [2]:
%cd /home/ec2-user
# ─── 1) Setup imports & module path ──────────────────────────────────────────
import sys
from pathlib import Path
import time

# add your Fourier head module to path
fourier_dir = Path.home() / "repos" / "fourier-head" / "notebooks"
sys.path.insert(0, str(fourier_dir))

import polars as pl
import torch
from torch import nn, optim
from torch.utils.data import IterableDataset, DataLoader

from four_head_2D_LR import FourierHead2DLR  # your FourierHead2D_FFT implementation

# ─── 2) Data loading helper ─────────────────────────────────────────────────
def load_cleaned_data(cleaned_root: str) -> pl.DataFrame:
    """Read every parquet file found under ``cleaned_root`` into one DataFrame.

    The directory is searched recursively for ``*.parquet`` files and all of
    them are handed to ``pl.read_parquet`` in a single call. Progress is
    reported on stdout (flushed immediately so it shows up during long loads).

    Args:
        cleaned_root: Directory to search; resolved to an absolute path first.

    Returns:
        The DataFrame produced by ``pl.read_parquet`` for the discovered files.

    Raises:
        FileNotFoundError: If no parquet file exists under ``cleaned_root``.
    """
    def _log(message: str) -> None:
        # Flush on every message so notebook output appears in real time.
        print(message, flush=True)

    _log("🟡 Entering load_cleaned_data()")
    root = Path(cleaned_root).resolve()
    _log(f"    Looking under {root}")

    parquet_paths = [str(found) for found in root.rglob("*.parquet")]
    _log(f"    Found {len(parquet_paths)} parquet files")
    if len(parquet_paths) == 0:
        raise FileNotFoundError(f"No parquet files under {root}")

    _log("    Reading into Polars...")
    df = pl.read_parquet(parquet_paths)
    _log(f"🟢 Loaded DataFrame: {df.height} rows, {len(df.columns)} cols")
    return df

# ─── 3) Streaming dataset ───────────────────────────────────────────────────
class AISForecastIterableDataset(IterableDataset):
    """
    Streams windows of past->next positions, one vessel at a time.

    The frame is sorted by ``(mmsi, timestamp)`` once, so every vessel occupies
    a contiguous run of rows. The start/stop offsets of those runs are computed
    a single time in ``__init__`` and the normalized coordinates are stored in
    one tensor; iteration then only slices that tensor instead of re-filtering
    the whole frame for every vessel.

    Attributes:
        df: The input frame with ``lat_n`` / ``lon_n`` columns added, sorted by
            ``mmsi`` then ``timestamp``.
        mmsis: One entry per contiguous vessel run, in sorted-frame order.
        seq_len: Number of past positions in each window.

    NOTE(review): there is no per-worker sharding, so using this dataset with
    ``num_workers > 0`` would presumably make every worker emit the full stream.
    """
    def __init__(self, df: pl.DataFrame, seq_len: int = 10):
        """Normalize, sort and index ``df``.

        Args:
            df: Frame with at least ``mmsi``, ``timestamp``, ``lat`` and ``lon``.
            seq_len: History length of each window; must be at least 1.

        Raises:
            ValueError: If ``seq_len`` is smaller than 1.
        """
        # Local import keeps this cell self-contained; numpy is only needed here.
        import numpy as np

        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")

        print("🟡 Initializing streaming dataset", flush=True)
        start = time.time()
        # normalize coordinates (lat / 90, lon / 180)
        df = df.with_columns([
            (pl.col("lat") / 90.0).alias("lat_n"),
            (pl.col("lon") / 180.0).alias("lon_n"),
        ])
        self.df = df.sort(["mmsi", "timestamp"])
        self.seq_len = seq_len

        # After sorting, a vessel boundary is any row whose mmsi differs from
        # the row before it; this replaces one full-frame filter per vessel.
        ids = self.df["mmsi"].to_numpy()
        n_rows = len(ids)
        if n_rows:
            breaks = np.flatnonzero(ids[1:] != ids[:-1]) + 1
            starts = np.concatenate(([0], breaks))
        else:
            starts = np.empty(0, dtype=np.int64)
        stops = np.append(starts[1:], n_rows)

        self.mmsis = ids[starts].tolist()
        self._spans = list(zip(starts.tolist(), stops.tolist()))
        # All normalized positions as one (rows, 2) float32 tensor.
        self._coords = torch.tensor(
            self.df.select(["lat_n", "lon_n"]).to_numpy(), dtype=torch.float32
        )
        print(f"🟢 Dataset init: {len(self.mmsis)} vessels in {time.time()-start:.1f}s", flush=True)

    def __iter__(self):
        """Yield ``(past, target)`` pairs: ``(seq_len, 2)`` history and the next ``(2,)`` point.

        Vessels with ``seq_len`` rows or fewer contribute no windows.
        """
        total = len(self._spans)
        for idx, (lo, hi) in enumerate(self._spans, 1):
            track = self._coords[lo:hi]
            # range() is empty when the track is too short, so no explicit skip is needed.
            for i in range(self.seq_len, hi - lo):
                past   = track[i-self.seq_len:i]  # (seq_len,2)
                target = track[i]                 # (2,)
                yield past, target
            if idx % 500 == 0:
                print(f"    [Dataset] streamed {idx}/{total} vessels", flush=True)

# ─── 4) Model definition ────────────────────────────────────────────────────
class TransformerForecaster(nn.Module):
    """Transformer encoder over a window of 2-D positions, scored by a ``FourierHead2DLR``.

    Each position is projected to ``d_model``, a learned positional embedding is
    added, the sequence is encoded, and the encoding of the last time step is
    passed to the Fourier head together with the target point.
    """

    def __init__(self, seq_len: int, d_model: int, nhead: int,
                 num_layers: int, ff_hidden: int, fourier_m: int, rank: int):
        """Build the projection, positional embedding, encoder stack and head.

        Args:
            seq_len: Window length; fixes the size of the positional embedding.
            d_model: Embedding width used throughout the encoder.
            nhead: Number of attention heads per encoder layer.
            num_layers: Number of stacked encoder layers.
            ff_hidden: Feed-forward width inside each encoder layer.
            fourier_m: Passed to the head as ``num_frequencies``.
            rank: Passed to the head as ``rank``.
        """
        super().__init__()
        print("🟡 Building model", flush=True)
        self.input_proj = nn.Linear(in_features=2, out_features=d_model)
        self.pos_emb = nn.Parameter(torch.randn(seq_len, d_model))
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model, nhead=nhead, dim_feedforward=ff_hidden
            ),
            num_layers=num_layers,
        )
        self.fh = FourierHead2DLR(
            dim_input=d_model, num_frequencies=fourier_m, rank=rank
        )
        print("🟢 Model built", flush=True)

    def forward(self, x: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """Return the head's output for ``targets`` given the history ``x``.

        Args:
            x: History windows, (B, seq_len, 2).
            targets: Next positions, (B, 2).

        Returns:
            Whatever ``FourierHead2DLR`` returns for the last-step encoding and
            the targets; annotated as shape (B,) in the original code —
            TODO confirm against the head implementation.
        """
        # Broadcast the (S, d_model) positional table over the batch dimension.
        embedded = self.input_proj(x) + self.pos_emb[None, :, :]   # (B,S,d_model)
        # The encoder is built with its default sequence-first layout.
        encoded = self.transformer(embedded.permute(1, 0, 2))      # (S,B,d_model)
        final_step = encoded[-1]                                   # (B,d_model)
        return self.fh(final_step, targets)

# ─── 5) Training loop ───────────────────────────────────────────────────────
def train(
    df: pl.DataFrame,
    seq_len: int = 10,
    d_model: int = 64,
    nhead: int = 4,
    num_layers: int = 2,
    ff_hidden: int = 128,
    fourier_m: int = 8,
    rank: int = 4,
    batch_size: int = 64,
    lr: float = 1e-6,
    epochs: int = 5,
    device: str = None
):
    """Fit a ``TransformerForecaster`` on windows streamed from ``df``.

    The loss is the mean negative log of the value the model returns for the
    true next position (a small epsilon is added before the log).

    Args:
        df: Frame handed to ``AISForecastIterableDataset``.
        seq_len: Window length for both the dataset and the model.
        d_model, nhead, num_layers, ff_hidden: Encoder hyperparameters.
        fourier_m, rank: Forwarded to the model's Fourier head.
        batch_size: Batch size; incomplete trailing batches are dropped.
        lr: Adam learning rate.
        epochs: Number of passes over the stream.
        device: Target device; defaults to CUDA when available, else CPU.

    Returns:
        The trained model, left on ``device``.

    Raises:
        RuntimeError: If an epoch produces no batch at all (for example when
            fewer than ``batch_size`` windows exist), instead of failing later
            with a division by zero.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    print(f"🟡 Training on device: {device}", flush=True)

    # Pinned host memory only helps host->GPU copies; skip it on CPU runs.
    use_pinned = torch.device(device).type == "cuda"

    ds = AISForecastIterableDataset(df, seq_len=seq_len)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False,
                        drop_last=True, num_workers=0, pin_memory=use_pinned)
    print(f"🟢 DataLoader ready with batch_size={batch_size}", flush=True)

    model = TransformerForecaster(seq_len, d_model, nhead, num_layers, ff_hidden, fourier_m, rank)
    model.to(device)
    opt = optim.Adam(model.parameters(), lr=lr)

    for ep in range(1, epochs+1):
        print(f"\n🟡 Epoch {ep}/{epochs} start", flush=True)
        total_nll = 0.0
        count = 0
        for batch_i, (xb, yb) in enumerate(loader, 1):
            xb = xb.to(device, non_blocking=use_pinned)
            yb = yb.to(device, non_blocking=use_pinned)
            pdf = model(xb, yb)
            # Epsilon keeps log() finite if the returned value is exactly zero.
            loss = -(pdf + 1e-12).log().mean()

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_nll += loss.item() * xb.size(0)
            count += xb.size(0)
            if batch_i % 50 == 0:
                print(f"    [Epoch {ep}] Batch {batch_i} NLL {loss.item():.4f}", flush=True)

        if count == 0:
            raise RuntimeError(
                "The data loader produced no batches; there are not enough "
                f"windows of length {seq_len} to fill one batch of {batch_size}."
            )
        avg_nll = total_nll / count
        print(f"🟢 Epoch {ep} done — Avg NLL: {avg_nll:.4f}", flush=True)

    return model

# ─── 6) Run everything ───────────────────────────────────────────────────────
# Load the cleaned AIS table, then fit the Fourier-head forecaster on it.
print("🟢 Cell start: loading data", flush=True)
df = load_cleaned_data("data/cleaned_partitioned_ais")

# Hyperparameters for this run, gathered in one place so they are easy to tune.
run_config = dict(
    seq_len=20,
    d_model=128,
    nhead=4,
    num_layers=4,
    ff_hidden=512,
    fourier_m=512,
    rank=4,
    batch_size=64,
    lr=1e-3,
    epochs=100,
)

print("🟢 Starting training", flush=True)
model = train(df, **run_config)
print("🟢 Training complete", flush=True)


/home/ec2-user
🟢 Cell start: loading data
🟡 Entering load_cleaned_data()
    Looking under /home/ec2-user/data/cleaned_partitioned_ais
    Found 2 parquet files
    Reading into Polars...
🟢 Loaded DataFrame: 19082312 rows, 9 cols
🟢 Starting training
🟡 Training on device: cuda
🟡 Initializing streaming dataset
🟢 Dataset init: 3866 vessels in 2.6s
🟢 DataLoader ready with batch_size=64
🟡 Building model




🟢 Model built

🟡 Epoch 1/100 start
    [Epoch 1] Batch 50 NLL -6.6560
    [Epoch 1] Batch 100 NLL -8.8237
    [Epoch 1] Batch 150 NLL -9.7887
    [Epoch 1] Batch 200 NLL -10.2947
    [Epoch 1] Batch 250 NLL -10.5902
    [Epoch 1] Batch 300 NLL -4.9419
    [Epoch 1] Batch 350 NLL -9.1600
    [Epoch 1] Batch 400 NLL -10.0367
    [Epoch 1] Batch 450 NLL -10.4509
    [Epoch 1] Batch 500 NLL -10.6846
    [Epoch 1] Batch 550 NLL -10.8268
    [Epoch 1] Batch 600 NLL -8.6670
    [Epoch 1] Batch 650 NLL -9.9896
    [Epoch 1] Batch 700 NLL -10.4759
    [Epoch 1] Batch 750 NLL -10.7260
    [Epoch 1] Batch 800 NLL -10.8687
    [Epoch 1] Batch 850 NLL -7.6541
    [Epoch 1] Batch 900 NLL -9.9043
    [Epoch 1] Batch 950 NLL -10.4834
    [Epoch 1] Batch 1000 NLL -10.7496
    [Epoch 1] Batch 1050 NLL -10.8917
    [Epoch 1] Batch 1100 NLL -10.9689
    [Epoch 1] Batch 1150 NLL -11.0143
    [Epoch 1] Batch 1200 NLL -11.0399
    [Epoch 1] Batch 1250 NLL -11.0583
    [Epoch 1] Batch 1300 NLL -11.0675
    [E

KeyboardInterrupt: 

In [1]:
# ╔══════════════════════════════════════════════════════════════════════════╗
# ║  AIS → Transformer → 2-D GMM forecaster                                 ║
# ║  with blended loss = NLL  +  λ·MSE(μ_k , target)                        ║
# ╚══════════════════════════════════════════════════════════════════════════╝
# 0) (optional) run inside /home/ec2-user so data paths stay the same
%cd /home/ec2-user

# ─── 1) Imports & module path ───────────────────────────────────────────────
import sys, time
from pathlib import Path
from typing import Optional

import polars as pl
import torch
from torch import nn, optim
from torch.utils.data import IterableDataset, DataLoader

# add repo so `GMMHead2D` is importable
repo_dir = Path.home() / "repos" / "fourier-head" / "notebooks"
sys.path.insert(0, str(repo_dir))

from GMM_head_2D_logsum import GMMHead2D   # numerically-stable GMM head

# ─── 2) Helper: read cleaned AIS parquet into a Polars DF ───────────────────
def load_cleaned_data(root: str) -> pl.DataFrame:
    """Load all parquet files below ``root`` into a single Polars DataFrame.

    Args:
        root: Directory to search recursively; ``~`` is expanded and the path
            is resolved to an absolute one.

    Returns:
        The DataFrame produced by ``pl.read_parquet`` for the discovered files.

    Raises:
        FileNotFoundError: If no ``*.parquet`` file exists under ``root``.
    """
    base_dir = Path(root).expanduser().resolve()
    parquet_paths = [str(found) for found in base_dir.rglob("*.parquet")]
    if len(parquet_paths) == 0:
        raise FileNotFoundError(f"No parquet files under {base_dir}")

    df = pl.read_parquet(parquet_paths)
    n_files = len(parquet_paths)
    print(f"Loaded {df.height:,} rows from {n_files} parquet files")
    return df

# ─── 3) Streaming dataset ───────────────────────────────────────────────────
class AISForecastIterableDataset(IterableDataset):
    """Streams ``(past, target)`` position windows, one vessel at a time.

    The frame is sorted by ``(mmsi, timestamp)`` once, so each vessel occupies
    a contiguous run of rows. Those runs are indexed in ``__init__`` and the
    normalized coordinates are kept in a single tensor that ``__iter__`` slices.

    Attributes:
        df: Input frame with ``lat_n`` / ``lon_n`` added, sorted by ``mmsi``
            then ``timestamp``.
        mmsis: One entry per contiguous vessel run, in sorted-frame order.
        seq_len: Number of past positions in each window.

    NOTE(review): there is no per-worker sharding, so using this dataset with
    ``num_workers > 0`` would presumably make every worker emit the full stream.
    """
    def __init__(self, df: pl.DataFrame, seq_len: int = 20):
        """Normalize, sort and index ``df``.

        Args:
            df: Frame with at least ``mmsi``, ``timestamp``, ``lat`` and ``lon``.
            seq_len: History length of each window; must be at least 1.

        Raises:
            ValueError: If ``seq_len`` is smaller than 1.
        """
        # Local import keeps this cell self-contained; numpy is only needed here.
        import numpy as np

        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        t0 = time.time()

        df = (
            df.with_columns([
                (pl.col("lat") / 90).alias("lat_n"),
                (pl.col("lon") / 180).alias("lon_n"),
            ])
            .sort(["mmsi", "timestamp"])        # heavy but threaded
        )
        self.df      = df
        self.seq_len = seq_len

        # After sorting, a vessel boundary is any row whose mmsi differs from
        # the row before it.
        ids = df["mmsi"].to_numpy()
        n_rows = len(ids)
        if n_rows:
            breaks = np.flatnonzero(ids[1:] != ids[:-1]) + 1
            starts = np.concatenate(([0], breaks))
        else:
            starts = np.empty(0, dtype=np.int64)
        stops = np.append(starts[1:], n_rows)

        self.mmsis   = ids[starts].tolist()
        self._spans  = list(zip(starts.tolist(), stops.tolist()))
        # All normalized positions as one (rows, 2) float32 tensor.
        self._coords = torch.tensor(
            df.select(["lat_n", "lon_n"]).to_numpy(), dtype=torch.float32
        )

        print(f"🟢 Dataset ready: {len(self.mmsis):,} vessels "
              f"in {time.time()-t0:.1f}s", flush=True)

    def __iter__(self):
        """Yield ``(past, target)``: a ``(seq_len, 2)`` history and the next ``(2,)`` point.

        Without this method the ``IterableDataset`` base class raises
        ``NotImplementedError`` as soon as a ``DataLoader`` iterates the dataset.
        Vessels with ``seq_len`` rows or fewer contribute no windows.
        """
        for lo, hi in self._spans:
            track = self._coords[lo:hi]
            # range() is empty when the track is too short, so no explicit skip is needed.
            for i in range(self.seq_len, hi - lo):
                yield track[i - self.seq_len:i], track[i]

# ─── 4) Model: Transformer encoder → GMM head ───────────────────────────────
class TransformerForecasterGMM(nn.Module):
    """Transformer encoder over 2-D position windows with a ``GMMHead2D`` on top.

    Positions are projected to ``d_model``, a learned positional embedding is
    added, the sequence is encoded, and the last time step's encoding is given
    to the GMM head along with the target point.
    """

    def __init__(
        self,
        seq_len: int,
        d_model: int,
        nhead: int,
        num_layers: int,
        ff_hidden: int,
        num_components: int,
        device: str,
    ):
        """Build the projection, positional embedding, encoder stack and head.

        Args:
            seq_len: Window length; fixes the size of the positional embedding.
            d_model: Embedding width used throughout the encoder.
            nhead: Number of attention heads per encoder layer.
            num_layers: Number of stacked encoder layers.
            ff_hidden: Feed-forward width inside each encoder layer.
            num_components: Forwarded to ``GMMHead2D``.
            device: Forwarded to ``GMMHead2D``.
        """
        super().__init__()
        self.input_proj = nn.Linear(in_features=2, out_features=d_model)
        self.pos_emb = nn.Parameter(torch.randn(seq_len, d_model))
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model, nhead=nhead, dim_feedforward=ff_hidden
            ),
            num_layers=num_layers,
        )
        self.gmm = GMMHead2D(
            dim_input=d_model,
            num_components=num_components,
            device=device,
        )

    def forward(self, x: torch.Tensor, targets: torch.Tensor):
        """Encode ``x`` and evaluate the GMM head at ``targets``.

        Both inputs are first moved to the device the model's parameters live
        on, so callers may pass CPU tensors.

        Returns:
            The head's output with ``return_params=True``; the training loop
            unpacks it as ``(log_p, means)``.
        """
        param_device = self.pos_emb.device
        x, targets = (
            tensor.to(param_device, non_blocking=True) for tensor in (x, targets)
        )

        # Broadcast the (S, d) positional table over the batch dimension.
        embedded = self.input_proj(x) + self.pos_emb[None, :, :]   # (B,S,d)
        # The encoder is built with its default sequence-first layout.
        encoded = self.transformer(embedded.permute(1, 0, 2))      # (S,B,d)
        final_step = encoded[-1]                                   # (B,d)
        return self.gmm(final_step, targets, return_params=True)

# ─── 5) Training loop with blended loss ─────────────────────────────────────
def train_gmm(
    df: pl.DataFrame,
    *,
    seq_len: int = 20,
    d_model: int = 128,
    nhead: int = 4,
    num_layers: int = 4,
    ff_hidden: int = 512,
    num_components: int = 8,
    batch_size: int = 64,
    lr: float = 3e-4,
    epochs: int = 100,
    mse_weight: float = 0.1,           # λ
    device: Optional[str] = None,
):
    """Fit a ``TransformerForecasterGMM`` with the blended loss NLL + λ·MSE.

    Per batch the loss is ``-mean(log_p) + mse_weight * mean((means - target)^2)``,
    where ``log_p`` and ``means`` are the two values returned by the model.
    Gradients are clipped to a total norm of 1.0 before each optimizer step.

    Args:
        df: Frame handed to ``AISForecastIterableDataset``.
        seq_len: Window length for both the dataset and the model.
        d_model, nhead, num_layers, ff_hidden: Encoder hyperparameters.
        num_components: Forwarded to the model's GMM head.
        batch_size: Batch size; incomplete trailing batches are dropped.
        lr: Adam learning rate.
        epochs: Number of passes over the stream.
        mse_weight: Weight λ of the MSE term.
        device: Target device; defaults to CUDA when available, else CPU.

    Returns:
        The trained model, left on ``device``.

    Raises:
        RuntimeError: If an epoch produces no batch at all, instead of failing
            later with a division by zero in the epoch summary.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    # Compare the device *type* so values such as "cuda:0" also enable pinning.
    on_cuda = torch.device(device).type == "cuda"

    ds = AISForecastIterableDataset(df, seq_len)
    loader = DataLoader(
        ds, batch_size, shuffle=False, drop_last=True,
        pin_memory=on_cuda,
    )
    model = TransformerForecasterGMM(
        seq_len, d_model, nhead, num_layers,
        ff_hidden, num_components, device
    ).to(device)
    opt = optim.Adam(model.parameters(), lr=lr)

    for ep in range(1, epochs + 1):
        tot_loss = tot_nll = tot_mse = 0.0
        seen = 0
        for xb, yb in loader:
            # Move the batch once; the loss below reuses the same device tensors.
            xb = xb.to(device, non_blocking=on_cuda)
            yb = yb.to(device, non_blocking=on_cuda)

            log_p, means = model(xb, yb)          # tuple
            nll  = -log_p.mean()
            # assumes means is (B, K, 2) so the target broadcasts over components — TODO confirm
            mse  = ((means - yb.unsqueeze(1))**2).mean()
            loss = nll + mse_weight * mse

            opt.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

            bsz = xb.size(0)
            tot_loss += loss.item() * bsz
            tot_nll  += nll.item()  * bsz
            tot_mse  += mse.item()  * bsz
            seen     += bsz

        if seen == 0:
            raise RuntimeError(
                "The data loader produced no batches; there are not enough "
                f"windows of length {seq_len} to fill one batch of {batch_size}."
            )
        print(f"[Ep {ep:03}]  loss {tot_loss/seen:6.3f}  "
              f"NLL {tot_nll/seen:6.3f}  MSE {tot_mse/seen:6.4f}")

    return model

# ─── 6) Execute end-to-end ──────────────────────────────────────────────────
# Load the cleaned AIS table and fit the GMM forecaster end-to-end.
df = load_cleaned_data("data/cleaned_partitioned_ais")

# Hyperparameters for this run, gathered in one place so they are easy to tune.
gmm_config = dict(
    seq_len=20,
    d_model=128,
    nhead=4,
    num_layers=4,
    ff_hidden=512,
    num_components=8,
    batch_size=64,
    lr=3e-4,
    epochs=100,
    mse_weight=0.1,   # λ, weight of the MSE term in the blended loss
)
model = train_gmm(df, **gmm_config)
print("✅  Training finished")


/home/ec2-user
Loaded 19,082,312 rows from 2 parquet files


AttributeError: type object 'Config' has no attribute 'set_tbl_opt_activation'