## 1. Imports and Utilities

In [2]:
import os
import json
import subprocess
from datetime import datetime
import pandas as pd

print('Imports ready')

Imports ready


## 2. Configuration

In [3]:
# Models to tune
MODEL_TYPES = ['NHITS', 'TIMESNET']

# Number of trials per model (passed to hpo_tuner.py)
N_TRIALS = 50

# SLURM settings
PARTITION = 'rtx3080'
TIME_LIMIT = '10:00:00'
GRES = 'gpu:1'
CPUS = '8'
CONDA_ENV = 'myenv'

# Paths
SLURM_SCRIPT = 'hpo_job.slurm'
JOB_MAP_FILE = 'hpo_current_jobs.json'

print('Config set:', MODEL_TYPES, N_TRIALS)

Config set: ['NHITS', 'TIMESNET'] 50


## 3. Submit SLURM HPO Jobs (one per model)

In [3]:
job_map = {}
for model in MODEL_TYPES:
    print(f'Submitting HPO for {model} ...')
    result = subprocess.run(
        ['sbatch',
         '--partition', PARTITION,
         '--time', TIME_LIMIT,
         '--gres', GRES,
         '--cpus-per-task', CPUS,
         SLURM_SCRIPT,
         model, str(N_TRIALS)],
        capture_output=True, text=True
    )
    if result.returncode == 0:
        job_id = result.stdout.strip().split()[-1]
        job_map[model] = job_id
        print(f'  ✓ Submitted {model} as Job {job_id}')
        print(f'    Logs: hpo_{job_id}.log / hpo_{job_id}.err')
    else:
        print(f'  ✗ Submission failed for {model}: {result.stderr}')

if job_map:
    with open(JOB_MAP_FILE, 'w') as f:
        json.dump(job_map, f, indent=2)
    print('Saved job map to', JOB_MAP_FILE)
else:
    print('No jobs submitted.')

Submitting HPO for NHITS ...
  ✓ Submitted NHITS as Job 1471742
    Logs: hpo_1471742.log / hpo_1471742.err
Submitting HPO for TIMESNET ...
  ✓ Submitted TIMESNET as Job 1471743
    Logs: hpo_1471743.log / hpo_1471743.err
Saved job map to hpo_current_jobs.json


## 4. Monitor Job Status

In [4]:
if os.path.exists(JOB_MAP_FILE):
    with open(JOB_MAP_FILE, 'r') as f:
        job_map = json.load(f)
else:
    job_map = {}

if not job_map:
    print('No job map found. Submit jobs first (Section 3).')
else:
    for model, job_id in job_map.items():
        print(f'Checking {model} (Job {job_id}) ...')
        res = subprocess.run(
            ['squeue', '-j', job_id, '--format=%.18i %.9P %.20j %.8u %.8T %.10M %.9l %.6D %R'],
            capture_output=True, text=True
        )
        lines = res.stdout.strip().split('\n')
        if len(lines) <= 1:
            print('  ⏹️  Not in queue (completed or failed)')
        else:
            print(res.stdout)
        log_file = f'hpo_{job_id}.log'
        if os.path.exists(log_file):
            print('  Tail log:')
            tail = subprocess.run(['tail', '-n', '12', log_file], capture_output=True, text=True)
            print(tail.stdout)
        err_file = f'hpo_{job_id}.err'
        if os.path.exists(err_file):
            err_tail = subprocess.run(['tail', '-n', '6', err_file], capture_output=True, text=True)
            if err_tail.stdout.strip():
                print('  Tail err:')
                print(err_tail.stdout)

Checking NHITS (Job 1471742) ...
  ⏹️  Not in queue (completed or failed)
  Tail log:
= Node list          : tg081
= Subm/Elig/Start/End: 2025-12-21T18:13:12 / 2025-12-21T18:13:12 / 2025-12-21T18:13:13 / 2025-12-21T20:41:17
    Path                 Used     SoftQ    HardQ    Gracetime  Filec    FileQ    FiHaQ    FileGrace    
    /home/woody           171.0M  1000.0G  1500.0G        N/A   9,028    5,000K   7,500K        N/A    
    /home/hpc              40.1G   104.9G   209.7G        N/A     150K     500K   1,000K        N/A    
    /home/vault             0.0K  1048.6G  2097.2G        N/A       1      200K     400K        N/A    
=== GPU utilization ==
gpu_name, gpu_bus_id, pid, gpu_utilization [%], mem_utilization [%], max_memory_usage [MiB], time [ms]
NVIDIA GeForce RTX 3080, 00000000:DA:00.0, 402043, 67 %, 19 %, 1572 MiB, 8789107 ms

  Tail err:
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: 

## 5. Inspect Saved Best Parameters

In [5]:
best_files = {
    'NHITS': 'results/best_params_NHITS.json',
    'TIMESNET': 'results/best_params_TIMESNET.json'
}

for model, path in best_files.items():
    if os.path.exists(path):
        with open(path, 'r') as f:
            params = json.load(f)
        print(f'{model} best params (from {path}):')
        display(pd.DataFrame(params, index=[0]).T)
    else:
        print(f'{model}: best params file not found at {path}')
        print('Submit/monitor jobs first.')

NHITS best params (from results/best_params_NHITS.json):


Unnamed: 0,0
num_stacks,4.0
num_blocks,1.0
num_layers,3.0
layer_widths,256.0
lr,6.4e-05
dropout,0.110749
weight_decay,3e-06


TIMESNET best params (from results/best_params_TIMESNET.json):


Unnamed: 0,0
hidden_size,256.0
conv_hidden_size,64.0
top_k,3.0
lr,1.7e-05
dropout,0.123281


## 6. Optional: Run Local HPO (small quick test)

In [None]:
# This is a small local run (may be slow).
# Adjust model and trials as needed.
LOCAL_MODEL = 'NHITS'
LOCAL_TRIALS = 5

print(f'Running local HPO for {LOCAL_MODEL} with {LOCAL_TRIALS} trials...')
res = subprocess.run(
    ['python3', 'hpo_tuner.py', LOCAL_MODEL, str(LOCAL_TRIALS)],
    capture_output=True, text=True
)
print(res.stdout)
if res.returncode != 0:
    print('Error:')
    print(res.stderr)

## 7. Next Steps
- After HPO completes, rerun the main benchmark notebook; it will automatically pick up best params from `results/best_params_*.json`.
- Keep logs for reference: `hpo_<JOBID>.log` and `hpo_<JOBID>.err`.
- Tune `N_TRIALS`, `TIME_LIMIT`, and SLURM resources above as needed.