In [1]:
import sys
sys.path.append('..')

from mlcloud import ferranti_exec, ferranti_print_logs

def _ferranti_submit_batch(command, directives, setup_lines=()):
    """Wrap ``command`` in a bash job script and pass it to ``sbatch`` via ``ferranti_exec``.

    The script is sent as a quoted heredoc (``<<'EOF'``), so the shell that
    invokes sbatch does not expand anything in it; ``${SLURM_JOB_ID}`` and the
    contents of ``command`` are only interpreted when the job script runs.

    Args:
        command: Shell code placed at the end of the job script.
        directives: ``#SBATCH`` option strings, written one per line in order.
        setup_lines: Extra shell lines inserted right after
            ``scontrol show job`` and before the shared environment setup.

    Raises:
        ValueError: If a line of ``command`` is exactly ``EOF``. Such a line
            would close the heredoc early, and the rest of the command would
            be run by the submitting shell instead of inside the job.
    """
    if "EOF" in command.split("\n"):
        raise ValueError(
            "command must not contain a line that is exactly 'EOF': "
            "it would terminate the sbatch heredoc early"
        )

    script_lines = ["sbatch <<'EOF'", "#!/bin/bash"]
    script_lines += [f"#SBATCH {directive}" for directive in directives]
    # Blank line between the directives and the script body, then log the
    # job description so it shows up at the top of the .out file.
    script_lines += ["", "scontrol show job ${SLURM_JOB_ID}"]
    script_lines += list(setup_lines)
    script_lines += [
        "export WANDB__SERVICE_WAIT=6000",
        "source activate tp-theory-new",
        command,
        "EOF",
    ]
    ferranti_exec("\n".join(script_lines))


def ferranti_exec_cpu(command):
    """Submit ``command`` as a batch job on the ``cpu-ferranti`` partition.

    The job requests 1 node, 1 task, 32 CPUs, 256G of memory and a 3-day time
    limit. stdout/stderr are appended to
    ``/weka/luxburg/sbordt10/logs/exec_cpu/%j.out`` and ``%j.err``. Before
    ``command`` runs, the script prints the job description, exports
    ``WANDB__SERVICE_WAIT`` and activates the ``tp-theory-new`` environment.

    Args:
        command: Shell code to run inside the job.

    Raises:
        ValueError: If a line of ``command`` is exactly ``EOF``.
    """
    _ferranti_submit_batch(
        command,
        directives=[
            "--time=3-00:00:00  # Runtime in D-HH:MM:SS",
            "--output=/weka/luxburg/sbordt10/logs/exec_cpu/%j.out",
            "--error=/weka/luxburg/sbordt10/logs/exec_cpu/%j.err",
            "--open-mode=append",
            "--job-name=ferranti-exec-cpu",
            "--partition=cpu-ferranti",
            "--nodes=1",
            "--ntasks=1",
            "--cpus-per-task=32",
            "--mem=256G",
        ],
    )


def ferranti_exec_H100(command):
    """Submit ``command`` as a batch job on the ``h100-ferranti`` partition.

    The job requests 1 node, 1 task, 8 CPUs, 128G of memory, one GPU
    (``--gres=gpu:1``) and a 3-day time limit. stdout/stderr are appended to
    ``/weka/luxburg/sbordt10/logs/exec_h100/%j.out`` and ``%j.err``. Before
    ``command`` runs, the script prints the job description, runs
    ``nvidia-smi``, exports ``NCCL_TIMEOUT`` and ``WANDB__SERVICE_WAIT`` and
    activates the ``tp-theory-new`` environment.

    Args:
        command: Shell code to run inside the job.

    Raises:
        ValueError: If a line of ``command`` is exactly ``EOF``.
    """
    _ferranti_submit_batch(
        command,
        directives=[
            "--time=3-00:00:00  # Runtime in D-HH:MM:SS",
            "--output=/weka/luxburg/sbordt10/logs/exec_h100/%j.out",
            "--error=/weka/luxburg/sbordt10/logs/exec_h100/%j.err",
            "--open-mode=append",
            "--job-name=ferranti-exec-h100",
            "--partition=h100-ferranti",
            "--nodes=1",
            "--ntasks=1",
            "--cpus-per-task=8",
            "--mem=128G",
            "--gres=gpu:1",
        ],
        setup_lines=[
            "nvidia-smi",
            "export NCCL_TIMEOUT=1800000",
        ],
    )

## Monitor and control jobs

In [2]:
# Print the job logs found under logs/pythia-14m-mixed-lr-sweep (relative path;
# how it is resolved is up to ferranti_print_logs).
ferranti_print_logs("logs/pythia-14m-mixed-lr-sweep")

Content of logs/pythia-14m-mixed-lr-sweep/45282.out:
JobId=45282 JobName=pythia-14m-mixed-lr-sweep
   UserId=sbordt10(4198) GroupId=luxburg(4018) MCS_label=N/A
   Priority=63351 Nice=0 Account=luxburg QOS=normal
   JobState=RUNNING Reason=None Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   RunTime=00:00:41 TimeLimit=3-00:00:00 TimeMin=N/A
   SubmitTime=2025-03-17T15:45:13 EligibleTime=2025-03-17T15:45:13
   AccrueTime=2025-03-17T15:45:13
   StartTime=2025-03-17T15:45:13 EndTime=2025-03-20T15:45:13 Deadline=N/A
   PreemptEligibleTime=2025-03-17T15:46:13 PreemptTime=None
   SuspendTime=None SecsPreSuspend=0 LastSchedEval=2025-03-17T15:45:13 Scheduler=Main
   Partition=h100-ferranti AllocNode:Sid=ferranti-login001:1741294
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=mlcbm005
   BatchHost=mlcbm005
   NumNodes=1 NumCPUs=12 NumTasks=1 CPUs/Task=12 ReqB:S:C:T=0:0:*:*
   ReqTRES=cpu=12,mem=128G,node=1,billing=102,gres/gpu=1
   AllocTRES=cpu=12,mem=128G,n

In [4]:
# Update both repository checkouts. The cd targets are relative, so this relies
# on the directory ferranti_exec starts in.
ferranti_exec("cd litgpt && git pull")
ferranti_exec("cd limitations_of_tp_theory && git pull")

Updating 423faa0..eaf6509
Fast-forward
 extensions/thunder/pretrain.py             |   2 +-
 litgpt/config.py                           |  11 +-
 litgpt/finetune/adapter.py                 |   2 +-
 litgpt/finetune/adapter_v2.py              |   2 +-
 litgpt/finetune/full.py                    |   2 +-
 litgpt/finetune/lora.py                    |   2 +-
 litgpt/model.py                            |  18 +-
 litgpt/monitor.py                          | 554 +++++++++++++++++------------
 litgpt/pretrain.py                         | 253 ++++++++-----
 litgpt/utils.py                            |   4 +-
 pretrain-experiment/DclmData.py            |  75 ++++
 pretrain-experiment/pretrain-experiment.py | 273 ++++++++++++++
 pretrain-experiment/project_utils.py       |  71 ++++
 tests/test_utils.py                        |   8 +-
 14 files changed, 949 insertions(+), 328 deletions(-)
 create mode 100644 pretrain-experiment/DclmData.py
 create mode 100644 pretrain-experiment/pretrain-experimen

In [3]:
# Fetch first so that branches which so far exist only on the remote are known
# locally; the recorded run of the bare checkout failed with
# "pathspec ... did not match any file(s) known to git".
# NOTE(review): confirm that this branch lives in limitations_of_tp_theory.
ferranti_exec("cd limitations_of_tp_theory && git fetch && git checkout torch.compile-and-hooks")


error: pathspec 'torch.compile-and-hooks' did not match any file(s) known to git



In [28]:
# List my own jobs in the Slurm queue.
ferranti_exec("squeue --me")

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
             45282 h100-ferr pythia-1 sbordt10  R       3:26      1 mlcbm005




In [22]:
# Show the expected start times of my pending jobs. --me matches the other
# squeue cells and avoids hard-coding the user name.
ferranti_exec("squeue --me --start")

             JOBID PARTITION     NAME     USER ST          START_TIME  NODES SCHEDNODES           NODELIST(REASON)




In [11]:
# Cancel a specific job; the argument to scancel is the Slurm job ID.
ferranti_exec("scancel 45262")





## Setup the environment

In [None]:
# Create the conda environment inside a batch job. -y pre-answers conda's
# confirmation prompt, which nobody can respond to in a non-interactive job.
# NOTE(review): the job script written by ferranti_exec_H100 runs
# `source activate tp-theory-new` before this command; on the very first run
# the environment does not exist yet, so that step presumably just fails.
ferranti_exec_H100("conda create -y -n tp-theory-new python=3.11")

Submitted batch job 42263




In [11]:
# Install PyTorch (CUDA 11.8 wheels) into the environment. The environment is
# activated with `source activate`, as in the other setup cells; a bare
# `activate` is not sourced into the job's shell, and if it fails the `&&`
# chain would skip the pip install.
ferranti_exec_H100("conda init && source activate tp-theory-new && pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")

Submitted batch job 42264




In [2]:
# Install the remaining Python dependencies in a batch job. The leading
# `source activate` repeats what the job script from ferranti_exec_H100 already
# does before the command.
ferranti_exec_H100("source activate tp-theory-new && pip install wandb tenacity datasets h5py")

Submitted batch job 42265




In [None]:
# Install litdata pinned to version 0.2.28.
ferranti_exec_H100("source activate tp-theory-new && pip install litdata==0.2.28")

Submitted batch job 42266




In [9]:
# Clone the litgpt fork (SSH remote) and install it in editable mode.
ferranti_exec_H100("source activate tp-theory-new && git clone git@github.com:sbordt/litgpt.git && cd litgpt && pip install -e .")

Submitted batch job 42267




In [13]:
# Clone the project repository (SSH remote).
ferranti_exec_H100("source activate tp-theory-new && git clone git@github.com:sbordt/limitations_of_tp_theory.git")

Submitted batch job 42268




In [15]:
# Run `litgpt download` for EleutherAI/pythia-14m with --tokenizer_only true,
# from inside the limitations_of_tp_theory checkout.
ferranti_exec_H100("source activate tp-theory-new && cd limitations_of_tp_theory && litgpt download EleutherAI/pythia-14m --tokenizer_only true")

Submitted batch job 42269




In [31]:
# Show the directory in which ferranti_exec runs its commands.
ferranti_exec("pwd")

/weka/luxburg/sbordt10




# Final Experiments

### AdamW Coordinate check

In [15]:
# Use the learning rate that is optimal for width=256 (lr = 0.01) at all widths.
lr = 0.01

for width in [4096]:
    submit_command = " && ".join([
        "cd /weka/luxburg/sbordt10/limitations_of_tp_theory/scripts/ferranti/coordinate_check",
        f"sbatch standard-transformer-coordinate-check-init.sh {lr} {width} 700 4",
    ])
    ferranti_exec(submit_command)

Submitted batch job 52312




In [16]:
# Scale the base learning rate inversely with width: lr = lr_base * (256 / width).
lr_base = 0.01

for width in [4096]:
    lr = lr_base * (256. / width)
    submit_command = " && ".join([
        "cd /weka/luxburg/sbordt10/limitations_of_tp_theory/scripts/ferranti/coordinate_check",
        f"sbatch standard-transformer-coordinate-check-init.sh {lr} {width} 700 4",
    ])
    ferranti_exec(submit_command)

Submitted batch job 52313




## Run Jobs!

### download dclm baseline data

In [None]:
# Submit a CPU job that runs batch-downloader.py for the part1 URL list.
# NOTE(review): presumably -o is the output directory and -f the URL file;
# confirm against the script.
ferranti_exec_cpu("cd limitations_of_tp_theory && python batch-downloader.py -o /weka/luxburg/sbordt10/dclm-baseline-1.0/part1 -f dclm-baseline-1.0-urls-part1.txt")

### tokenize

In [33]:
# Submit a CPU job that runs tokenize_dclm_baseline.py on the part1 parquet
# files. --num_workers 32 matches the 32 CPUs that ferranti_exec_cpu requests.
ferranti_exec_cpu("cd limitations_of_tp_theory && python tokenize_dclm_baseline.py --data_files '/weka/luxburg/sbordt10/dclm-baseline-1.0/part1/*.parquet' --output_dir '/weka/luxburg/sbordt10/dclm-baseline-1.0-tokenized/part1-train/' --tokenizer 'checkpoints/EleutherAI/pythia-14m' --validation_output_dir '/weka/luxburg/sbordt10/dclm-baseline-1.0-tokenized/part1-val/' --num_workers 32")

Submitted batch job 43149




### compress tokenized data for download to galvani

In [10]:
# Pack the tokenized preview directory into a gzip-compressed tarball in a CPU job.
# NOTE(review): the member path is absolute, so the archive will contain the
# full home/luxburg/... directory chain; confirm that layout suits the
# download side.
ferranti_exec_cpu("tar -czvf /home/luxburg/sbordt10/dclm-baseline-1.0-tokenized-preview.tar.gz /home/luxburg/sbordt10/dclm-baseline-1.0-tokenized-preview")

Submitted batch job 45867




### pythia-14m width=4096 lr sweep on a single H100

## development and debugging

In [3]:
# Update both repository checkouts, addressed by absolute path.
ferranti_exec("cd /home/luxburg/sbordt10/litgpt && git pull")
ferranti_exec("cd /home/luxburg/sbordt10/limitations_of_tp_theory && git pull")

Updating f7ee1f9..a25f0ec
Fast-forward
 litgpt/monitor.py                          |  2 --
 litgpt/pretrain.py                         | 10 ++++++++++
 pretrain-experiment/pretrain-experiment.py |  3 +++
 3 files changed, 13 insertions(+), 2 deletions(-)

From github.com:sbordt/litgpt
   f7ee1f9..a25f0ec  main       -> origin/main

Updating 8a4a7d4..217d220
Fast-forward
 notebooks/contorl_ferranti.ipynb                   | 1579 +++++++++++---------
 notebooks/control_galvani.ipynb                    |  242 +--
 notebooks/simple-training-loop.ipynb               |  115 +-
 ... standard-transformer-coordinate-check-init.sh} |   11 +-
 .../standard-transformer-coordinate-check-init.sh  |   48 +
 5 files changed, 1164 insertions(+), 831 deletions(-)
 rename scripts/ferranti/coordinate_check/{standard-transformer.sh => standard-transformer-coordinate-check-init.sh} (83%)
 create mode 100644 scripts/galvani/coordinate_check/standard-transformer-coordinate-check-init.sh

From github.com:sbord

In [3]:
# Print logs from the directory the H100 batch jobs write to.
# NOTE(review): num_files=2 presumably limits the output to two log files;
# confirm in mlcloud.
ferranti_print_logs("/weka/luxburg/sbordt10/logs/exec_h100/", num_files=2)

Content of /weka/luxburg/sbordt10/logs/exec_h100/52165.err:
INFO:root:Run directory: /home/luxburg/sbordt10/moritz_sebastian_2025/development and debugging/pretrain-pythia-14m-id=20250422205505
Using bfloat16 Automatic Mixed Precision (AMP)
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: sbordt (train-on-test) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.19.8
wandb: Run data is saved locally in ./wandb/run-20250422_205512-utb6tl6p
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run pretrain-pythia-14m-id=20250422205505
wandb: ⭐️ View project at https://wandb.ai/mup_limitations/development%20and%20debugging
wandb: 🚀 View run at https://wandb.ai/mup_limitations/development%20and%20debugging/runs/utb6tl6p
[rank: 0] Seed set to 42
INFO:ModuleMonitor:Logged 14905 keys at step 1. Total size of log data: 0.00 MB
INFO:Mod

In [None]:
# Bring both checkouts up to date before submitting the run.
ferranti_exec("cd /home/luxburg/sbordt10/litgpt && git pull")
ferranti_exec("cd /home/luxburg/sbordt10/limitations_of_tp_theory && git pull")

# Submit a pretraining run as an H100 batch job. The trailing backslashes sit
# inside a regular (non-raw) triple-quoted string, so Python joins the lines
# and the job script receives the whole invocation as a single line.
ferranti_exec_H100("""cd /home/luxburg/sbordt10/litgpt/pretrain-experiment && python pretrain-experiment.py \
    --experiment_name "development and debugging" \
    --output_dir "/home/luxburg/sbordt10/moritz_sebastian_2025"   \
    --data_dir "/home/luxburg/sbordt10/dclm-baseline-1.0-tokenized" \
    --save_interval 50 \
    --model "pythia-14m" \
    --width 4096 \
    --max_tokens 15728640 \
    --max_seq_length 512 \
    --global_batch_size 256 \
    --micro_batch_size 4 \
    --lr 0.01 \
    --warmup_steps 700 \
    --precision "bf16-mixed" \
    --seed 42 \
    --qk_norm \
    --reference_model "init" \
    --mup_coordinate_check """)

Already up to date.


Already up to date.


Submitted batch job 52264




In [19]:
# List my own jobs in the Slurm queue.
ferranti_exec("squeue --me")

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
             52313 h100-ferr coordina sbordt10 PD       0:00      1 (Resources)
             52312 h100-ferr coordina sbordt10  R       8:27      1 mlcbm007


