In [1]:
import logging
import os
import sys
import traceback

import torch

from hydra import compose, initialize_config_module
from hydra.core.global_hydra import GlobalHydra
from hydra.utils import instantiate

from omegaconf import OmegaConf

from training.utils.train_utils import makedir, register_omegaconf_resolvers

# Set before any Hydra call below. HYDRA_FULL_ERROR=1 is Hydra's switch for
# printing complete stack traces instead of its shortened error summary
# (NOTE(review): per Hydra docs — confirm for the installed version).
os.environ["HYDRA_FULL_ERROR"] = "1"

In [2]:
def single_proc_run(local_rank, main_port, cfg, world_size):
    """Run the configured trainer in the current process as a single rank.

    Exports the rendezvous settings as environment variables (presumably
    consumed by torch.distributed inside the trainer — confirm there), makes
    a best-effort attempt to register the OmegaConf resolvers, then builds
    ``cfg.trainer`` and calls its ``run()`` method.

    Args:
        local_rank: Rank of this process; written to both ``RANK`` and
            ``LOCAL_RANK``.
        main_port: Port written to ``MASTER_PORT`` (``MASTER_ADDR`` is always
            ``localhost``).
        cfg: Composed config; ``cfg.trainer`` is instantiated with
            ``_recursive_=False``.
        world_size: Total number of processes; written to ``WORLD_SIZE``.
    """
    rank_value = str(local_rank)
    os.environ.update(
        {
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": str(main_port),
            "RANK": rank_value,
            "LOCAL_RANK": rank_value,
            "WORLD_SIZE": str(world_size),
        }
    )

    # Registration is best-effort: any failure (e.g. the resolvers were
    # already registered earlier in this process) is logged, not raised.
    try:
        register_omegaconf_resolvers()
    except Exception as err:
        logging.info(err)

    instantiate(cfg.trainer, _recursive_=False).run()


def single_node_runner(cfg, main_port: int):
    """Launch training on this node as a single process (rank 0, world size 1).

    Args:
        cfg: Composed config, forwarded unchanged to ``single_proc_run``.
        main_port: Port forwarded to ``single_proc_run`` as ``main_port``.
    """
    # CUDA runtime does not support `fork`.
    # force=True: without it, set_start_method raises RuntimeError when the
    # start method has already been fixed — which is exactly what happens when
    # this cell is executed a second time in the same kernel.
    torch.multiprocessing.set_start_method("spawn", force=True)

    single_proc_run(local_rank=0, main_port=main_port, cfg=cfg, world_size=1)


def format_exception(e: Exception, limit=20):
    """Render an exception as ``"<Type>: <message>"`` followed by its traceback.

    Args:
        e: The exception to format; its ``__traceback__`` supplies the frames.
        limit: Maximum number of traceback entries to include, passed to
            ``traceback.format_tb``.

    Returns:
        A string with the exception type and message on the first line, a
        ``Traceback:`` header, then the formatted frames (possibly none).
    """
    frame_lines = traceback.format_tb(e.__traceback__, limit=limit)
    summary = f"{type(e).__name__}: {e}"
    return summary + "\nTraceback:\n" + "".join(frame_lines)


def add_pythonpath_to_sys_path():
    """Prepend the entries of the ``PYTHONPATH`` environment variable to ``sys.path``.

    Does nothing when ``PYTHONPATH`` is unset or empty. Entries are added in
    the order they appear in the variable, ahead of the existing ``sys.path``.
    """
    pythonpath = os.environ.get("PYTHONPATH")
    if not pythonpath:
        return
    # Split on the platform separator rather than a literal ":" — on Windows
    # the separator is ";" and ":" appears inside drive-letter paths.
    sys.path = pythonpath.split(os.pathsep) + sys.path



In [3]:
# Hydra keeps process-wide global state, so executing this cell twice in the
# same kernel would fail with "GlobalHydra is already initialized". Clearing
# it first makes the cell safe to re-run; on a fresh kernel it is a no-op.
GlobalHydra.instance().clear()
initialize_config_module("sam2", version_base="1.2")

# Resolver registration also persists for the life of the kernel. A repeat
# registration is expected to surface as ValueError from OmegaConf
# (NOTE(review): confirm against register_omegaconf_resolvers); log it and
# carry on, as single_proc_run does.
try:
    register_omegaconf_resolvers()
except ValueError as err:
    logging.info(err)

In [4]:
# Name of the training config to compose (presumably looked up inside the
# config module initialised in the previous cell — confirm).
TRAIN_CONFIG_NAME = "configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml"
cfg = compose(config_name=TRAIN_CONFIG_NAME)

In [5]:
# Customize the config
# Every local asset lives under one checkout; keeping the root in a single
# place means relocating the project is a one-line change instead of four.
PROJECT_ROOT = "/home/kasm-user/sam2_ft_runpod"
DATASET_ROOT = f"{PROJECT_ROOT}/mini_dataset"

# Training schedule / launcher: single node, single GPU.
cfg.scratch.max_num_objects = 3
cfg.scratch.num_epochs = 10
cfg.launcher.gpus_per_node = 1
cfg.launcher.num_nodes = 1

# Dataset locations.
cfg.dataset.img_folder = f"{DATASET_ROOT}/train/images"
cfg.dataset.gt_folder = f"{DATASET_ROOT}/train/annotations"
cfg.dataset.file_list_txt = f"{DATASET_ROOT}/list_files.txt"

# Checkpoint used to initialise the model weights.
cfg.trainer.checkpoint.model_weight_initializer.state_dict.checkpoint_path = (
    f"{PROJECT_ROOT}/checkpoints/sam2.1_hiera_base_plus.pt"
)

In [6]:
# When the config does not name a log directory, fall back to
# <cwd>/sam2_logs/experiment_log_dir.
if cfg.launcher.experiment_log_dir is None:
    fallback_log_dir = os.path.join(os.getcwd(), "sam2_logs", "experiment_log_dir")
    cfg.launcher.experiment_log_dir = fallback_log_dir

In [7]:
# Dump the composed config as YAML between two banner lines.
print(
    "###################### Train App Config ####################",
    OmegaConf.to_yaml(cfg),
    "############################################################",
    sep="\n",
)

###################### Train App Config ####################
scratch:
  resolution: 1024
  train_batch_size: 1
  num_train_workers: 10
  num_frames: 8
  max_num_objects: 3
  base_lr: 5.0e-06
  vision_lr: 3.0e-06
  phases_per_epoch: 1
  num_epochs: 10
dataset:
  img_folder: /home/kasm-user/sam2_ft_runpod/mini_dataset/train/images
  gt_folder: /home/kasm-user/sam2_ft_runpod/mini_dataset/train/annotations
  file_list_txt: /home/kasm-user/sam2_ft_runpod/mini_dataset/list_files.txt
  multiplier: 2
vos:
  train_transforms:
  - _target_: training.dataset.transforms.ComposeAPI
    transforms:
    - _target_: training.dataset.transforms.RandomHorizontalFlip
      consistent_transform: true
    - _target_: training.dataset.transforms.RandomAffine
      degrees: 25
      shear: 20
      image_interpolation: bilinear
      consistent_transform: true
    - _target_: training.dataset.transforms.RandomResizeAPI
      sizes: ${scratch.resolution}
      square: true
      consistent_transform: true
   

In [8]:
# Put any PYTHONPATH entries at the front of sys.path (no-op when the
# variable is unset or empty).
add_pythonpath_to_sys_path()
# makedir comes from training.utils.train_utils; presumably it creates the
# experiment log directory if needed — confirm there. Its return value is
# what this cell displays.
makedir(cfg.launcher.experiment_log_dir)

True

In [9]:
# Start single-process training; 4500 is the port exported as MASTER_PORT.
single_node_runner(cfg=cfg, main_port=4500)

INFO 2025-02-12 15:02:24,603 train_utils.py: 108: MACHINE SEED: 1230
INFO 2025-02-12 15:02:24,612 train_utils.py: 154: Logging ENV_VARIABLES
INFO 2025-02-12 15:02:24,613 train_utils.py: 155: AUDIO_PORT=4901
CLICOLOR=1
CLICOLOR_FORCE=1
COLORTERM=truecolor
CONDA_DEFAULT_ENV=sam2_ft
CONDA_EXE=/home/kasm-user/miniconda3/bin/conda
CONDA_PREFIX=/home/kasm-user/miniconda3/envs/sam2_ft
CONDA_PREFIX_1=/home/kasm-user/miniconda3
CONDA_PROMPT_MODIFIER=(sam2_ft) 
CONDA_PYTHON_EXE=/home/kasm-user/miniconda3/bin/python
CONDA_SHLVL=2
CUDA_MODULE_LOADING=LAZY
DBUS_SESSION_BUS_ADDRESS=unix:abstract=/tmp/dbus-d1a4fIDSGg,guid=e6804796ef44be3577e4721967acb58e
DEBIAN_FRONTEND=noninteractive
DESKTOP_SESSION=xfce
DISPLAY=:1.0
DISTRO=ubuntu
FORCE_COLOR=1
GIT_PAGER=cat
GOMP_SPINCOUNT=0
HOME=/home/kasm-user
HOSTNAME=d9dd18706b45
HYDRA_FULL_ERROR=1
INST_SCRIPTS=/dockerstartup/install
JPY_PARENT_PID=5817
JPY_SESSION_NAME=/home/kasm-user/sam2_ft_runpod/training/train.ipynb
JUPYTER_PASSWORD=<redacted>
KAS

grad.sizes() = [64, 256, 1, 1], strides() = [256, 1, 256, 256]
bucket_view.sizes() = [64, 256, 1, 1], strides() = [256, 1, 1, 1] (Triggered internally at /pytorch/torch/csrc/distributed/c10d/reducer.cpp:327.)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


INFO 2025-02-12 15:02:44,721 trainer.py: 950: Estimated time remaining: 00d 00h 02m
INFO 2025-02-12 15:02:44,724 trainer.py: 892: Synchronizing meters
INFO 2025-02-12 15:02:44,725 trainer.py: 830: Losses and meters: {'Losses/train_all_loss': 3.5376162827014923, 'Losses/train_all_loss_mask': 0.03490880146273412, 'Losses/train_all_loss_dice': 1.810979887843132, 'Losses/train_all_loss_iou': 0.8988578235730529, 'Losses/train_all_loss_class': 0.12960280144034186, 'Losses/train_all_core_loss': 3.5376162827014923, 'Trainer/where': 0.0875, 'Trainer/epoch': 0, 'Trainer/steps_train': 8}
INFO 2025-02-12 15:02:53,141 train_utils.py: 271: Train Epoch: [1][0/8] | Batch Time: 6.90 (6.90) | Data Time: 6.07 (6.07) | Mem (GB): 28.00 (28.00/28.00) | Time Elapsed: 00d 00h 00m | Losses/train_all_loss: 1.63e+00 (1.63e+00)
INFO 2025-02-12 15:03:00,288 trainer.py: 950: Estimated time remaining: 00d 00h 01m
INFO 2025-02-12 15:03:00,290 trainer.py: 892: Synchronizing meters
INFO 2025-02-12 15:03:00,292 trainer.

In [10]:
# Completion marker: only reached once the training cell above has returned.
print("Finito")

Finito
