# Definir variables

In [1]:
INPUT_VIDEO = "./videos/casa_de_cambio_2-20250901_mix_hasta_1430.mp4"  # Path of the source video to process
FRAMES_DIR = "./frames"     # Directory where the frames extracted from the video are saved
DATASET_DIR = "./dataset"   # Directory where the dataset generated by autodistill is saved
FRAME_RATE = 1              # Frames per second at which frames are extracted

from autodistill.detection import CaptionOntology

# Ontology used to create the classes -> { "what the model will look for" : "final class name" }
ontology = CaptionOntology({
    "vehicle": "vehicle",
    "person": "person"
})

# Extraer frames de video en alta calidad

In [2]:
import subprocess
import shutil
from pathlib import Path

# Validate the configuration and the environment up front so that problems
# surface with a clear message before any work is done.
if not INPUT_VIDEO:
    raise ValueError("Set INPUT_VIDEO to the path of the video file.")
if not FRAMES_DIR:
    raise ValueError("Set FRAMES_DIR to the output directory for frames.")
if not FRAME_RATE or FRAME_RATE <= 0:
    raise ValueError("FRAME_RATE must be a positive number.")
if shutil.which("ffmpeg") is None:
    raise EnvironmentError("ffmpeg is not installed or not in PATH.")

in_path = Path(INPUT_VIDEO)
# Check the video exists *before* creating the output directory; otherwise a
# wrong path leaves an empty frames folder behind and only fails inside ffmpeg.
if not in_path.is_file():
    raise FileNotFoundError(f"Input video not found: {in_path}")

# Frames of each video go to their own sub-folder, named after the video file.
out_dir = Path(FRAMES_DIR) / in_path.stem
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# PNG is lossless, so the extracted frames keep the maximum quality.
output_pattern = str(out_dir / "frame_%06d.png")

# The ffmpeg argument list is assembled from three groups; the resulting order
# is: global/decoder options, input, then filter/output options.
decoder_opts = [
    "-hide_banner",
    "-hwaccel", "cuda",      # request CUDA hardware-accelerated decoding
    "-loglevel", "error",    # only report errors
]
input_opts = ["-i", str(in_path)]
output_opts = [
    "-vf", f"fps={FRAME_RATE}",  # sample the video at FRAME_RATE frames per second
    "-vsync", "0",               # NOTE(review): presumably passthrough timestamps - confirm for the installed ffmpeg
    "-y",                        # overwrite existing frame files without asking
    output_pattern,
]
cmd = ["ffmpeg", *decoder_opts, *input_opts, *output_opts]

# check=True raises CalledProcessError when ffmpeg exits with a non-zero status.
subprocess.run(cmd, check=True)
print(f"Frames extracted to: {out_dir}")

Frames extracted to: frames/casa_de_cambio_2-20250901_mix_hasta_1430


# Etiquetar usando Grounding DINO

In [4]:
import torch

# Workaround to avoid CUDA problems on RTX 3000-series GPUs.
# This step requires a CUDA device, so stop right away when none is available.
if not torch.cuda.is_available():
    raise EnvironmentError("CUDA is not available on this machine.")
device = torch.device("cuda")

print(f"using device: {device}")

# `device` is always a CUDA device from here on, so the settings below are
# applied unconditionally.
# Use bfloat16 for the entire notebook: the autocast context is entered and
# never exited. NOTE(review): every re-run of this cell enters a new context.
torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
# Turn on tfloat32 for Ampere GPUs
# (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
compute_major = torch.cuda.get_device_properties(0).major
if compute_major >= 8:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

using device: cuda


In [5]:
from autodistill_grounding_dino import GroundingDINO

# Build the labelling model from the ontology defined in the configuration cell.
base_model = GroundingDINO(ontology=ontology)

# Derive both folders from the configured directories instead of hard-coded
# "./frames" / "./dataset" literals, so that changing FRAMES_DIR or DATASET_DIR
# keeps this step pointing at the same folders as the extraction and merge steps.
frames_input_dir = out_dir
dataset_output_dir = Path(DATASET_DIR) / out_dir.name

print(f"Etiquetando frames en {frames_input_dir}")

# The frames were written as PNG files, hence extension=".png".
base_model.label(
    input_folder=str(frames_input_dir),
    extension=".png",
    output_folder=str(dataset_output_dir),
)

print(f"Dataset creado en: {dataset_output_dir}")

Importing from timm.models.layers is deprecated, please import via timm.layers
torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /pytorch/aten/src/ATen/native/TensorShape.cpp:4322.)


trying to load grounding dino directly
final text_encoder_type: bert-base-uncased
Etiquetando frames en ./frames/casa_de_cambio_2-20250901_mix_hasta_1430


Labeling ./frames/casa_de_cambio_2-20250901_mix_hasta_1430/frame_000205.png:   0%|          | 0/262 [00:00<?, ?it/s]The `device` argument is deprecated and will be removed in v5 of Transformers.
torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
None of the inputs have requires_grad=True. Gradients will be None
`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
Labeling ./frames/casa_de_cambio_2-20250901_mix_hasta_1430/frame_000190.png: 100%|██████████| 262/262 [01:48<00:00,  2.41it/s]


Labeled dataset created - ready for distillation.
Dataset creado en: ./dataset/casa_de_cambio_2-20250901_mix_hasta_1430


# Combinar imágenes y etiquetas

In [None]:
# Root folder of the dataset generated for this video.
base_dir = Path(DATASET_DIR) / out_dir.name

# Flat destination folders that will receive every image and every label file.
dst_images, dst_annotations = base_dir / "images", base_dir / "annotations"
for destination_dir in (dst_images, dst_annotations):
    destination_dir.mkdir(parents=True, exist_ok=True)

def move_all_files(src_dir: Path, dst_dir: Path, split_tag: str | None, exts=None) -> int:
    if not src_dir.is_dir() or src_dir.resolve() == dst_dir.resolve():
        return 0
    count = 0
    for p in src_dir.iterdir():
        if not p.is_file() or (exts and p.suffix.lower() not in exts):
            continue
        target = dst_dir / p.name
        if target.exists():
            stem, suffix = p.stem, p.suffix
            tag = f"_{split_tag}" if split_tag else ""
            i = 0
            while True:
                suffix_i = "" if i == 0 else f"_{i}"
                candidate = dst_dir / f"{stem}{tag}{suffix_i}{suffix}"
                if not candidate.exists():
                    target = candidate
                    break
                i += 1
        shutil.move(str(p), str(target))
        count += 1
    return count

# Suffixes (lower-case) of the files that are merged into the flat folders.
img_exts = {".jpg", ".jpeg", ".png"}
label_exts = {".txt"}

# Merge the images and labels of every split into the flat destination folders.
moved_imgs = 0
moved_lbls = 0
for split in ("train", "valid"):
    moved_imgs += move_all_files(base_dir / split / "images", dst_images, split, img_exts)
    moved_lbls += move_all_files(base_dir / split / "labels", dst_annotations, split, label_exts)

print(f"Movidos {moved_imgs} imágenes y {moved_lbls} etiquetas")

# Remove the train and valid directories, but only once they really hold no
# files: anything skipped by the extension filters above would otherwise be
# deleted silently together with the directory.
for split in ("train", "valid"):
    split_dir = base_dir / split
    if not split_dir.is_dir():
        continue
    leftovers = [p for p in split_dir.rglob("*") if p.is_file()]
    if leftovers:
        print(f"Aviso: {split_dir} conserva {len(leftovers)} archivo(s) sin mover; no se elimina")
    else:
        shutil.rmtree(split_dir, ignore_errors=True)

Movidos 262 imágenes y 262 etiquetas
