In [1]:
import zarr
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from huggingface_hub import snapshot_download

In [50]:
# Specify the mission you want to download.
mission = "2024-10-01-11-47-44"

# Choose exactly ONE of the `allow_patterns` options below; the active one is
# passed to snapshot_download to restrict which files are fetched.

# Option 1: download the full dataset
#allow_patterns = [f"*"]

# Option 2 (active): download all data from a single mission
allow_patterns = [f"{mission}/*"]

# Option 3: download a specific topic of the mission, plus the mission's YAML files
#topic = "alphasense_front_center"
#allow_patterns = [f"{mission}/*{topic}*", f"{mission}/*.yaml"]


# If this is interrupted during download, simply re-run the cell and huggingface_hub
# will resume the download without re-downloading the already downloaded files.
# The return value (the local snapshot path) is kept for the following cells.
hugging_face_data_cache_path = snapshot_download(repo_id="leggedrobotics/grand_tour_dataset", allow_patterns=allow_patterns, repo_type="dataset")

Fetching 118 files: 100%|██████████| 118/118 [18:53<00:00,  9.61s/it]  


In [51]:
# Display the local snapshot path returned by snapshot_download.
hugging_face_data_cache_path

'C:\\Users\\rohan\\.cache\\huggingface\\hub\\datasets--leggedrobotics--grand_tour_dataset\\snapshots\\8b9e7951a70081e04edfaf9434f809c7d53d2964'

In [52]:
from pathlib import Path

# Define the destination directory.
# expanduser() replaces the leading "~" with the current user's home directory.
dataset_folder = Path("~/grand_tour_dataset/missions").expanduser()
# parents/exist_ok make this safe to re-run: missing parent folders are created
# and an already existing folder is not an error.
dataset_folder.mkdir(parents=True, exist_ok=True)

# Print for confirmation
print(f"Data will be extracted to: {dataset_folder}")

Data will be extracted to: C:\Users\rohan\grand_tour_dataset\missions


In [53]:
from pathlib import Path
import fnmatch
import tarfile
import shutil

def move_dataset(cache, dataset_folder, allow_patterns=("*",)):
    """
    Copy/extract a Hugging Face snapshot directory into ``dataset_folder``.

    Every regular file below ``cache`` is considered. A file whose name ends in
    ``.symlink`` is treated as a small text stub whose content is the path of
    the real file; its *logical* path is the stub path without that suffix.

    For each selected file:
      - plain ``.tar`` archives are extracted into the parent directory of the
        file's destination path (the archive itself is not copied);
      - anything else (e.g. ``.zgroup``, ``.zattrs``, YAML) is copied, with the
        ``.symlink`` suffix dropped from the destination name.

    Problems with individual files (missing target, unreadable stub, bad
    archive, permission error) are reported with a ``[WARN]`` line and the
    remaining files are still processed.

    Args:
        cache: Snapshot directory to read from (str or Path).
        dataset_folder: Destination directory (str or Path); created if missing.
        allow_patterns: ``fnmatch`` pattern(s) tested against the path relative
            to ``cache`` in POSIX form. A file is selected when either its raw
            relative path or its logical path (``.symlink`` stripped) matches.
            May be a single string, an iterable of strings, or ``None``
            (select everything).

    Returns:
        None. Progress and warnings are printed.
    """
    cache = Path(cache)
    dataset_folder = Path(dataset_folder)
    dataset_folder.mkdir(parents=True, exist_ok=True)

    # Normalise the patterns once: a bare string would otherwise be iterated
    # character by character, and None would raise inside any().
    if allow_patterns is None:
        patterns = ("*",)
    elif isinstance(allow_patterns, str):
        patterns = (allow_patterns,)
    else:
        patterns = tuple(allow_patterns)

    def allowed(rel_posix: str) -> bool:
        """Return True if the relative POSIX path matches any pattern."""
        return any(fnmatch.fnmatch(rel_posix, pat) for pat in patterns)

    def resolve_hf_symlink(p: Path) -> Path:
        """
        Read a ``.symlink`` stub and return the path it names.

        A relative target is interpreted relative to the stub's directory.
        """
        target = Path(p.read_text(encoding="utf-8").strip())
        if not target.is_absolute():
            target = (p.parent / target).resolve()
        return target

    # The "data" extraction filter rejects members that would land outside the
    # destination (absolute paths, "..", escaping links). It only exists on
    # Python versions that ship extraction filters, hence the feature check.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    for src in cache.rglob("*"):
        if not src.is_file():
            continue

        rel = src.relative_to(cache)
        is_symlink_stub = (src.suffix == ".symlink")
        # Logical path = where the file belongs once the stub suffix is dropped.
        logical_rel = rel.with_suffix("") if is_symlink_stub else rel

        # Match on the raw path and on the logical path so that a pattern such
        # as "*.yaml" also selects "x.yaml.symlink".
        if not (allowed(rel.as_posix()) or allowed(logical_rel.as_posix())):
            continue

        if is_symlink_stub:
            try:
                real = resolve_hf_symlink(src)
            except (OSError, UnicodeDecodeError) as e:
                print(f"[WARN] Could not read symlink stub {src}: {e}")
                continue
        else:
            real = src

        dst_path = dataset_folder / logical_rel
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Decide "is this a tar?" from the logical name as well as the real
        # file: the target of a stub may not carry a ".tar" extension itself.
        is_tar = ".tar" in (real.suffix.lower(), dst_path.suffix.lower())

        if is_tar:
            # Extract into the parent folder of the destination.
            try:
                with tarfile.open(real, mode="r:") as tf:  # plain .tar (not gz)
                    tf.extractall(path=dst_path.parent, **extract_kwargs)
            except FileNotFoundError:
                print(f"[WARN] Missing blob for: {src}")
            except tarfile.TarError as e:
                # Covers unreadable archives and members rejected by the filter.
                print(f"[WARN] Bad tar: {real} :: {e}")
        else:
            # Not a tar -> copy. A stale or empty stub resolves to something
            # that is not a regular file; warn and continue.
            if not real.is_file():
                print(f"[WARN] Missing blob for: {src}")
                continue
            try:
                shutil.copy2(real, dst_path)
            except PermissionError as e:
                # Occasional Windows lock issues
                print(f"[WARN] Permission error copying {real} -> {dst_path}: {e}")

    print(f"Moved data from {cache} to {dataset_folder}!")
# Show the destination, then copy/extract the downloaded snapshot into it using
# the same patterns that were used for the download.
print(dataset_folder)
move_dataset(hugging_face_data_cache_path, dataset_folder, allow_patterns=allow_patterns)

C:\Users\rohan\grand_tour_dataset\missions
Moved data from C:\Users\rohan\.cache\huggingface\hub\datasets--leggedrobotics--grand_tour_dataset\snapshots\8b9e7951a70081e04edfaf9434f809c7d53d2964 to C:\Users\rohan\grand_tour_dataset\missions!
