In [None]:
import os

# Read the Hugging Face token from the environment rather than pasting it into
# the notebook. Without this assignment the assert below raises NameError
# instead of reporting the missing token.
hf_token = os.getenv("HF_TOKEN")
assert hf_token, "HF_TOKEN is not set"

# Authenticate to access Google Cloud resources
from google.colab import auth
auth.authenticate_user()


In [None]:
# Install the Python development headers; they are needed to compile C
# extensions such as the one built on the next line.
!apt-get install -y python3-dev
# `--no-binary :all:` forces pip to build crcmod from source instead of using a
# prebuilt wheel (presumably so gsutil gets the compiled CRC extension — confirm).
!pip install --no-binary :all: crcmod

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3-dev is already the newest version (3.10.6-1~22.04.1).
python3-dev set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Collecting crcmod
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m89.7/89.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: crcmod
  Building wheel for crcmod (setup.py) ... [?25l[?25hdone
  Created wheel for crcmod: filename=crcmod-1.7-cp311-cp311-linux_x86_64.whl size=31658 sha256=43a34b56d31267cf01452ffcb15533388c3095d49d976f7d64c06d3b2ebf9932
  Stored in directory: /root/.cache/pip/wheels/23/94/7a/8cb7d14597e6395ce969933f01aed9ea8fa5f5b4d4c8a61e99
Successfully built crcmod
Installing collected 

In [7]:

# NOTE(review): this repeats the Colab authentication already performed in the
# first cell of the notebook.
from google.colab import auth
auth.authenticate_user()

# NOTE(review): `storage` is not referenced in the cells visible here — confirm
# it is still needed.
from google.cloud import storage
from pathlib import Path
import os

In [11]:
import os
from pathlib import Path
import torch
import timm
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
import subprocess

# Dataset split to process; it is substituted into every GCS and local path below.
split = 'val'

# Set paths
# Tiles live under Tiles/<split> and embeddings under Embeddings/<split> in the
# same bucket; the /content paths are the matching local working directories.
gcs_bucket = "bracs-dataset-bucket"
tile_root_gcs = f"gs://{gcs_bucket}/Tiles/{split}"
embedding_root_gcs = f"gs://{gcs_bucket}/Embeddings/{split}"
tile_root_local = Path(f"/content/tiles/{split}")
embedding_root_local = Path(f"/content/embeddings/{split}")

# Create the local working directories up front (no error if they already exist).
tile_root_local.mkdir(parents=True, exist_ok=True)
embedding_root_local.mkdir(parents=True, exist_ok=True)

# Load model
# Pretrained weights are fetched from the Hugging Face Hub through timm. The
# model is moved to the GPU with .cuda() (so a CUDA device is required) and put
# in eval mode for inference.
tile_encoder = timm.create_model("hf_hub:prov-gigapath/prov-gigapath", pretrained=True).cuda().eval()
print("‚úÖ Tile Encoder loaded.")
print("üßÆ Total parameters:", sum(p.numel() for p in tile_encoder.parameters()))

# Image transform
# Bicubic resize to 256, center-crop to 224, convert to a tensor, then normalize
# per channel. The mean/std values match the commonly used ImageNet statistics.
transform = transforms.Compose([
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])


‚úÖ Tile Encoder loaded.
üßÆ Total parameters: 1134953984


In [14]:
def encode_slide_tiles(slide_id: str):
    """Encode every tile of one slide and save the embeddings to a single file.

    Tiles are read from ``tile_root_local / slide_id`` (all ``*.png`` files
    except ``thumbnail.png``), passed one at a time through ``tile_encoder``
    after ``transform``, and the results are written with ``torch.save`` to
    ``embedding_root_local / slide_id / f"{slide_id}_embeddings.pt"`` as a dict:

    * ``"embeddings"``: CPU tensor with one row per tile, in sorted filename order.
    * ``"coords"``: integer tensor of ``[x, y]`` pairs parsed from the tile
      filenames, in the same order.

    Args:
        slide_id: Name of the slide's sub-directory under ``tile_root_local``.

    Returns:
        None. If the slide has no tiles, a message is printed and nothing is
        written (the output directory is not created either).

    Raises:
        ValueError / IndexError: if a tile filename does not follow the
            ``x<int>_y<int>.png`` pattern.
    """
    slide_tile_dir = tile_root_local / slide_id
    save_path = embedding_root_local / slide_id / f"{slide_id}_embeddings.pt"

    image_paths = sorted(p for p in slide_tile_dir.glob("*.png") if p.name != "thumbnail.png")
    print(f"Found {len(image_paths)} tiles for {slide_id}")

    # torch.stack() rejects an empty list, so bail out with a clear message
    # instead of failing with an opaque RuntimeError after the loop.
    if not image_paths:
        print(f"No tiles found in {slide_tile_dir}; skipping {slide_id}.")
        return

    save_path.parent.mkdir(parents=True, exist_ok=True)

    all_embeddings = []
    all_coords = []

    for img_path in tqdm(image_paths, desc=f"üß† Encoding {slide_id}"):
        # Close the file handle as soon as the pixels are decoded; convert()
        # returns an independent, fully loaded image.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        x = transform(img).unsqueeze(0).cuda()

        with torch.no_grad():
            embed = tile_encoder(x).squeeze()  # Keep on GPU

        # Extract (x, y) from filename: "x1234_y5678.png"
        name_parts = img_path.stem.split("_")
        x_coord = int(name_parts[0][1:])
        y_coord = int(name_parts[1][1:])

        all_embeddings.append(embed)
        all_coords.append([x_coord, y_coord])

    # Stack on GPU, then move to CPU once
    embeddings_tensor = torch.stack(all_embeddings).cpu()
    coords_tensor = torch.tensor(all_coords)

    num_tiles = len(all_embeddings)
    print(f"üìä {slide_id}: embeddings shape {embeddings_tensor.shape}, coords shape {coords_tensor.shape}")
    print(f"üß© {slide_id}: total {num_tiles} tile embeddings stacked.")

    torch.save({
        "embeddings": embeddings_tensor,
        "coords": coords_tensor
    }, save_path)

    print(f"‚úÖ Saved slide embeddings to {save_path}")


In [15]:
def run_encoder_pipeline_from_gcs():
    """Download, encode and upload every slide found under ``tile_root_gcs``.

    For each sub-directory listed by ``gsutil ls {tile_root_gcs}/``:

    1. copy the slide's tiles to ``tile_root_local / slide_id``;
    2. call ``encode_slide_tiles(slide_id)``;
    3. upload every file found in ``embedding_root_local / slide_id`` to
       ``{embedding_root_gcs}/{slide_id}/``;
    4. delete the local tiles and embeddings for that slide, whether or not
       the previous steps succeeded.

    Returns:
        None.

    Raises:
        RuntimeError: if the initial ``gsutil ls`` fails.
        subprocess.CalledProcessError: if a download or upload fails.
    """
    import shutil

    root_prefix = tile_root_gcs.rstrip("/")

    # List all subfolders in the GCS bucket
    listing = subprocess.run(
        ["gsutil", "ls", f"{root_prefix}/"],
        capture_output=True,
        text=True,
    )
    if listing.returncode != 0:
        # Surface gsutil's own message; otherwise an empty listing would be
        # silently turned into a bogus slide with an empty id.
        raise RuntimeError(f"gsutil ls failed for {root_prefix}/: {listing.stderr.strip()}")

    slide_dirs = []
    for line in listing.stdout.splitlines():
        entry = line.strip()
        # Only entries ending in "/" are sub-directories; skip blank lines,
        # plain objects, and a placeholder entry for the prefix itself.
        if not entry.endswith("/"):
            continue
        entry = entry.rstrip("/")
        if entry != root_prefix:
            slide_dirs.append(entry)

    for slide_path in slide_dirs:
        slide_id = slide_path.rsplit("/", 1)[-1]
        local_slide_dir = tile_root_local / slide_id
        local_embedding_dir = embedding_root_local / slide_id
        print(f"\nüîΩ Downloading tiles for {slide_id}...")

        try:
            # Download to /content/tiles/{split}/BRACS_xxx
            tile_root_local.mkdir(parents=True, exist_ok=True)
            subprocess.run(
                ["gsutil", "-m", "cp", "-r", slide_path, str(tile_root_local)],
                check=True,
            )

            # Run inference
            encode_slide_tiles(slide_id)

            # Upload embeddings
            gcs_target = f"{embedding_root_gcs}/{slide_id}"
            if local_embedding_dir.is_dir():
                produced = sorted(p for p in local_embedding_dir.rglob("*") if p.is_file())
            else:
                produced = []
            if not produced:
                print(f"No embeddings produced for {slide_id}; nothing to upload.")
                continue

            # Upload to explicit object names. `gsutil cp -r <dir> <prefix>`
            # nests the directory (<prefix>/<dir>/...) when the destination
            # prefix already exists, so a re-run would change the layout.
            for produced_file in produced:
                relative_name = produced_file.relative_to(local_embedding_dir).as_posix()
                subprocess.run(
                    ["gsutil", "cp", str(produced_file), f"{gcs_target}/{relative_name}"],
                    check=True,
                )
            print(f"‚òÅÔ∏è Uploaded embeddings for {slide_id} to {gcs_target}")
        finally:
            # Cleanup (also on failure, so partial downloads do not fill the disk)
            shutil.rmtree(local_slide_dir, ignore_errors=True)
            shutil.rmtree(local_embedding_dir, ignore_errors=True)


In [None]:
# Process every slide of the configured split: download tiles, encode, upload, clean up.
run_encoder_pipeline_from_gcs()



üîΩ Downloading tiles for BRACS_1003660...
Found 1 tiles for BRACS_1003660


üß† Encoding BRACS_1003660: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.02s/it]


üìä BRACS_1003660: embeddings shape torch.Size([1, 1536]), coords shape torch.Size([1, 2])
üß© BRACS_1003660: total 1 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1003660/BRACS_1003660_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1003660 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1003660

üîΩ Downloading tiles for BRACS_1003661...
Found 282 tiles for BRACS_1003661


üß† Encoding BRACS_1003661: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 282/282 [00:11<00:00, 23.57it/s]


üìä BRACS_1003661: embeddings shape torch.Size([282, 1536]), coords shape torch.Size([282, 2])
üß© BRACS_1003661: total 282 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1003661/BRACS_1003661_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1003661 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1003661

üîΩ Downloading tiles for BRACS_1261...
Found 3334 tiles for BRACS_1261


üß† Encoding BRACS_1261: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3334/3334 [02:21<00:00, 23.59it/s]


üìä BRACS_1261: embeddings shape torch.Size([3334, 1536]), coords shape torch.Size([3334, 2])
üß© BRACS_1261: total 3334 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1261/BRACS_1261_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1261 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1261

üîΩ Downloading tiles for BRACS_1271...
Found 861 tiles for BRACS_1271


üß† Encoding BRACS_1271: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 861/861 [00:36<00:00, 23.58it/s]


üìä BRACS_1271: embeddings shape torch.Size([861, 1536]), coords shape torch.Size([861, 2])
üß© BRACS_1271: total 861 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1271/BRACS_1271_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1271 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1271

üîΩ Downloading tiles for BRACS_1275...
Found 2614 tiles for BRACS_1275


üß† Encoding BRACS_1275: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2614/2614 [01:50<00:00, 23.59it/s]


üìä BRACS_1275: embeddings shape torch.Size([2614, 1536]), coords shape torch.Size([2614, 2])
üß© BRACS_1275: total 2614 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1275/BRACS_1275_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1275 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1275

üîΩ Downloading tiles for BRACS_1276...
Found 4842 tiles for BRACS_1276


üß† Encoding BRACS_1276: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4842/4842 [03:25<00:00, 23.57it/s]


üìä BRACS_1276: embeddings shape torch.Size([4842, 1536]), coords shape torch.Size([4842, 2])
üß© BRACS_1276: total 4842 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1276/BRACS_1276_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1276 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1276

üîΩ Downloading tiles for BRACS_1295...
Found 3954 tiles for BRACS_1295


üß† Encoding BRACS_1295: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3954/3954 [02:47<00:00, 23.59it/s]


üìä BRACS_1295: embeddings shape torch.Size([3954, 1536]), coords shape torch.Size([3954, 2])
üß© BRACS_1295: total 3954 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1295/BRACS_1295_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1295 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1295

üîΩ Downloading tiles for BRACS_1296...
Found 7265 tiles for BRACS_1296


üß† Encoding BRACS_1296: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7265/7265 [05:07<00:00, 23.60it/s]


üìä BRACS_1296: embeddings shape torch.Size([7265, 1536]), coords shape torch.Size([7265, 2])
üß© BRACS_1296: total 7265 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1296/BRACS_1296_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1296 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1296

üîΩ Downloading tiles for BRACS_1361...
Found 3340 tiles for BRACS_1361


üß† Encoding BRACS_1361: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3340/3340 [02:21<00:00, 23.59it/s]


üìä BRACS_1361: embeddings shape torch.Size([3340, 1536]), coords shape torch.Size([3340, 2])
üß© BRACS_1361: total 3340 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1361/BRACS_1361_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1361 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1361

üîΩ Downloading tiles for BRACS_1362...
Found 2643 tiles for BRACS_1362


üß† Encoding BRACS_1362: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2643/2643 [01:52<00:00, 23.58it/s]


üìä BRACS_1362: embeddings shape torch.Size([2643, 1536]), coords shape torch.Size([2643, 2])
üß© BRACS_1362: total 2643 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1362/BRACS_1362_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1362 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1362

üîΩ Downloading tiles for BRACS_1366...
Found 3024 tiles for BRACS_1366


üß† Encoding BRACS_1366: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3024/3024 [02:08<00:00, 23.58it/s]


üìä BRACS_1366: embeddings shape torch.Size([3024, 1536]), coords shape torch.Size([3024, 2])
üß© BRACS_1366: total 3024 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1366/BRACS_1366_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1366 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1366

üîΩ Downloading tiles for BRACS_1367...
Found 850 tiles for BRACS_1367


üß† Encoding BRACS_1367: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 850/850 [00:36<00:00, 23.57it/s]


üìä BRACS_1367: embeddings shape torch.Size([850, 1536]), coords shape torch.Size([850, 2])
üß© BRACS_1367: total 850 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1367/BRACS_1367_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1367 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1367

üîΩ Downloading tiles for BRACS_1368...
Found 2345 tiles for BRACS_1368


üß† Encoding BRACS_1368: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2345/2345 [01:39<00:00, 23.58it/s]


üìä BRACS_1368: embeddings shape torch.Size([2345, 1536]), coords shape torch.Size([2345, 2])
üß© BRACS_1368: total 2345 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1368/BRACS_1368_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1368 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1368

üîΩ Downloading tiles for BRACS_1392...
Found 3724 tiles for BRACS_1392


üß† Encoding BRACS_1392: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3724/3724 [02:37<00:00, 23.58it/s]


üìä BRACS_1392: embeddings shape torch.Size([3724, 1536]), coords shape torch.Size([3724, 2])
üß© BRACS_1392: total 3724 tile embeddings stacked.
‚úÖ Saved slide embeddings to /content/embeddings/val/BRACS_1392/BRACS_1392_embeddings.pt
‚òÅÔ∏è Uploaded embeddings for BRACS_1392 to gs://bracs-dataset-bucket/Embeddings/val/BRACS_1392

üîΩ Downloading tiles for BRACS_1393...
Found 3548 tiles for BRACS_1393


üß† Encoding BRACS_1393:  40%|‚ñà‚ñà‚ñà‚ñâ      | 1407/3548 [00:59<01:31, 23.50it/s]