In [1]:

!git clone https://github.com/sudeonder/wsi-retrieval.git
%cd wsi-retrieval

import sys
sys.path.append("wsi-retrieval")


Cloning into 'wsi-retrieval'...
remote: Enumerating objects: 102, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 102 (delta 42), reused 56 (delta 15), pack-reused 0 (from 0)[K
Receiving objects: 100% (102/102), 546.03 KiB | 4.20 MiB/s, done.
Resolving deltas: 100% (42/42), done.
/content/wsi-retrieval


In [2]:
# Install required build tools and compile crcmod from source
# python3-dev provides the headers needed to build the C extension;
# `--no-binary :all:` makes pip build from the source distribution instead of
# using a prebuilt wheel.
# NOTE(review): presumably done so gsutil can use the compiled crcmod for
# checksum validation — confirm this is still required.
!apt-get install -y python3-dev
!pip install --no-binary :all: crcmod

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3-dev is already the newest version (3.10.6-1~22.04.1).
python3-dev set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Collecting crcmod
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: crcmod
  Building wheel for crcmod (setup.py) ... [?25l[?25hdone
  Created wheel for crcmod: filename=crcmod-1.7-cp311-cp311-linux_x86_64.whl size=31659 sha256=fdaaba4b247898c27be657dda6cb60d082f9d7cfb12032547cb776edcf24f888
  Stored in directory: /root/.cache/pip/wheels/23/94/7a/8cb7d14597e6395ce969933f01aed9ea8fa5f5b4d4c8a61e99
Successfully built crcmod
Installing collected packages: crcmod
Successfully installed crcmod-1.7


In [3]:
# Install necessary packages
# System side: the OpenSlide command-line tools (apt pulls in libopenslide0).
# Python side: the OpenSlide bindings plus numpy, pillow and tqdm.
!apt-get install -y openslide-tools
!pip install openslide-python numpy pillow tqdm


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libopenslide0
Suggested packages:
  libtiff-tools
The following NEW packages will be installed:
  libopenslide0 openslide-tools
0 upgraded, 2 newly installed, 0 to remove and 34 not upgraded.
Need to get 104 kB of archives.
After this operation, 297 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libopenslide0 amd64 3.4.1+dfsg-5build1 [89.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 openslide-tools amd64 3.4.1+dfsg-5build1 [13.8 kB]
Fetched 104 kB in 1s (153 kB/s)
Selecting previously unselected package libopenslide0.
(Reading database ... 126102 files and directories currently installed.)
Preparing to unpack .../libopenslide0_3.4.1+dfsg-5build1_amd64.deb ...
Unpacking libopenslide0 (3.4.1+dfsg-5build1) ...
Selecting previously unselected package openslide-tools.

In [4]:
import os
from pathlib import Path
from utils.tile_utils import tile_wsi_if_occupied


In [5]:
# 📦 Step 1: Install dependencies
!pip install --quiet openslide-python
!apt-get install -y -qq openslide-tools
!pip install --upgrade google-cloud-storage

# 📂 Step 2: Set up GCS access
from google.colab import auth
auth.authenticate_user()

from google.cloud import storage
from pathlib import Path
import os



Collecting google-cloud-storage
  Downloading google_cloud_storage-3.1.0-py2.py3-none-any.whl.metadata (12 kB)
Downloading google_cloud_storage-3.1.0-py2.py3-none-any.whl (174 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.9/174.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-storage
  Attempting uninstall: google-cloud-storage
    Found existing installation: google-cloud-storage 2.19.0
    Uninstalling google-cloud-storage-2.19.0:
      Successfully uninstalled google-cloud-storage-2.19.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-aiplatform 1.92.0 requires google-cloud-storage<3.0.0,>=1.32.0, but you have google-cloud-storage 3.1.0 which is incompatible.[0m[31m
[0mSuccessfully installed google-cloud-storage-3.1.0


In [6]:
# ✅ Step 2: Authenticate with Google Cloud and (optionally) mount Google Drive
from google.colab import auth, drive

# A single call is sufficient; the cell previously imported `auth` and
# authenticated twice, which added nothing.
auth.authenticate_user()

# Set your bucket name here
BUCKET_NAME = "bracs-dataset-bucket"

# Mount Google Drive. Note this is Drive, not the GCS bucket: the bucket is
# accessed through gsutil in later cells.
drive.mount('/content/drive')  # Optional if saving to Drive


Mounted at /content/drive


In [7]:
# ✅ Step 3: Import libraries and function
from pathlib import Path
import os
import subprocess
from utils.tile_utils import tile_wsi_if_occupied  # assumes you've written the .py file

# WSI paths
# `splits` is not referenced by the cells visible in this notebook; the driver
# call passes the split name explicitly.
splits = ['train']  # Extend to ['train', 'val', 'test'] as needed
# Root of the slides on GCS; BUCKET_NAME is defined in the authentication cell.
wsi_root_gcs = f"gs://{BUCKET_NAME}/BRACS/BRACS_WSI"
# Local scratch directory where each slide is downloaded before tiling.
local_wsi_root = Path("/content/wsi")
# Local directory that receives the tiles, one sub-folder per split.
output_tile_root = Path("/content/tiles")


In [8]:
# ✅ Step 4: Helper to list files in GCS folder using gsutil
def list_wsi_files(split: str):
    """
    Recursively list all .svs files in the split directory on GCS.
    Assumes WSIs are stored under split/Group_*/Type_*/WSI.svs.

    Runs ``gsutil ls`` with a ``**`` wildcard rooted at
    ``{wsi_root_gcs}/{split}`` (``wsi_root_gcs`` is a notebook-level constant).

    Args:
        split: Name of the split sub-directory, e.g. ``"train"``.

    Returns:
        A list of ``gs://`` URLs, one per slide. The list is empty when
        gsutil fails or when nothing matches.
    """
    wsi_root = f"{wsi_root_gcs}/{split}"
    pattern = f"{wsi_root}/**.svs"
    # Log the pattern that is actually executed (the old message showed a
    # different one).
    print(f"🔍 Listing WSIs under: {pattern}")

    # Pass an argument list instead of splitting a command string so that a
    # path containing whitespace is not broken into several arguments.
    result = subprocess.run(["gsutil", "ls", pattern], capture_output=True, text=True)

    if result.returncode != 0:
        print("❌ Error while listing WSI files.")
        print(result.stderr)
        return []

    # Drop blank lines: ''.split('\n') yields [''], which used to be reported
    # as "1 WSI" and handed downstream as an empty path.
    wsi_files = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    print(f"✅ Found {len(wsi_files)} WSIs.")
    return wsi_files


In [13]:
from pathlib import Path
import subprocess
from utils.tile_utils import tile_wsi_if_occupied  # updated version

def tile_wsi_split_iteratively(
    split: str,
    max_wsi: int = None,
    patch_size: int = 448,
    stride: int = 448,
    patch_occupancy_threshold: float = 0.5,
    level: int = 0,
    resize_dim: int = 224,
    use_otsu: bool = True,
    generate_thumbnail: bool = True,
    debug: bool = False
):
    """
    Download, tile and upload the WSIs of one split, one slide at a time.

    For each slide returned by ``list_wsi_files(split)`` the function:
      1. copies the .svs from GCS to ``local_wsi_root`` with ``gsutil cp``;
      2. tiles it with ``tile_wsi_if_occupied`` into ``output_tile_root/split``;
      3. syncs ``output_tile_root/split/<slide_id>`` to
         ``gs://{BUCKET_NAME}/Tiles/{split}/{slide_id}`` with ``gsutil rsync``;
      4. removes the local tile folder;
      5. always deletes the local .svs, even if a previous step failed.

    A failure on one slide is printed and does not stop the remaining slides.
    When tiling or upload fails, the local tile folder for that slide is left
    in place.

    Args:
        split: Split name, used for both the GCS source folder and the output
            sub-folder.
        max_wsi: If truthy, process only the first ``max_wsi`` slides.
        patch_size, stride, patch_occupancy_threshold, level, resize_dim,
        use_otsu, generate_thumbnail, debug: Forwarded unchanged to
            ``tile_wsi_if_occupied``; see that function for their meaning.

    Returns:
        None.
    """
    # Imported here because the notebook never imports shutil at cell level.
    # Without it the clean-up below raised NameError after every successful
    # upload, which the broad `except` reported as a processing error while
    # the local tiles were left on disk.
    import shutil

    print(f"\n🔁 Tiling WSIs for split: {split}")

    wsi_files = list_wsi_files(split)
    if max_wsi:
        wsi_files = wsi_files[:max_wsi]

    print(f"✅ Found {len(wsi_files)} WSIs.\n")

    for i, wsi_gcs_path in enumerate(wsi_files, 1):
        slide_id = Path(wsi_gcs_path).stem

        print(f"🔽 [{i}/{len(wsi_files)}] Downloading {slide_id}...")

        local_wsi_path = local_wsi_root / f"{slide_id}.svs"
        output_dir = output_tile_root / split  # do not include slide_id here; it's handled inside

        local_wsi_path.parent.mkdir(parents=True, exist_ok=True)
        output_dir.mkdir(parents=True, exist_ok=True)

        try:
            subprocess.run(["gsutil", "cp", wsi_gcs_path, str(local_wsi_path)], check=True)

            tile_wsi_if_occupied(
                wsi_path=local_wsi_path,
                output_dir=output_dir,
                patch_size=patch_size,
                stride=stride,
                resize_dim=resize_dim,
                level=level,
                patch_occupancy_threshold=patch_occupancy_threshold,
                use_otsu=use_otsu,
                generate_thumbnail=generate_thumbnail,
                debug=debug
            )

            # Upload to GCS
            gcs_tile_path = f"gs://{BUCKET_NAME}/Tiles/{split}/{slide_id}"
            subprocess.run(["gsutil", "-m", "rsync", "-r", str(output_dir / slide_id), gcs_tile_path], check=True)

            # Clean up local tiles
            shutil.rmtree(output_dir / slide_id)

        except Exception as e:
            # Best effort: report the failure and continue with the next slide.
            print(f"❌ Error processing {slide_id}: {e}")

        finally:
            # The slide file is always removed so that local disk usage stays
            # bounded to one WSI at a time.
            if local_wsi_path.exists():
                local_wsi_path.unlink()
                print(f"🧹 Deleted {slide_id}.svs from local storage.")


In [11]:
# Trial run: process only the first two WSIs of the 'train' split with
# non-overlapping 448-pixel patches (stride == patch_size).
# NOTE(review): the cell that defines tile_wsi_split_iteratively must be run
# before this one on a fresh kernel.
tile_wsi_split_iteratively(
    split='train',
    max_wsi=2,
    patch_size=448,
    stride=448,
    patch_occupancy_threshold=0.5,
    resize_dim=224,
    use_otsu=True,
    generate_thumbnail=True,
    debug=False
)



🔁 Tiling WSIs for split: train
🔍 Listing WSIs under: gs://bracs-dataset-bucket/BRACS/BRACS_WSI/train/**/*.svs
✅ Found 395 WSIs.
✅ Found 2 WSIs.

🔽 [1/2] Downloading BRACS_1003728...


KeyboardInterrupt: 

In [None]:
# Manual check: list the train-split .svs objects directly with gsutil, using
# the same `**.svs` wildcard as list_wsi_files. The bucket name is written out
# literally here rather than taken from BUCKET_NAME.
!gsutil ls -r gs://bracs-dataset-bucket/BRACS/BRACS_WSI/train/**.svs