In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingf

In [5]:
from datasets import Dataset, DatasetDict, Features, Value, Image
import os
import pandas as pd

# Paths to your folders.
# Each location can be overridden through an environment variable so the
# notebook runs on other machines without editing the code; the defaults are
# the original local paths.
ground_truth_dir = os.environ.get(
    "PSF_GT_DIR", "/Users/prajakta/Downloads/psf5_gt_cropped"
)
psf_convolved_dir = os.environ.get(
    "PSF_CONVOLVED_DIR", "/Users/prajakta/Downloads/psf5_dat_cropped"
)

In [6]:
# Step 1: Match files based on common prefix
GT_SUFFIX = "_gt.png"
PSF_SUFFIX = "_convolved.png"

gt_files = sorted(f for f in os.listdir(ground_truth_dir) if f.endswith(GT_SUFFIX))
psf_files = sorted(f for f in os.listdir(psf_convolved_dir) if f.endswith(PSF_SUFFIX))

# Extract common base names (e.g., '100_B11_1_blue_maxcrop').
# The suffix is sliced off the end rather than removed with str.replace(),
# which would also delete the same text if it occurred earlier in a filename
# and produce a base name that no longer maps back to a real file.
gt_basenames = {f[:-len(GT_SUFFIX)] for f in gt_files}
psf_basenames = {f[:-len(PSF_SUFFIX)] for f in psf_files}

common_basenames = sorted(gt_basenames & psf_basenames)

# Report files that exist on only one side instead of dropping them silently.
unmatched_basenames = sorted(gt_basenames ^ psf_basenames)
if unmatched_basenames:
    print(f"⚠️ Skipping {len(unmatched_basenames)} file(s) without a matching pair.")

# Step 2: Create file pairs
# "image" is the PSF-convolved input, "label" is the ground-truth target.
file_pairs = [{
    "image": os.path.join(psf_convolved_dir, f"{basename}{PSF_SUFFIX}"),
    "label": os.path.join(ground_truth_dir, f"{basename}{GT_SUFFIX}")
} for basename in common_basenames]

print(f"✅ Matched {len(file_pairs)} image pairs.")

# Fail here with a clear message: nothing downstream can work without pairs.
if not file_pairs:
    raise ValueError(
        f"No '*{PSF_SUFFIX}' / '*{GT_SUFFIX}' pairs found in "
        f"{psf_convolved_dir!r} and {ground_truth_dir!r}."
    )

# Step 3: Convert to HF Dataset
# Both columns hold file paths and are declared as Image features.
df = pd.DataFrame(file_pairs)
features = Features({column: Image() for column in ("image", "label")})
dataset = Dataset.from_pandas(df, features=features)

# Step 4: Save locally (optional before uploading)
output_path = "/Users/prajakta/Downloads/human_protein_atlas_cells_dataset"
dataset.save_to_disk(output_path)
print(f"📁 Dataset saved to {output_path}")

✅ Matched 1000 image pairs.


Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 1497.30 examples/s]

📁 Dataset saved to /Users/prajakta/Downloads/human_protein_atlas_cells_dataset





In [12]:
from datasets import load_from_disk

# Reload the dataset written by the save step and look at its first record.
dataset = load_from_disk("/Users/prajakta/Downloads/human_protein_atlas_cells_dataset")

first_pair = dataset[0]
print(first_pair.keys())  # ✅ ['image', 'label']

# Show the input image, then its ground-truth label.
for column in ("image", "label"):
    first_pair[column].show()

dict_keys(['image', 'label'])


In [15]:
import os

from datasets import load_from_disk
from huggingface_hub import login

# Authenticate without embedding the secret in the notebook: the token is read
# from the HF_TOKEN environment variable. When the variable is unset, `token`
# is None and `login` falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Load your dataset from local disk
dataset = load_from_disk("/Users/prajakta/Downloads/human_protein_atlas_cells_dataset")

# Push to Hugging Face Hub (the returned commit info is shown as the cell output)
dataset.push_to_hub("prajaktakini/human_protein_atlas_cells_dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Map:   0%|          | 0/1000 [00:00<?, ? examples/s][A
Map: 100%|██████████| 1000/1000 [00:00<00:00, 6618.62 examples/s][A

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 86.52ba/s][A
Uploading the dataset shards: 100%|██████████| 1/1 [00:49<00:00, 49.65s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/prajaktakini/human_protein_atlas_cells_dataset/commit/34c33a895efd6d1a589529a718316a1b91301ae5', commit_message='Upload dataset', commit_description='', oid='34c33a895efd6d1a589529a718316a1b91301ae5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/prajaktakini/human_protein_atlas_cells_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='prajaktakini/human_protein_atlas_cells_dataset'), pr_revision=None, pr_num=None)

In [17]:

import numpy as np
from scipy.io import savemat
# NOTE: this rebinds `Image` to the PIL module, replacing the `Image` feature
# class imported from `datasets` in an earlier cell. Re-running that earlier
# cell after this one would pick up the PIL module instead.
from PIL import Image
from datasets import load_dataset

# Target size handed to PIL's resize() for every image.
resize_shape = (128, 128)

# Load the published dataset back from the Hugging Face Hub.
HF_DATASET_REPO = "prajaktakini/human_protein_atlas_cells_dataset"
hf_dataset = load_dataset(HF_DATASET_REPO, split="train")

# Sanity check: (rows, columns) followed by the first record.
for summary in (hf_dataset.shape, hf_dataset[0]):
    print(summary)

# Inspect PSF value range to validate normalization.
# Only per-image scalars are kept, so each full-resolution float32 array can be
# released as soon as it has been summarised instead of holding every image in
# memory at once (about 1 GiB for 1000 images of 512x512).
n_inspect = min(1000, len(hf_dataset))  # clamp: select() raises on out-of-range indices
if n_inspect == 0:
    raise ValueError("hf_dataset is empty; nothing to inspect.")

per_image_max = []
per_image_mean = []
for sample in hf_dataset.select(range(n_inspect)):
    im = np.array(sample["image"].convert("L"), dtype=np.float32)
    per_image_max.append(np.max(im))
    per_image_mean.append(np.mean(im))

psf_max = np.max(per_image_max)    # brightest pixel over the inspected images
psf_mean = np.mean(per_image_mean)  # mean of the per-image means
print(f"🔍 PSF Sample Max: {psf_max:.2f}, Mean: {psf_mean:.2f}")
if psf_max < 10:
    print("✅ measNormalization ~5 is likely appropriate.")
else:
    print("⚠️ Consider adjusting measNormalization. Peak PSF intensity is high.")

# Set normalization
# NOTE(review): 77.77 matches the PSF mean printed in the recorded run and
# appears to have been copied from it; confirm this is the intended divisor.
measNormalization = 77.77


def _as_gray_array(pil_image):
    """Convert to 'L' mode, resize to `resize_shape` (bicubic), return float32 array."""
    resized = pil_image.convert("L").resize(resize_shape, Image.BICUBIC)
    return np.array(resized, dtype=np.float32)


# One pass over the dataset: (normalized PSF input, raw ground truth) per record.
converted = [
    (_as_gray_array(record["image"]) / measNormalization, _as_gray_array(record["label"]))
    for record in hf_dataset
]

# Stack to [H, W, N]
diff_L = np.stack([psf for psf, _ in converted], axis=-1)    # PSF (input)
truthIms = np.stack([gt for _, gt in converted], axis=-1)    # Ground truth

# Save to .mat
output_matfile = "hf_microscopy_dataset.mat"
mat_payload = {"diff_L": diff_L, "truthIms": truthIms}
savemat(output_matfile, mat_payload)

print(f"✅ Saved {output_matfile} with shape diff_L: {diff_L.shape}, truthIms: {truthIms.shape}")

Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 1200.29 examples/s]


(1000, 2)
{'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x11E53F9D0>, 'label': <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x11E53FB80>}
🔍 PSF Sample Max: 255.00, Mean: 77.77
⚠️ Consider adjusting measNormalization. Peak PSF intensity is high.
✅ Saved hf_microscopy_dataset.mat with shape diff_L: (128, 128, 1000), truthIms: (128, 128, 1000)
