In [2]:
# Import modules
from pathlib import Path
import pandas as pd
import glob
import sys
from tqdm import tqdm
from itertools import islice

# Add the current directory to the import path so the local utility modules
# can be imported. The membership check keeps sys.path from gaining a
# duplicate entry each time this cell is re-run.
current_path = Path().resolve()
if str(current_path) not in sys.path:
    sys.path.append(str(current_path))

# Import custom utilities
from label_data_utils import generate_label_csv
from augment_data_utils import augment_images_from_csv

In [3]:
# Define root directory and CSV paths
ROOT_DIR = Path('C:/Users/rsriram3/Documents/ind_study')  # Update this with your Windows path
# Uses the same file name that the save cell assigns to LABELLED_CSV before
# writing, so this constant and the file actually produced agree.
LABELLED_CSV = ROOT_DIR / "all_image_metrics.csv"
AUGMENTED_CSV = ROOT_DIR / "augmented_images_metrics.csv"

In [4]:
# Accumulator for the scanned image records; reset to empty whenever this
# cell is executed.
original_data = list()

# Image folders of each dataset, expressed relative to ROOT_DIR.
SHH_PATHS = [
    "ShanghaiTech Data/SHHA/images",
    "ShanghaiTech Data/SHHB/images",
]

VEDAI_PATHS = [
    "VEDAI_dataset/VEDAI_512/images",
    "VEDAI_dataset/VEDAI_1024/images",
]

In [5]:
def safe_image_scan(path, label, extensions=(".png", ".jpg", ".jpeg"), root=None, records=None):
    """Recursively collect image files under ``path`` and record them with ``label``.

    Each matching file is stored as ``{"image": <path relative to root>,
    "label": label}``. A file whose relative path is already present in
    ``records`` is skipped, so re-running a scan does not duplicate rows.

    Args:
        path: Directory to scan, relative to ``root``.
        label: Value stored under the "label" key of every new record.
        extensions: File suffixes to match, each including the leading dot.
        root: Base directory that ``path`` and the stored image paths are
            relative to. Defaults to the notebook-level ``ROOT_DIR``.
        records: List that receives the new record dicts. Defaults to the
            notebook-level ``original_data``.

    Returns:
        The number of records added by this call. This is 0 when the
        directory does not exist, and a partial count when the scan is
        interrupted from the keyboard.
    """
    base_dir = ROOT_DIR if root is None else Path(root)
    sink = original_data if records is None else records
    full_path = base_dir / path

    # rglob yields nothing for a missing directory, which would otherwise
    # look identical to "no images found"; report it explicitly instead.
    if not full_path.is_dir():
        print(f"Directory not found, nothing scanned: {full_path}")
        return 0

    # Relative paths already recorded, used to keep the scan idempotent.
    seen = {record["image"] for record in sink}
    count = 0
    try:
        # The handler wraps the whole traversal: an interrupt is far more
        # likely to arrive while rglob walks the disk than during the append.
        for ext in extensions:
            for img_path in full_path.rglob(f"*{ext}"):
                rel_img = str(img_path.relative_to(base_dir))
                if rel_img in seen:
                    continue
                seen.add(rel_img)
                sink.append({"image": rel_img, "label": label})
                count += 1
                if count % 500 == 0:
                    print(f" → {count} images processed from {path}")
    except KeyboardInterrupt:
        print(f"Interrupted while scanning {path} at {count} files.")
    return count

In [6]:
# Register every VEDAI image (PNG files only) under label 0.
for path in VEDAI_PATHS:
    print("Scanning VEDAI path:", path)
    safe_image_scan(path, 0, extensions=(".png",))

Scanning VEDAI path: VEDAI_dataset/VEDAI_512/images
 → 500 images processed from VEDAI_dataset/VEDAI_512/images
 → 1000 images processed from VEDAI_dataset/VEDAI_512/images
 → 1500 images processed from VEDAI_dataset/VEDAI_512/images
 → 2000 images processed from VEDAI_dataset/VEDAI_512/images
 → 2500 images processed from VEDAI_dataset/VEDAI_512/images
 → 3000 images processed from VEDAI_dataset/VEDAI_512/images
 → 3500 images processed from VEDAI_dataset/VEDAI_512/images
 → 4000 images processed from VEDAI_dataset/VEDAI_512/images
 → 4500 images processed from VEDAI_dataset/VEDAI_512/images
 → 5000 images processed from VEDAI_dataset/VEDAI_512/images
 → 5500 images processed from VEDAI_dataset/VEDAI_512/images
 → 6000 images processed from VEDAI_dataset/VEDAI_512/images
 → 6500 images processed from VEDAI_dataset/VEDAI_512/images
 → 7000 images processed from VEDAI_dataset/VEDAI_512/images
 → 7500 images processed from VEDAI_dataset/VEDAI_512/images
 → 8000 images processed from VEDA

In [7]:
# Register every ShanghaiTech image (JPG/JPEG files) under label 1.
for path in SHH_PATHS:
    print("Scanning SHH path:", path)
    safe_image_scan(path, 1, extensions=(".jpg", ".jpeg"))

Scanning SHH path: ShanghaiTech Data/SHHA/images
 → 500 images processed from ShanghaiTech Data/SHHA/images
 → 1000 images processed from ShanghaiTech Data/SHHA/images
 → 1500 images processed from ShanghaiTech Data/SHHA/images
Scanning SHH path: ShanghaiTech Data/SHHB/images
 → 500 images processed from ShanghaiTech Data/SHHB/images
 → 1000 images processed from ShanghaiTech Data/SHHB/images
 → 1500 images processed from ShanghaiTech Data/SHHB/images
 → 2000 images processed from ShanghaiTech Data/SHHB/images
 → 2500 images processed from ShanghaiTech Data/SHHB/images


In [8]:
# Sanity-check the scanned records before any metrics are computed:
# overall size, per-label counts, and a peek at the first rows.
scanned_df = pd.DataFrame(original_data)
print("--- Dataset Summary ---")
print(f"Total images: {scanned_df.shape[0]}")
print(f"Label counts: {scanned_df['label'].value_counts()}")
print(f"DataFrame shape: {scanned_df.shape}")
print("First few rows:")
print(scanned_df.head())

--- Dataset Summary ---
Total images: 18775
Label counts: label
0    13983
1     4792
Name: count, dtype: int64
DataFrame shape: (18775, 2)
First few rows:
                                               image  label
0     VEDAI_dataset\VEDAI_512\images\00000000_co.png      0
1     VEDAI_dataset\VEDAI_512\images\00000000_ir.png      0
2  VEDAI_dataset\VEDAI_512\images\00000000_ir_tri...      0
3  VEDAI_dataset\VEDAI_512\images\00000000_ir_tri...      0
4  VEDAI_dataset\VEDAI_512\images\00000000_ir_tri...      0


In [9]:
# Destination for the per-image metrics of the original (non-augmented) set.
LABELLED_CSV = ROOT_DIR.joinpath("all_image_metrics.csv")

# Step 1: write the label-only records; generate_label_csv is handed this
# file's path below.
LABELLED_CSV.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(original_data).to_csv(LABELLED_CSV, index=False)

# Step 2: compute the metrics and overwrite the same CSV with the result.
# NOTE(review): presumably generate_label_csv reads the CSV written in step 1
# and resolves image paths against ROOT_DIR; confirm in label_data_utils.
labelled_df = generate_label_csv(LABELLED_CSV, ROOT_DIR)
labelled_df.to_csv(LABELLED_CSV, index=False)
print(f"Saved original image metrics to: {LABELLED_CSV}")

Computing image features: 100%|██████████| 18775/18775 [05:29<00:00, 56.90it/s]


Saved original image metrics to: C:\Users\rsriram3\Documents\ind_study\all_image_metrics.csv
