In [19]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import os
import glob

In [13]:
# === Config ===
# Project root; the output folder and every dataset folder below are built from it.
# NOTE(review): machine-specific absolute path — must be edited before running on another machine.
ROOT_DIR = Path("C:/Users/rsriram3/Documents/ind_study")
OUTPUT_DIR = ROOT_DIR / "data"
# Create the output folder (and any missing parents); does nothing if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Paths for the train / test split CSVs inside OUTPUT_DIR.
# NOTE(review): not referenced by the cells shown here — save_train_test builds its own file names.
TRAIN_OUT = OUTPUT_DIR / "train_split.csv"
TEST_OUT = OUTPUT_DIR / "test_split.csv"

# Image folders, one Path per dataset variant.
# NOTE(review): the labelling functions shown here take the string lists VEDAI_PATHS / SHH_PATHS instead.
VEDAI_1024 = ROOT_DIR / "VEDAI_dataset/VEDAI_1024/images"
VEDAI_512 = ROOT_DIR / "VEDAI_dataset/VEDAI_512/images"
SHHA = ROOT_DIR / "ShanghaiTech Data/SHHA/images"
SHHB = ROOT_DIR / "ShanghaiTech Data/SHHB/images"


In [22]:
# CSV paths inside OUTPUT_DIR. OUTPUT_DIR is already a Path, so the `/` operator
# yields a Path directly and no extra Path(...) wrap is needed.
OUTPUT_FILE = OUTPUT_DIR / 'labelled_images.csv'
SHUFFLED_FILE = OUTPUT_DIR / 'shuffled_labelled_images.csv'

In [23]:
# Display the resolved path of the labelled-images CSV.
OUTPUT_FILE

WindowsPath('C:/Users/rsriram3/Documents/ind_study/data/labelled_images.csv')

In [24]:
# Image folders given as strings relative to ROOT_DIR; the labelling functions
# join each entry with their root_dir argument.
# VEDAI folders -> label 0.
VEDAI_PATHS = [
    "VEDAI_dataset/VEDAI_512/images",
    "VEDAI_dataset/VEDAI_1024/images"
]

# ShanghaiTech (SHHA / SHHB) folders -> label 1.
SHH_PATHS = [
    "ShanghaiTech Data/SHHA/images",
    "ShanghaiTech Data/SHHB/images"
]

In [25]:
# Label Images
def label_image(v_paths=VEDAI_PATHS, sh_paths=SHH_PATHS, root_dir=ROOT_DIR):
    """
    Collect image paths and assign a dataset label: 1 for SHHA/SHHB, 0 for VEDAI.

    Every regular file found directly inside each folder is recorded.
    Sub-directories are skipped, and the entries of each folder are sorted so
    that repeated runs produce the same row order (glob itself gives no
    ordering guarantee, and the seeded shuffle downstream depends on it).

    Args:
        v_paths: VEDAI folders, relative to root_dir (label 0).
        sh_paths: SHHA/SHHB folders, relative to root_dir (label 1).
        root_dir: base directory the folders are joined with; str or Path.

    Returns:
        DataFrame with columns ``image`` (path relative to root_dir, as
        produced by os.path.relpath) and ``label``. SHH rows come first,
        followed by VEDAI rows.
    """
    records = []

    # SHH group first, then VEDAI — the same group order as the row layout documented above.
    for label, folders in ((1, sh_paths), (0, v_paths)):
        for folder in folders:
            pattern = os.path.join(root_dir, folder, '*')
            for entry in sorted(glob.glob(pattern)):
                if not os.path.isfile(entry):
                    continue  # '*' also matches sub-directories; they are not images
                records.append((os.path.relpath(entry, root_dir), label))

    return pd.DataFrame(records, columns=["image", "label"])

In [None]:
# Saving Labeled DataFrame to CSV
def save_label_csv(df, output_file=OUTPUT_FILE):
    """
    Write ``df`` to ``output_file`` as CSV without the index column,
    then print a one-line confirmation with the row count and destination.
    """
    df.to_csv(output_file, index=False)
    confirmation = f"Saved CSV with {len(df)} entries at: {output_file}"
    print(confirmation)

In [14]:
# === Collect image paths and assign labels ===
def collect_images(folder: Path, label: int):
    """
    List the ``*.png`` files directly inside ``folder`` in sorted order.

    Returns a list of dicts, one per file, with keys ``image`` (the file's
    path relative to the module-level ROOT_DIR, as a string) and ``label``
    (the value passed in).
    """
    records = []
    for png_path in sorted(folder.glob("*.png")):
        relative = png_path.relative_to(ROOT_DIR)
        records.append({'image': str(relative), 'label': label})
    return records

In [None]:
import os
import glob
import pandas as pd
from pathlib import Path

# === Config (repeated) ===
# NOTE(review): this cell re-assigns ROOT_DIR, OUTPUT_DIR, TRAIN_OUT, TEST_OUT, OUTPUT_FILE,
# SHUFFLED_FILE, VEDAI_PATHS and SHH_PATHS to the same values as the earlier config cells;
# keep the two in sync or drop one of them.
ROOT_DIR = Path("C:/Users/rsriram3/Documents/ind_study")
OUTPUT_DIR = ROOT_DIR / "data"
# Create the output folder (and any missing parents); does nothing if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# CSV paths inside OUTPUT_DIR.
TRAIN_OUT = OUTPUT_DIR / "train_split.csv"
TEST_OUT = OUTPUT_DIR / "test_split.csv"
OUTPUT_FILE = OUTPUT_DIR / "labelled_images.csv"
SHUFFLED_FILE = OUTPUT_DIR / "shuffled_labelled_images.csv"

# VEDAI image folders, relative to ROOT_DIR (label 0 in label_images).
VEDAI_PATHS = [ 
    "VEDAI_dataset/VEDAI_512/images",
    "VEDAI_dataset/VEDAI_1024/images"
]

# ShanghaiTech image folders, relative to ROOT_DIR (label 1 in label_images).
SHH_PATHS = [
    "ShanghaiTech Data/SHHA/images",
    "ShanghaiTech Data/SHHB/images"
]

In [None]:
# === Function to label all images ===
def label_images(v_paths=VEDAI_PATHS, sh_paths=SHH_PATHS, root_dir=ROOT_DIR):
    """
    Collect image paths and assign labels:
    - 0 for VEDAI (vehicle detection)
    - 1 for SHH (crowd detection)

    SHH folders contribute only ``*.png`` and ``*.jpg`` files; VEDAI folders
    contribute every regular file. Sub-directories are skipped and each
    folder's entries are sorted, so the row order is the same on every run
    (glob gives no ordering guarantee, and the seeded split downstream
    depends on the input order).

    Args:
        v_paths: VEDAI folders, relative to root_dir.
        sh_paths: SHHA/SHHB folders, relative to root_dir.
        root_dir: base directory; a str or a Path.

    Returns:
        DataFrame with columns ``image`` (path relative to root_dir, as
        produced by os.path.relpath) and ``label``. SHH rows come first.
    """
    root_dir = Path(root_dir)  # the `/` joins below need a Path; accept plain strings too

    def _files_matching(folder, patterns):
        """Return the sorted regular files in `folder` matching any of `patterns`."""
        hits = set()
        for pattern in patterns:
            hits.update(glob.glob(str(folder / pattern)))
        return sorted(hit for hit in hits if os.path.isfile(hit))

    data = []

    # Label SHHA / SHHB images
    for path in sh_paths:
        for img in _files_matching(root_dir / path, ("*.png", "*.jpg")):
            data.append((os.path.relpath(img, root_dir), 1))  # Label 1 for SHH

    # Label VEDAI images
    for path in v_paths:
        for img in _files_matching(root_dir / path, ("*",)):
            data.append((os.path.relpath(img, root_dir), 0))  # Label 0 for VEDAI

    return pd.DataFrame(data, columns=["image", "label"])


In [35]:
# === Save the labeled dataframe ===
def save_label_csv(df, output_file=OUTPUT_FILE):
    """
    Write ``df`` to ``output_file`` as CSV without the index column,
    then print a confirmation line with the row count and destination.
    """
    df.to_csv(output_file, index=False)
    confirmation = f"✅ Saved {len(df)} labeled entries to: {output_file}"
    print(confirmation)

In [36]:
# Build the labelled table from the default folders, then write it to the default CSV path.
df_labeled = label_images()
save_label_csv(df_labeled)

✅ Saved 4156 labeled entries to: C:\Users\rsriram3\Documents\ind_study\data\labelled_images.csv


In [37]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === Load CSV ===
# Read through the path declared right here, so this cell does not depend on
# OUTPUT_FILE having been defined by another cell.
csv_path = Path("C:/Users/rsriram3/Documents/ind_study/data/labelled_images.csv")
df = pd.read_csv(csv_path)

# === Set root directory: the 'image' column holds paths relative to this folder ===
ROOT_DIR = Path("C:/Users/rsriram3/Documents/ind_study")  # You can change this if the image folders are nested

# === Check if all image files exist ===
missing_files = []

print("Checking if image files exist...\n")
for rel_path in tqdm(df['image'], desc="Verifying images"):
    image_path = ROOT_DIR / rel_path
    if not image_path.exists():
        missing_files.append(str(image_path))

# === Report missing files ===
if missing_files:
    print(f"\n❌ {len(missing_files)} missing image(s):")
    for path in missing_files[:10]:  # only the first 10 are listed
        print(" -", path)
else:
    print("\n✅ All image files found!")


Checking if image files exist...



Verifying images: 100%|██████████| 4156/4156 [00:00<00:00, 38073.99it/s]


✅ All image files found!





In [38]:
# Split the labelled table by class: label 0 rows and label 1 rows.
class_0 = df.loc[df['label'].eq(0)]
class_1 = df.loc[df['label'].eq(1)]

In [None]:
# Display both per-class frames as a tuple.
class_0, class_1

In [44]:
from sklearn.utils import shuffle

def sample_data(df, vedai_train=2000, shh_train=1000, shh_test=198, seed=42):
    """
    Split a labelled table into shuffled train and test frames.

    The table is shuffled once with ``seed`` and separated by label
    (0 = VEDAI, 1 = SHH). Then:
    - Train: the first ``vedai_train`` VEDAI rows + the first ``shh_train`` SHH rows.
    - Test: all remaining VEDAI rows + the next ``shh_test`` SHH rows.
      SHH rows beyond ``shh_train + shh_test`` are left out of both splits.

    With the defaults the intended split is 2000 VEDAI + 1000 SHH for
    training and the rest of VEDAI (958) + 198 SHH for testing.

    Each split is shuffled again with ``seed`` and re-indexed from 0.

    Raises:
        AssertionError: if there are fewer than ``vedai_train`` VEDAI rows or
            fewer than ``shh_train + shh_test`` SHH rows.

    Returns:
        (train_df, test_df)
    """
    shuffled = shuffle(df, random_state=seed)

    by_label = {lbl: shuffled[shuffled['label'] == lbl] for lbl in (0, 1)}
    vedai_rows, shh_rows = by_label[0], by_label[1]

    assert len(vedai_rows) >= vedai_train
    assert len(shh_rows) >= shh_train + shh_test

    shh_stop = shh_train + shh_test
    train_parts = [vedai_rows.iloc[:vedai_train], shh_rows.iloc[:shh_train]]
    # Every VEDAI row not used for training goes to the test split.
    test_parts = [vedai_rows.iloc[vedai_train:], shh_rows.iloc[shh_train:shh_stop]]

    def _combine(parts):
        """Concatenate the per-class slices, then shuffle so the classes are interleaved."""
        merged = pd.concat(parts).reset_index(drop=True)
        return shuffle(merged, random_state=seed).reset_index(drop=True)

    return _combine(train_parts), _combine(test_parts)

In [54]:
def save_train_test(train_df, test_df, output_dir='.', suffix=''):
    """
    Write the train and test splits to CSV files inside ``output_dir``.

    Only the ``image`` and ``label`` columns are written, without the index.
    The files are named ``train_split{suffix}.csv`` and
    ``test_split{suffix}.csv``; with the default empty suffix these are
    ``train_split.csv`` and ``test_split.csv``.

    Args:
        train_df: DataFrame containing at least ``image`` and ``label`` columns.
        test_df: DataFrame containing at least ``image`` and ``label`` columns.
        output_dir: destination folder (str or Path); created if missing.
        suffix: text inserted before ``.csv``, e.g. ``"_x"`` to keep an
            earlier pair of split files from being overwritten.

    Raises:
        KeyError: if either frame lacks the ``image`` or ``label`` column.
    """
    # to_csv does not create missing folders, so make sure the target exists first.
    os.makedirs(output_dir, exist_ok=True)

    train_csv = os.path.join(output_dir, f'train_split{suffix}.csv')
    test_csv = os.path.join(output_dir, f'test_split{suffix}.csv')

    # Save only 'image' and 'label' columns (with relative paths)
    train_df[['image', 'label']].to_csv(train_csv, index=False)
    test_df[['image', 'label']].to_csv(test_csv, index=False)

    print(f"Train set saved as CSV : {train_csv} ({len(train_df)} samples)")
    print(f"Test set saved as CSV: {test_csv} ({len(test_df)} samples)")

In [46]:
# Build the train / test frames from the labelled table using sample_data's default sizes and seed.
train_df, test_df = sample_data(df)

In [50]:
# Write both splits as CSV files into OUTPUT_DIR.
save_train_test(train_df, test_df, output_dir=OUTPUT_DIR)

Train set saved as CSV : C:\Users\rsriram3\Documents\ind_study\data\train_split.csv (3000 samples)
Test set saved as CSV: C:\Users\rsriram3\Documents\ind_study\data\test_split.csv (1156 samples)


In [52]:
# Rewrite the literal substring '.png2.png' to '.png' in the image paths of both splits.
# regex=False makes this a plain-text replacement, applied wherever the substring occurs.
# NOTE(review): this overwrites the 'image' column of train_df / test_df in place, after they
# were first saved; only the path strings change — no file on disk is renamed here, so confirm
# the rewritten paths actually exist.
train_df['image'] = train_df['image'].str.replace('.png2.png', '.png', regex=False)
test_df['image'] = test_df['image'].str.replace('.png2.png', '.png', regex=False)

In [53]:
# Save the splits again after the path rewrite.
# NOTE(review): as written, this call targets train_split.csv / test_split.csv in OUTPUT_DIR
# (the same names as the first save), yet the recorded output below shows train_split_x.csv /
# test_split_x.csv — the output appears to come from a different version of save_train_test.
save_train_test(train_df, test_df, output_dir=OUTPUT_DIR)

Train set saved as CSV : C:\Users\rsriram3\Documents\ind_study\data\train_split_x.csv (3000 samples)
Test set saved as CSV: C:\Users\rsriram3\Documents\ind_study\data\test_split_x.csv (1156 samples)
