In [1]:
# cell 1==================================
# 0) Config
# ==================================
from pathlib import Path
import random, shutil,  tarfile, re,  warnings

# Silence DeprecationWarning noise for the rest of the kernel session.
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Reproducibility: seed Python's global RNG once, before anything shuffles.
SEED = 42
random.seed(SEED)

# Project root: the kernel's current working directory, resolved to an absolute path.
ROOT = Path(".").resolve()

# Path to images tarball in project root
TAR_PATH = ROOT / "images.tar"

# Candidate extraction locations (some tars have Images/, some just drop in root)
CANDIDATE_SOURCES = [
    ROOT / "Images",
    ROOT  # fallback if it extracts straight into root
]

# Chosen restricted breeds (normalized slugs that are routed to RESTRICTED_DIR)
RESTRICTED_SLUGS = {
    "rottweiler",
    "doberman",
    "german_shepherd",
    "rhodesian_ridgeback",
    "staffordshire_bullterrier",
}

# Destination category roots (will be created in project root)
RESTRICTED_DIR = ROOT / "restricted"
UNRESTRICTED_DIR = ROOT / "unrestricted"

# Split config (train/val/test)
SPLITS = ("train", "val", "test")
RATIOS = {"train": 0.70, "val": 0.15, "test": 0.15}

# Fail fast on a mis-edited split config instead of silently producing skewed splits.
assert set(RATIOS) == set(SPLITS), "RATIOS must define exactly one ratio per split"
assert abs(sum(RATIOS.values()) - 1.0) < 1e-9, "RATIOS must sum to 1.0"

# Valid image extensions (both lower- and upper-case spellings)
VALID_EXTS = {".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"}

print("Config loaded.")
print("Root:", ROOT)
print("Tar path:", TAR_PATH)
print("Restricted slugs:", sorted(RESTRICTED_SLUGS))


Config loaded.
Root: C:\Users\hp\ComputerVision
Tar path: C:\Users\hp\ComputerVision\images.tar
Restricted slugs: ['doberman', 'german_shepherd', 'rhodesian_ridgeback', 'rottweiler', 'staffordshire_bullterrier']


In [2]:
# cell 2==================================
# 1) Extract images.tar (idempotent)
# ==================================

def has_breed_folders():
    """
    Report whether any candidate source location already holds a breed folder.

    A breed folder is recognised purely by name: a directory whose name
    contains a '-' (e.g. 'n02107142-Doberman'). Candidate locations that do
    not exist are ignored.
    """
    existing_bases = (base for base in CANDIDATE_SOURCES if base.exists())
    return any(
        entry.is_dir() and "-" in entry.name
        for base in existing_bases
        for entry in base.iterdir()
    )

# Extract only when the archive is present and no breed folders exist yet,
# so re-running this cell does not unpack the archive a second time.
if TAR_PATH.exists() and not has_breed_folders():
    print(f"Extracting {TAR_PATH.name} into {ROOT} ...")
    with tarfile.open(TAR_PATH, "r:*") as tf:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: "data" rejects absolute paths,
            # '..' traversal and links that point outside the destination.
            tf.extractall(ROOT, filter="data")
        else:
            # Older interpreter without extraction filters: unfiltered extraction.
            # NOTE(review): only use with a trusted archive on such interpreters.
            tf.extractall(ROOT)
    print("Extraction complete.")
elif not TAR_PATH.exists():
    print("No images.tar found — assuming files are already extracted.")
else:
    print("Breed folders already present — skipping extraction.")


Breed folders already present — skipping extraction.


In [3]:
# cell 3==================================
# 2) Locate source breed folders & normalize names
# ==================================
import re

def normalize_breed_name(raw_name: str) -> str:
    """
    Turn a source folder name into a lowercase breed slug.

    Everything up to and including the first '-' is dropped
    ('n02106662-German_shepherd' -> 'german_shepherd'); names without a '-'
    are used whole. Surrounding whitespace is trimmed, spaces become
    underscores, and runs of underscores collapse to a single one.
    """
    _, dash, after_dash = raw_name.partition("-")
    breed = after_dash if dash else raw_name
    underscored = breed.strip().replace(" ", "_")
    return re.sub(r"_+", "_", underscored).lower()

def find_source_breed_dirs():
    """
    Locate the source breed folders.

    Candidate locations are tried in the order given by CANDIDATE_SOURCES and
    the first one that yields any breed folder wins; later entries are only
    fallbacks, so unrelated project directories are not picked up once the
    real source has been found.

    A sub-directory counts as a breed folder when its name contains a '-'
    (WordNet-style, e.g. 'n02106662-German_shepherd') or when it directly
    contains at least one image file (extension matched case-insensitively
    against VALID_EXTS).

    Returns:
        list of (Path, normalized_slug) tuples, sorted by path within the
        chosen location; empty if nothing was found.
    """
    found = []
    for base in CANDIDATE_SOURCES:
        if not base.exists():
            continue
        for d in sorted(p for p in base.iterdir() if p.is_dir()):
            looks_like_wn = "-" in d.name
            # Lower-case the suffix so mixed-case extensions such as '.Jpg' count too.
            has_images = any(f.suffix.lower() in VALID_EXTS for f in d.glob("*"))
            if looks_like_wn or has_images:
                found.append((d, normalize_breed_name(d.name)))
        if found:
            # Remaining candidates are fallbacks only.
            break
    return found

# Discover the breed folders once; later cells iterate over `source_dirs`.
source_dirs = find_source_breed_dirs()
print(f"Found {len(source_dirs)} candidate breed folders.")

# Preview the first few folder -> slug mappings as a sanity check.
preview = source_dirs[:8]
for rank, (folder, breed_slug) in enumerate(preview, start=1):
    print(f"{rank:>2}. {folder.name}  ->  {breed_slug}")


Found 120 candidate breed folders.
 1. n02085620-Chihuahua  ->  chihuahua
 2. n02085782-Japanese_spaniel  ->  japanese_spaniel
 3. n02085936-Maltese_dog  ->  maltese_dog
 4. n02086079-Pekinese  ->  pekinese
 5. n02086240-Shih-Tzu  ->  shih-tzu
 6. n02086646-Blenheim_spaniel  ->  blenheim_spaniel
 7. n02086910-papillon  ->  papillon
 8. n02087046-toy_terrier  ->  toy_terrier


In [4]:
# --- CELL 4 START ---
# 3) Build ./restricted and ./unrestricted (clean, fresh)
from collections import defaultdict

# 1) Start clean so reruns don't duplicate
# Wipe and recreate both category roots so each run starts from empty directories.
for category_dir in (RESTRICTED_DIR, UNRESTRICTED_DIR):
    already_there = category_dir.exists()
    if already_there:
        shutil.rmtree(category_dir)
    category_dir.mkdir(exist_ok=True, parents=True)

def ensure_dir(p: Path):
    """Create directory `p` together with any missing parents; do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

def copy_breed_images(src_dir: Path, slug: str, category_root: Path):
    """
    Copy all valid images from src_dir into category_root/slug
    with standardized names: slug_001.jpg, slug_002.jpg, ...

    Args:
        src_dir: folder holding the original images of one breed.
        slug: normalized breed name; used as the destination folder name and
            as the filename prefix.
        category_root: category directory under which the breed folder is created.

    Returns:
        Number of images copied.
    """
    dest_breed_dir = category_root / slug
    ensure_dir(dest_breed_dir)
    # Match extensions case-insensitively so files like 'x.Jpg' are not silently
    # dropped; sorting keeps the numbering stable between runs.
    images = sorted(
        p for p in src_dir.iterdir()
        if p.is_file() and p.suffix.lower() in VALID_EXTS
    )
    for idx, img in enumerate(images, start=1):
        new_name = f"{slug}_{idx:03d}{img.suffix.lower()}"  # at least 3-digit, zero-padded
        shutil.copy2(img, dest_breed_dir / new_name)
    return len(images)

moved_counts = defaultdict(int)  # (category, slug) -> count

# Route every source breed folder into its category and record how many files were copied.
for src_dir, slug in source_dirs:
    is_restricted = slug in RESTRICTED_SLUGS
    category_root = RESTRICTED_DIR if is_restricted else UNRESTRICTED_DIR
    moved_counts[(category_root.name, slug)] += copy_breed_images(src_dir, slug, category_root)

# ---- Report
tot_restricted = sum(n for key, n in moved_counts.items() if key[0] == "restricted")
tot_unrestricted = sum(n for key, n in moved_counts.items() if key[0] == "unrestricted")

print("Copy complete.")
print(f" - restricted images:   {tot_restricted}")
print(f" - unrestricted images: {tot_unrestricted}")

# Show a small sample (at most the first 20 breeds, in insertion order)
print("\nSample per-breed additions:")
for (cat, slug), cnt in list(moved_counts.items())[:20]:
    print(f"  {cat:<13} {slug:<28} +{cnt}")
# --- CELL 4 END ---


Copy complete.
 - restricted images:   781
 - unrestricted images: 19798

Sample per-breed additions:
  unrestricted  chihuahua                    +151
  unrestricted  japanese_spaniel             +185
  unrestricted  maltese_dog                  +252
  unrestricted  pekinese                     +149
  unrestricted  shih-tzu                     +214
  unrestricted  blenheim_spaniel             +188
  unrestricted  papillon                     +196
  unrestricted  toy_terrier                  +172
  restricted    rhodesian_ridgeback          +172
  unrestricted  afghan_hound                 +239
  unrestricted  basset                       +175
  unrestricted  beagle                       +195
  unrestricted  bloodhound                   +187
  unrestricted  bluetick                     +171
  unrestricted  black-and-tan_coonhound      +159
  unrestricted  walker_hound                 +153
  unrestricted  english_foxhound             +157
  unrestricted  redbone                      +14

In [14]:
# --- REWRITTEN CELL 5: ratio split for restricted breeds, fixed 5/1/1 (train/val/test) images per unrestricted breed ---

from pathlib import Path
import random, shutil
from collections import defaultdict

# Reproducibility
# Re-seed from the shared config constant instead of repeating the literal,
# so changing SEED in the config cell affects this cell as well.
random.seed(SEED)

# Destination directory for every (split, category) pair: ROOT/<split>/<category>.
# Derived from SPLITS so the split names are declared in exactly one place.
DEST = {
    split: {
        "restricted": ROOT / split / "restricted",
        "unrestricted": ROOT / split / "unrestricted",
    }
    for split in SPLITS
}

# Fixed number of images taken from each unrestricted breed, per split.
UNRESTRICTED_COUNTS = {"train": 5, "val": 1, "test": 1}

def reset_split_dirs():
    """Delete and recreate every directory listed in DEST so each one starts empty."""
    for per_category in DEST.values():
        for target in per_category.values():
            if target.exists():
                shutil.rmtree(target)
            target.mkdir(exist_ok=True, parents=True)

def gather_breed_dirs(cat_root: Path):
    """Return the immediate sub-directories of `cat_root` as a sorted list."""
    subdirs = []
    for entry in sorted(cat_root.iterdir()):
        if entry.is_dir():
            subdirs.append(entry)
    return subdirs

def split_counts(n):
    """
    Split a total of `n` items into (train, val, test) sizes.

    Train and val sizes are `n` times their RATIOS entry, truncated to an
    integer; test receives whatever is left, so the three sizes always add
    up to `n`.
    """
    n_train, n_val = (int(n * RATIOS[name]) for name in ("train", "val"))
    return n_train, n_val, n - n_train - n_val

def safe_copy(src: Path, dst_dir: Path, fname: str):
    """
    Copy `src` into `dst_dir` as `fname` without overwriting an existing file.

    If `dst_dir/fname` is taken, the first free name of the form
    '<stem>__<k><suffix>' with k = 2, 3, ... is used instead.

    Returns:
        The path the file was actually written to.
    """
    destination = dst_dir / fname
    base, ext = destination.stem, destination.suffix
    attempt = 2
    while destination.exists():
        destination = dst_dir / f"{base}__{attempt}{ext}"
        attempt += 1
    shutil.copy2(src, destination)
    return destination

def _sorted_breed_images(breed_dir: Path):
    """
    Return the image files directly inside `breed_dir`, sorted by path.

    Directory iteration order is arbitrary, so the list is sorted before the
    caller shuffles it; otherwise the seeded shuffle would not give the same
    split on every run.
    """
    return sorted(
        p for p in breed_dir.iterdir()
        if p.is_file() and p.suffix.lower() in VALID_EXTS
    )


def _place_image(src: Path, split: str, category: str, slug: str):
    """Copy one image into DEST[split][category] under the breed's next sequential name and update the tallies."""
    idx_map[category][slug] += 1
    new_name = f"{slug}_{idx_map[category][slug]:03d}{src.suffix.lower()}"
    safe_copy(src, DEST[split][category], new_name)
    summary[split][category] += 1
    per_breed_split[split][slug] += 1


reset_split_dirs()

# Running per-breed file index, kept separately for each category.
idx_map = {"restricted": defaultdict(int), "unrestricted": defaultdict(int)}
# Image totals per split and category (printed below).
summary = {s: {"restricted": 0, "unrestricted": 0} for s in SPLITS}
# Image totals per split and breed slug.
per_breed_split = {s: defaultdict(int) for s in SPLITS}

# --- RESTRICTED ---
# Every image of a restricted breed is used, divided by RATIOS via split_counts().
for breed_dir in gather_breed_dirs(RESTRICTED_DIR):
    slug = breed_dir.name
    images = _sorted_breed_images(breed_dir)
    if not images:
        continue
    random.shuffle(images)
    n = len(images)
    n_train, n_val, n_test = split_counts(n)
    splits = {
        "train": images[:n_train],
        "val":   images[n_train:n_train+n_val],
        "test":  images[n_train+n_val:],
    }
    for split, files in splits.items():
        for src in files:
            _place_image(src, split, "restricted", slug)

# --- UNRESTRICTED ---
# Each unrestricted breed contributes a fixed number of images per split.
required_per_breed = sum(UNRESTRICTED_COUNTS.values())
for breed_dir in gather_breed_dirs(UNRESTRICTED_DIR):
    slug = breed_dir.name
    images = _sorted_breed_images(breed_dir)
    if len(images) < required_per_breed:
        continue  # skip breeds with too few samples
    random.shuffle(images)
    offset = 0
    for split in SPLITS:
        count = UNRESTRICTED_COUNTS[split]
        for src in images[offset:offset + count]:
            _place_image(src, split, "unrestricted", slug)
        offset += count

print("Split complete.")
for split in SPLITS:
    r = summary[split]["restricted"]
    u = summary[split]["unrestricted"]
    print(f"{split:<5} | restricted: {r:5d} | unrestricted: {u:5d} | total: {r+u:5d}")


Split complete.
train | restricted:   545 | unrestricted:   575 | total:  1120
val   | restricted:   114 | unrestricted:   115 | total:   229
test  | restricted:   122 | unrestricted:   115 | total:   237


In [15]:
# --- CELL 6 START ---
# Balance summary + per-breed counts (parses breed from filenames)

import re
from collections import defaultdict

# Parse the breed slug from a filename such as "doberman_023.jpg":
# group `breed` is everything before the final "_<exactly 3 digits>.<ext>",
# group `idx` is those three digits. Names that do not fit this shape do not match.
BREED_RE = re.compile(r"^(?P<breed>.+)_(?P<idx>\d{3})\.[A-Za-z0-9]+$")

def list_images(p: Path):
    """Return the files directly inside `p` whose extension is .jpg, .jpeg or .png (case-insensitive)."""
    image_suffixes = {".jpg", ".jpeg", ".png"}
    matches = []
    for entry in p.glob("*"):
        if entry.is_file() and entry.suffix.lower() in image_suffixes:
            matches.append(entry)
    return matches

def count_per_breed(dir_path: Path):
    """
    Count the images in `dir_path` per breed, with the breed taken from the filename.

    Filenames matching BREED_RE use its `breed` group. Any other filename
    falls back to the stem with its last '_'-separated piece removed, or to
    the whole stem when it contains no underscore.

    Returns:
        A plain dict mapping breed -> number of images.
    """
    tally = defaultdict(int)
    for image in list_images(dir_path):
        matched = BREED_RE.match(image.name)
        if matched is not None:
            breed = matched.group("breed")
        else:
            # Fallback: strip the trailing index, whatever its padding length.
            head, underscore, _ = image.stem.rpartition("_")
            breed = head if underscore else image.stem
        tally[breed] += 1
    return dict(tally)

def print_table(rows, headers):
    """
    Print a simple fixed-width, left-aligned table to the console.

    Args:
        rows: sequence of rows, each indexable with one value per header.
            May be empty, in which case only the header and separator lines
            are printed.
        headers: sequence of column titles.
    """
    # Width of a column = longest of its header and all of its cell values.
    # The candidates are collected in a list so max() also works when `rows`
    # is empty (max() with a single int argument raises TypeError).
    col_w = [
        max([len(str(h)), *(len(str(r[i])) for r in rows)])
        for i, h in enumerate(headers)
    ]
    fmt = "  " + " | ".join("{:<" + str(w) + "}" for w in col_w)
    print(fmt.format(*headers))
    print("  " + "-+-".join("-" * w for w in col_w))
    for r in rows:
        print(fmt.format(*r))

# 1) Overall balance per split
print("=== Overall balance per split (restricted = positive, unrestricted = negative) ===")
overall_rows = []
for split in SPLITS:
    pos = len(list_images(DEST[split]["restricted"]))
    neg = len(list_images(DEST[split]["unrestricted"]))
    total = pos + neg
    if total:
        pct_pos, pct_neg = pos / total * 100, neg / total * 100
    else:
        # Empty split: report 0% for both classes rather than dividing by zero.
        pct_pos = pct_neg = 0
    ratio = "∞:0" if pos <= 0 else f"{neg}:{pos}"
    overall_rows.append([split, pos, neg, total, f"{pct_pos:5.1f}%", f"{pct_neg:5.1f}%", ratio])
print_table(overall_rows, ["split", "positives", "negatives", "total", "%pos", "%neg", "neg:pos"])

# 2) Per-breed counts within each split (restricted + unrestricted shown together)
for split in SPLITS:
    print(f"\n=== Per-breed counts in {split} ===")

    # breed -> count, separately for each category
    rb = count_per_breed(DEST[split]["restricted"])
    ub = count_per_breed(DEST[split]["unrestricted"])

    # combined totals; the keys give the full set of breeds seen in this split
    per_breed = defaultdict(int)
    for category_counts in (rb, ub):
        for breed, n in category_counts.items():
            per_breed[breed] += n

    # one row per breed, alphabetical, with the pos/neg breakdown next to the total
    rows = [
        [name, rb.get(name, 0), ub.get(name, 0), rb.get(name, 0) + ub.get(name, 0)]
        for name in sorted(per_breed)
    ]
    print_table(rows, ["breed_slug", "pos(restricted)", "neg(unrestricted)", "total"])

# --- CELL 6 END ---


=== Overall balance per split (restricted = positive, unrestricted = negative) ===
  split | positives | negatives | total | %pos   | %neg   | neg:pos
  ------+-----------+-----------+-------+--------+--------+--------
  train | 545       | 575       | 1120  |  48.7% |  51.3% | 575:545
  val   | 114       | 115       | 229   |  49.8% |  50.2% | 115:114
  test  | 122       | 115       | 237   |  51.5% |  48.5% | 115:122

=== Per-breed counts in train ===
  breed_slug                     | pos(restricted) | neg(unrestricted) | total
  -------------------------------+-----------------+-------------------+------
  affenpinscher                  | 0               | 5                 | 5    
  afghan_hound                   | 0               | 5                 | 5    
  african_hunting_dog            | 0               | 5                 | 5    
  airedale                       | 0               | 5                 | 5    
  american_staffordshire_terrier | 0               | 5              