In [1]:
#!/usr/bin/env python3
"""
Re-organise .npy feature files so they sit alongside the matching
category / sub-category of their original JPEG image.

Source layout
└─ shoe_features_cpu/              ← *.npy files (e.g. abc.npy)
└─ shoes/
   ├─ <Category>/                  ← 4 top-level groups
   │   └─ <Subcategory>/           ← e.g. “AnkleBoots”
   │       └─ <Brand>/             ← many brand folders
   │           └─ abc.jpg

Target layout (created by this script)
└─ shoe_features_by_subcat/
   └─ <Category>/
       └─ <Subcategory>/
           └─ abc.npy
"""

from pathlib import Path
import shutil
import collections

# ── CONFIG ──────────────────────────────────────────────────────────────
SRC_NPY_DIR   = Path("shoe_features_cpu")
SRC_IMG_ROOT  = Path("shoes")
DST_ROOT      = Path("shoe_features_by_subcat")  # change if you like
# ────────────────────────────────────────────────────────────────────────

def build_image_index(img_root: Path):
    """
    Walk the shoes/ tree and map each image stem (abc) → relative path
    <Category>/<Subcategory>.
    """
    index = {}
    dups  = collections.defaultdict(list)

    for img in img_root.rglob("*.jpg"):
        stem = img.stem.lower()
        try:
            category, subcat = img.relative_to(img_root).parts[:2]
        except ValueError:
            # Not deep enough (shouldn’t happen, but be safe)
            continue
        rel_path = Path(category) / subcat
        if stem in index:
            # Same filename appears in multiple subcats – remember both
            dups[stem].append(rel_path)
        else:
            index[stem] = rel_path
    if dups:
        print(f"[WARN] {len(dups)} duplicate basenames found; "
              f"first occurrence wins.   Examples: {list(dups.items())[:3]}")
    return index

def main():
    img_index = build_image_index(SRC_IMG_ROOT)

    total   = 0
    matched = 0
    missing = []

    for npy in SRC_NPY_DIR.glob("*.npy"):
        total += 1
        stem = npy.stem.lower()
        rel  = img_index.get(stem)
        if rel is None:
            missing.append(npy.name)
            continue

        dest_dir = DST_ROOT / rel
        dest_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy2(npy, dest_dir / npy.name)
        matched += 1

        # lightweight progress indicator
        if matched % 1000 == 0:
            print(f"  …{matched} files placed")

    print(f"\nDone!  Matched {matched}/{total} npys "
          f"({len(missing)} unmatched).")
    if missing:
        print("Unmatched examples:", missing[:10])

if __name__ == "__main__":
    main()


[WARN] 35 duplicate basenames found; first occurrence wins.   Examples: [('7990671.30937', [WindowsPath('Shoes/Heels')]), ('7990672.2078', [WindowsPath('Shoes/Heels')]), ('7990680.139', [WindowsPath('Shoes/Heels')])]
  …1000 files placed
  …2000 files placed
  …3000 files placed
  …4000 files placed
  …5000 files placed
  …6000 files placed
  …7000 files placed
  …8000 files placed
  …9000 files placed
  …10000 files placed
  …11000 files placed
  …12000 files placed
  …13000 files placed
  …14000 files placed
  …15000 files placed
  …16000 files placed
  …17000 files placed
  …18000 files placed
  …19000 files placed
  …20000 files placed
  …21000 files placed
  …22000 files placed
  …23000 files placed
  …24000 files placed
  …25000 files placed
  …26000 files placed
  …27000 files placed
  …28000 files placed
  …29000 files placed
  …30000 files placed
  …31000 files placed
  …32000 files placed
  …33000 files placed
  …34000 files placed
  …35000 files placed
  …36000 files placed


In [4]:
import os
import shutil
import json
from pathlib import Path

def flatten_brands(shoes_root):
    """
    shoes_root/
      └── <category>/
          └── <subcategory>/
              └── <brand>/
                  └── image.jpg
      
    After running, images live in:
      shoes_root/<category>/<subcategory>/<category>_<subcategory>_image.jpg
    and brand folders are removed.
    
    Returns a dict: { "category_subcategory_image.jpg": "brand", ... }
    """
    shoes_root = Path(shoes_root)
    mapping = {}

    # iterate categories
    for cat_dir in shoes_root.iterdir():
        if not cat_dir.is_dir():
            continue
        # iterate subcategories
        for sub_dir in cat_dir.iterdir():
            if not sub_dir.is_dir():
                continue
            # iterate brands
            for brand_dir in sub_dir.iterdir():
                if not brand_dir.is_dir():
                    continue
                brand = brand_dir.name
                # move each image up one level
                for img_path in brand_dir.iterdir():
                    if img_path.suffix.lower() not in {".jpg", ".jpeg", ".png"}:
                        continue
                    new_json_name = f"{cat_dir.name}_{sub_dir.name}_{img_path.name}"
                    new_name = f"{img_path.name}"
                    dest_path = sub_dir / new_name
                    # if collision, you can choose to overwrite or skip
                    if dest_path.exists():
                        print(f"⚠️ {dest_path.name} already exists—skipping")
                        continue
                    shutil.move(str(img_path), str(dest_path))
                    mapping[new_json_name] = brand

                # try to remove empty brand folder
                try:
                    brand_dir.rmdir()
                except OSError:
                    print(f"⚠️ Could not remove {brand_dir}, it may not be empty")

    return mapping

if __name__ == "__main__":
    shoes_dir = "shoes"      # ← change this to your actual path
    brand_map = flatten_brands(shoes_dir)

    # print out the mapping
    print("Mapping of new filenames → brand:")
    for fname, brand in brand_map.items():
        print(f"  {fname}  :  {brand}")

    # optionally save to JSON
    with open("brand_mapping.json", "w", encoding="utf-8") as f:
        json.dump(brand_map, f, indent=2, ensure_ascii=False)

    print("\nDone! All brand folders removed, images flattened, and mapping saved.")


⚠️ 7990671.30937.jpg already exists—skipping
⚠️ 7990672.2078.jpg already exists—skipping
⚠️ 7990680.139.jpg already exists—skipping
⚠️ 7990680.149.jpg already exists—skipping
⚠️ 8042667.151.jpg already exists—skipping
⚠️ 8042667.376834.jpg already exists—skipping
⚠️ 8042667.43903.jpg already exists—skipping
⚠️ 8042676.101867.jpg already exists—skipping
⚠️ 8042676.3.jpg already exists—skipping
⚠️ 8042676.568.jpg already exists—skipping
⚠️ 8042679.151.jpg already exists—skipping
⚠️ 8042679.4836.jpg already exists—skipping
⚠️ 8042696.147.jpg already exists—skipping
⚠️ 8042696.151.jpg already exists—skipping
⚠️ 8042697.11.jpg already exists—skipping
⚠️ 8042702.151.jpg already exists—skipping
⚠️ 8042707.151.jpg already exists—skipping
⚠️ 8042707.249058.jpg already exists—skipping
⚠️ 8042709.358984.jpg already exists—skipping
⚠️ 8062030.183092.jpg already exists—skipping
⚠️ 8062030.19429.jpg already exists—skipping
⚠️ 8062031.151.jpg already exists—skipping
⚠️ 8062031.742.jpg already exists—