In [1]:
# Mount Google Drive at /content/drive; every path configured in this notebook
# (images, JSON metadata, output folder) lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os, json, shutil
from pathlib import Path


In [13]:
# Source folder that holds the images to filter
IMG_DIR = Path("/content/drive/MyDrive/Data_photos/photos")

# JSON metadata file describing the posts
JSON_FILE = Path("/content/drive/MyDrive/sampled_posts.json")

# Destination folder for the images that belong to Reels
OUT_DIR = Path("/content/drive/MyDrive/Data_photos/Images_reels")

# content_type value that selects which posts to keep
TARGET_CT = "Reel"

In [5]:
def get_post_id_from_filename(fname: str) -> str | None:
    """Extract <post_id> from a filename like '<post_id>__anything.ext'"""
    base = os.path.basename(fname)
    if "__" not in base:
        return None
    return base.split("__", 1)[0]

In [11]:
# Read the post metadata. The encoding is given explicitly so the result does
# not depend on the runtime's locale default.
with open(JSON_FILE, "r", encoding="utf-8") as f:
    posts = json.load(f)

# Hoisted: the comparison target is the same for every record.
wanted_ct = TARGET_CT.lower()

# Collect the id of every post whose content_type matches TARGET_CT
# (case-insensitive, surrounding whitespace ignored). "post_id" is preferred,
# with "pk" as the fallback key. Ids are normalised to stripped strings so they
# can be compared with the prefixes taken from file names.
allowed_ids = {
    str(item.get("post_id") or item.get("pk") or "").strip()
    for item in posts
    if (item.get("content_type") or "").strip().lower() == wanted_ct
}
# A matching record with neither key collapses to ""; that is not a real id and
# would otherwise inflate the count reported below.
allowed_ids.discard("")

print(f"Found {len(allowed_ids)} post_ids with content_type == '{TARGET_CT}'")

Found 128 post_ids with content_type == 'Reel'


In [14]:

OUT_DIR.mkdir(parents=True, exist_ok=True)

# kept    -> files copied into OUT_DIR
# skipped -> file names without a usable '<post_id>__' prefix
# missing -> files whose post_id is not in allowed_ids (i.e. not Reels)
kept = skipped = missing = 0

# Nested folders are flattened into OUT_DIR, so two different source files can
# map to the same destination name. Remember which source claimed each name so
# a later file cannot silently overwrite an earlier one.
claimed = {}
collisions = []

# sorted() makes the traversal order (and so the winner of a name clash) deterministic.
for p in sorted(IMG_DIR.rglob("*")):   # includes nested folders if any
    if not p.is_file():
        continue

    # If OUT_DIR is ever placed under IMG_DIR, do not re-scan our own output:
    # copying a file onto itself raises shutil.SameFileError.
    if OUT_DIR in p.parents:
        continue

    pid = get_post_id_from_filename(p.name)
    if not pid:
        skipped += 1
        continue

    if pid not in allowed_ids:
        missing += 1
        continue

    if p.name in claimed:
        # Same file name already copied from another folder; keep the first one.
        collisions.append((p, claimed[p.name]))
        continue

    shutil.copy2(p, OUT_DIR / p.name)   # use shutil.move(...) to move instead of copy
    claimed[p.name] = p
    kept += 1

print("\n=== SUMMARY ===")
print(f"Kept (Reels only): {kept}")
print(f"Skipped (bad name / no '__'): {skipped}")
print(f"Not Reels: {missing}")
print(f"Output folder: {OUT_DIR}")

# Only reported when it happens, so the usual summary is unchanged.
if collisions:
    print(f"Name collisions (not copied, first file kept): {len(collisions)}")
    for duplicate, winner in collisions:
        print(f"  {duplicate} clashes with {winner}")



=== SUMMARY ===
Kept (Reels only): 124
Skipped (bad name / no '__'): 0
Not Reels: 6747
Output folder: /content/drive/MyDrive/Data_photos/Images_reels
