From c6d16ed0ca980faa0bfa540cf81fc454a7d5482a Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Sat, 17 May 2025 22:49:52 +0200
Subject: [PATCH 01/10] e621_tagger: mark tagged images as organized, add e621_tag_failed tag

---
 plugins/e621_tagger/e621_tagger.py | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/plugins/e621_tagger/e621_tagger.py b/plugins/e621_tagger/e621_tagger.py
index 4b66bdb4..d0aa528d 100644
--- a/plugins/e621_tagger/e621_tagger.py
+++ b/plugins/e621_tagger/e621_tagger.py
@@ -54,6 +54,10 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
     if any(tag["name"] == "e621_tagged" for tag in image.get("tags", [])):
         return
 
+
+    if any(tag["name"] == "e621_tag_failed" for tag in image.get("tags", [])):
+        return
+
     try:
         time.sleep(2)  # Rate limiting
         response = requests.get(
@@ -64,7 +68,18 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
         response.raise_for_status()
         post_data = response.json().get("post", {})
     except Exception as e:
-        log.error(f"e621 API error: {str(e)}")
+        log.error(f"Marking as failed. e621 API error: {str(e)}")
+        e621_tag_failed_tag = get_or_create_tag(stash, "e621_tag_failed")
+        image_tags_on_e621_fail_ids = [e621_tag_failed_tag["id"]]
+
+        for tag in image.get("tags"):
+            image_tags_on_e621_fail_ids.append(tag["id"])
+       
+        stash.update_image({
+            "id": image_id,
+            "tag_ids": list(set(image_tags_on_e621_fail_ids))
+        })
+
         return
 
     if not post_data:
@@ -104,6 +119,7 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
     try:
         stash.update_image({
             "id": image_id,
+            "organized": True,
             "urls": [post_url],
             "tag_ids": list(set(tag_ids)),
             "studio_id": studio_id,
@@ -115,6 +131,7 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
         log.error(f"Update failed: {str(e)}")
 
 
+
 def get_or_create_tag(stash: StashInterface, tag_name: str) -> dict:
     """Find or create tag with hierarchy handling"""
     # Validate tag name
@@ -205,12 +222,13 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
 # Plugin setup and execution
 # In the main execution block:
 if __name__ == "__main__":
+    log.info("Starting tagger...")
     json_input = json.loads(sys.stdin.read())
     stash = StashInterface(json_input["server_connection"])
 
     config = stash.get_configuration().get("plugins", {})
     settings = {
-        "SkipTags": "e621_tagged",  # Add automatic filtering
+        "SkipTags": "e621_tagged, e621_tag_failed",  # Add automatic filtering
         "ExcludeOrganized": False
     }
     settings.update(config.get("e621_tagger", {}))
@@ -218,13 +236,17 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
     log.info(settings)
 
     # Get e621_tagged ID for filtering
-    e621_tag = get_or_create_tag(stash, "e621_tagged")
+    e621_tagged_tag = get_or_create_tag(stash, "e621_tagged")
+    e621_tag_failed_tag = get_or_create_tag(stash, "e621_tag_failed")
 
     # Existing tags + automatic e621_tagged exclusion
     skip_tags = [t.strip() for t in settings["SkipTags"].split(",") if t.strip()]
-    skip_tags.append(e621_tag["id"])  # Filter by ID instead of name
+    skip_tags.append(e621_tagged_tag["id"])  # Filter by ID instead of name
+    skip_tags.append(e621_tag_failed_tag["id"])  # Filter by ID instead of name
 
+    log.info("Getting images...")
     images = get_all_images(stash, skip_tags, settings["ExcludeOrganized"])
+    log.info(f"Got ${str(len(images))} images");
 
     # Rest of the loop remains the same
     for i, image in enumerate(images, 1):

From fe0031b77a0c3143c5e047adf614e3506ab7b3ad Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Sat, 17 May 2025 23:18:02 +0200
Subject: [PATCH 02/10] prevent OOM on big databases

---
 plugins/e621_tagger/e621_tagger.py | 159 ++++++++++++-----------------
 1 file changed, 63 insertions(+), 96 deletions(-)

diff --git a/plugins/e621_tagger/e621_tagger.py b/plugins/e621_tagger/e621_tagger.py
index d0aa528d..f5bd37b7 100644
--- a/plugins/e621_tagger/e621_tagger.py
+++ b/plugins/e621_tagger/e621_tagger.py
@@ -8,58 +8,61 @@
 from stashapi.stashapp import StashInterface
 
 
-
 def get_all_images(
-    client: StashInterface, 
-    skip_tags: list[str], 
-    exclude_organized: bool
+    client: StashInterface,
+    skip_tags: list[int],
+    exclude_organized: bool,
+    per_page: int = 100,
 ) -> list[dict]:
     """
-    Get all images with proper tag exclusion and organization filter
+    Generator to fetch images in pages from the stash API.
     """
-    image_filter = {}
-    pagination = {
-        "page": 1,
-        "per_page": -1,  # -1 gets all results at once
-        "sort": "created_at",
-        "direction": "ASC",
-    }
-
-    # Convert tag names to IDs
-    tag_ids = []
-    for tag_name in skip_tags:
-        tag = get_or_create_tag(client, tag_name)
-        if tag:
-            tag_ids.append(tag["id"])
-
-    if tag_ids:
-        image_filter["tags"] = {
-            "value": [],
-            "excludes": tag_ids,
-            "modifier": "INCLUDES_ALL",
-            "depth": -1,
+    page = 1
+    while True:
+        image_filter = {}
+        pagination = {
+            "page": page,
+            "per_page": per_page,
+            "sort": "created_at",
+            "direction": "ASC",
         }
 
-    if exclude_organized:
-        image_filter["organized"] = False  # Correct field name
+        if skip_tags:
+            image_filter["tags"] = {
+                "value": [],
+                "excludes": skip_tags,
+                "modifier": "INCLUDES_ALL",
+                "depth": -1,
+            }
+
+        if exclude_organized:
+            image_filter["organized"] = False
+
+        images = client.find_images(f=image_filter, filter=pagination)
+        if not images:
+            # no more pages
+            break
+
+        log.info(f"Fetched page {page} with {len(images)} images")
+        for img in images:
+            yield img
 
-    # Maintain original parameter structure
-    return client.find_images(f=image_filter, filter=pagination)
+        # move to next page
+        page += 1
 
 
 def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> None:
     """Process e621 metadata and update Stash records"""
-    # Skip already processed images
+    # same as before...
     image = stash.find_image(image_id)
-    if any(tag["name"] == "e621_tagged" for tag in image.get("tags", [])):
+    if any(t["name"] == "e621_tagged" for t in image.get("tags", [])):
         return
 
-
-    if any(tag["name"] == "e621_tag_failed" for tag in image.get("tags", [])):
+    if any(t["name"] == "e621_tag_failed" for t in image.get("tags", [])):
         return
 
     try:
-        time.sleep(2)  # Rate limiting
+        time.sleep(0.5)
         response = requests.get(
             f"https://e621.net/posts.json?md5={image_md5}",
             headers={"User-Agent": "Stash-e621-Tagger/1.0"},
@@ -69,53 +72,38 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
         post_data = response.json().get("post", {})
     except Exception as e:
         log.error(f"Marking as failed. e621 API error: {str(e)}")
-        e621_tag_failed_tag = get_or_create_tag(stash, "e621_tag_failed")
-        image_tags_on_e621_fail_ids = [e621_tag_failed_tag["id"]]
-
-        for tag in image.get("tags"):
-            image_tags_on_e621_fail_ids.append(tag["id"])
-       
-        stash.update_image({
-            "id": image_id,
-            "tag_ids": list(set(image_tags_on_e621_fail_ids))
-        })
-
+        e621_tag_failed = get_or_create_tag(stash, "e621_tag_failed")
+        fail_ids = [e621_tag_failed["id"]] + [t["id"] for t in image.get("tags", [])]
+        stash.update_image({"id": image_id, "tag_ids": list(set(fail_ids))})
         return
 
     if not post_data:
         return
 
-    # Create essential entities
     e621_tag = get_or_create_tag(stash, "e621_tagged")
     post_url = f"https://e621.net/posts/{post_data['id']}"
 
-    # Process tags
     tag_ids = [e621_tag["id"]]
-    for category in ["general", "species", "character", "artist", "copyright"]:
-        for tag in post_data.get("tags", {}).get(category, []):
-            # Clean and validate tag
+    for cat in ["general", "species", "character", "artist", "copyright"]:
+        for tag in post_data.get("tags", {}).get(cat, []):
             clean_tag = tag.strip()
             if not clean_tag:
                 continue
-            
             stash_tag = get_or_create_tag(stash, clean_tag)
             if stash_tag:
                 tag_ids.append(stash_tag["id"])
 
-    # Process studio
     studio_id = None
     if artists := post_data.get("tags", {}).get("artist"):
         studio = get_or_create_studio(stash, artists[0])
         studio_id = studio["id"]
 
-    # Process performers
     performer_ids = []
-    for char_tag in post_data.get("tags", {}).get("character", []):
-        performer_name = char_tag.split('_(')[0]
-        performer = get_or_create_performer(stash, performer_name)
-        performer_ids.append(performer["id"])
+    for char in post_data.get("tags", {}).get("character", []):
+        name = char.split('_(')[0]
+        perf = get_or_create_performer(stash, name)
+        performer_ids.append(perf["id"])
 
-    # Update image
     try:
         stash.update_image({
             "id": image_id,
@@ -125,13 +113,11 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
             "studio_id": studio_id,
             "performer_ids": performer_ids
         })
-
-        log.info("Image updated: ${image_id}")
+        log.info(f"Image updated: {image_id}")
     except Exception as e:
         log.error(f"Update failed: {str(e)}")
 
 
-
 def get_or_create_tag(stash: StashInterface, tag_name: str) -> dict:
     """Find or create tag with hierarchy handling"""
     # Validate tag name
@@ -183,6 +169,7 @@ def get_or_create_performer(stash: StashInterface, name: str) -> dict:
 
 def scrape_image(client: StashInterface, image_id: str) -> None:
     """Main scraping handler"""
+    # same logic as before for MD5 extraction and process_e621_post call
     image = client.find_image(image_id)
     if not image or not image.get("visual_files"):
         return
@@ -190,70 +177,50 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
     file_data = image["visual_files"][0]
     filename = file_data["basename"]
     filename_md5 = filename.split('.')[0]
-    final_md5 = None
 
-    # First try filename-based MD5
     if re.match(r"^[a-f0-9]{32}$", filename_md5):
         final_md5 = filename_md5
         log.info(f"Using filename MD5: {final_md5}")
     else:
-        # Fallback to content-based MD5
         try:
-            file_path = file_data["path"]
-            log.info(f"Generating MD5 from file content: {file_path}")
-            
             md5_hash = hashlib.md5()
-            with open(file_path, "rb") as f:
-                # Read file in 64kb chunks for memory efficiency
+            with open(file_data["path"], "rb") as f:
                 for chunk in iter(lambda: f.read(65536), b""):
                     md5_hash.update(chunk)
-            
             final_md5 = md5_hash.hexdigest()
             log.info(f"Generated content MD5: {final_md5}")
         except Exception as e:
             log.error(f"Failed to generate MD5: {str(e)}")
             return
 
-    if final_md5:
-        process_e621_post(client, image_id, final_md5)
-    else:
-        log.warning("No valid MD5 available for processing")
+    process_e621_post(client, image_id, final_md5)
+
 
-# Plugin setup and execution
-# In the main execution block:
 if __name__ == "__main__":
-    log.info("Starting tagger...")
+    log.info("Starting tagger with pagination...")
     json_input = json.loads(sys.stdin.read())
     stash = StashInterface(json_input["server_connection"])
 
     config = stash.get_configuration().get("plugins", {})
     settings = {
-        "SkipTags": "e621_tagged, e621_tag_failed",  # Add automatic filtering
+        "SkipTags": "e621_tagged, e621_tag_failed",
         "ExcludeOrganized": False
     }
     settings.update(config.get("e621_tagger", {}))
 
-    log.info(settings)
+    e621_tagged = get_or_create_tag(stash, "e621_tagged")
+    e621_failed = get_or_create_tag(stash, "e621_tag_failed")
 
-    # Get e621_tagged ID for filtering
-    e621_tagged_tag = get_or_create_tag(stash, "e621_tagged")
-    e621_tag_failed_tag = get_or_create_tag(stash, "e621_tag_failed")
-
-    # Existing tags + automatic e621_tagged exclusion
     skip_tags = [t.strip() for t in settings["SkipTags"].split(",") if t.strip()]
-    skip_tags.append(e621_tagged_tag["id"])  # Filter by ID instead of name
-    skip_tags.append(e621_tag_failed_tag["id"])  # Filter by ID instead of name
-
-    log.info("Getting images...")
-    images = get_all_images(stash, skip_tags, settings["ExcludeOrganized"])
-    log.info(f"Got ${str(len(images))} images");
+    skip_tags = [st for st in skip_tags]
+    skip_tags.extend([e621_tagged["id"], e621_failed["id"]])
 
-    # Rest of the loop remains the same
-    for i, image in enumerate(images, 1):
-        image_tag_names = [tag["name"] for tag in image.get("tags", [])]
-        if any(tag in image_tag_names for tag in skip_tags):
+    log.info("Fetching images in pages...")
+    for idx, image in enumerate(get_all_images(stash, skip_tags, settings["ExcludeOrganized"], per_page=100), start=1):
+        current_tags = [t["name"] for t in image.get("tags", [])]
+        if any(t in current_tags for t in skip_tags):
             log.info(f"Skipping image {image['id']} - contains skip tag")
             continue
 
-        log.progress(i/len(images))
+        log.progress(idx)
         scrape_image(stash, image["id"])

From 4d6bae61140ccc1089d24bcaeb46f24a185cc3ed Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Sat, 17 May 2025 23:22:23 +0200
Subject: [PATCH 03/10] e621_tagger: bump plugin version to 0.2

---
 plugins/e621_tagger/e621_tagger.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/e621_tagger/e621_tagger.yml b/plugins/e621_tagger/e621_tagger.yml
index 3ad01863..9b8ecc28 100644
--- a/plugins/e621_tagger/e621_tagger.yml
+++ b/plugins/e621_tagger/e621_tagger.yml
@@ -1,6 +1,6 @@
 name: e621_tagger
 description: Finding images and videos on e621 and tagging them.
-version: 0.1
+version: 0.2
 url: https://github.com/stashapp/CommunityScripts/
 exec:
   - python

From facaf75bb6002ab16508fe5a743f5b406cf17281 Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Thu, 11 Sep 2025 21:01:52 +0200
Subject: [PATCH 04/10] e621_tagger - fixed pagination and progress bar

---
 plugins/e621_tagger/e621_tagger.py | 103 ++++++++++++++++-------------
 1 file changed, 56 insertions(+), 47 deletions(-)

diff --git a/plugins/e621_tagger/e621_tagger.py b/plugins/e621_tagger/e621_tagger.py
index f5bd37b7..18754433 100644
--- a/plugins/e621_tagger/e621_tagger.py
+++ b/plugins/e621_tagger/e621_tagger.py
@@ -10,14 +10,15 @@
 
 def get_all_images(
     client: StashInterface,
-    skip_tags: list[int],
+    skip_tag_ids: list[int],
     exclude_organized: bool,
     per_page: int = 100,
 ) -> list[dict]:
     """
-    Generator to fetch images in pages from the stash API.
+    Fetch all images (returns a stable list snapshot). Uses numeric tag IDs in skip_tag_ids.
     """
     page = 1
+    all_images = []
     while True:
         image_filter = {}
         pagination = {
@@ -27,10 +28,10 @@ def get_all_images(
             "direction": "ASC",
         }
 
-        if skip_tags:
+        if skip_tag_ids:
             image_filter["tags"] = {
                 "value": [],
-                "excludes": skip_tags,
+                "excludes": skip_tag_ids,
                 "modifier": "INCLUDES_ALL",
                 "depth": -1,
             }
@@ -40,20 +41,16 @@ def get_all_images(
 
         images = client.find_images(f=image_filter, filter=pagination)
         if not images:
-            # no more pages
             break
 
         log.info(f"Fetched page {page} with {len(images)} images")
-        for img in images:
-            yield img
-
-        # move to next page
+        all_images.extend(images)
         page += 1
 
+    return all_images
+
 
 def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> None:
-    """Process e621 metadata and update Stash records"""
-    # same as before...
     image = stash.find_image(image_id)
     if any(t["name"] == "e621_tagged" for t in image.get("tags", [])):
         return
@@ -66,7 +63,7 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
         response = requests.get(
             f"https://e621.net/posts.json?md5={image_md5}",
             headers={"User-Agent": "Stash-e621-Tagger/1.0"},
-            timeout=10
+            timeout=10,
         )
         response.raise_for_status()
         post_data = response.json().get("post", {})
@@ -100,27 +97,27 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
 
     performer_ids = []
     for char in post_data.get("tags", {}).get("character", []):
-        name = char.split('_(')[0]
+        name = char.split("_(")[0]
         perf = get_or_create_performer(stash, name)
         performer_ids.append(perf["id"])
 
     try:
-        stash.update_image({
-            "id": image_id,
-            "organized": True,
-            "urls": [post_url],
-            "tag_ids": list(set(tag_ids)),
-            "studio_id": studio_id,
-            "performer_ids": performer_ids
-        })
+        stash.update_image(
+            {
+                "id": image_id,
+                "organized": True,
+                "urls": [post_url],
+                "tag_ids": list(set(tag_ids)),
+                "studio_id": studio_id,
+                "performer_ids": performer_ids,
+            }
+        )
         log.info(f"Image updated: {image_id}")
     except Exception as e:
         log.error(f"Update failed: {str(e)}")
 
 
 def get_or_create_tag(stash: StashInterface, tag_name: str) -> dict:
-    """Find or create tag with hierarchy handling"""
-    # Validate tag name
     tag_name = tag_name.strip()
     if not tag_name:
         log.error("Attempted to create tag with empty name")
@@ -129,15 +126,17 @@ def get_or_create_tag(stash: StashInterface, tag_name: str) -> dict:
     existing = stash.find_tags(f={"name": {"value": tag_name, "modifier": "EQUALS"}})
     if existing:
         return existing[0]
-    
+
     parts = tag_name.split(":")
     parent_id = None
     for i in range(len(parts)):
-        current_name = ":".join(parts[:i+1]).strip()
+        current_name = ":".join(parts[: i + 1]).strip()
         if not current_name:
             continue
-            
-        existing = stash.find_tags(f={"name": {"value": current_name, "modifier": "EQUALS"}})
+
+        existing = stash.find_tags(
+            f={"name": {"value": current_name, "modifier": "EQUALS"}}
+        )
         if not existing:
             create_data = {"name": current_name}
             if parent_id:
@@ -155,28 +154,27 @@ def get_or_create_tag(stash: StashInterface, tag_name: str) -> dict:
             parent_id = existing[0]["id"]
     return {"id": parent_id}
 
+
 def get_or_create_studio(stash: StashInterface, name: str) -> dict:
-    """Find or create studio"""
     studios = stash.find_studios(f={"name": {"value": name, "modifier": "EQUALS"}})
     return studios[0] if studios else stash.create_studio({"name": name})
 
 
 def get_or_create_performer(stash: StashInterface, name: str) -> dict:
-    """Find or create performer"""
-    performers = stash.find_performers(f={"name": {"value": name, "modifier": "EQUALS"}})
+    performers = stash.find_performers(
+        f={"name": {"value": name, "modifier": "EQUALS"}}
+    )
     return performers[0] if performers else stash.create_performer({"name": name})
 
 
 def scrape_image(client: StashInterface, image_id: str) -> None:
-    """Main scraping handler"""
-    # same logic as before for MD5 extraction and process_e621_post call
     image = client.find_image(image_id)
     if not image or not image.get("visual_files"):
         return
 
     file_data = image["visual_files"][0]
     filename = file_data["basename"]
-    filename_md5 = filename.split('.')[0]
+    filename_md5 = filename.split(".")[0]
 
     if re.match(r"^[a-f0-9]{32}$", filename_md5):
         final_md5 = filename_md5
@@ -197,30 +195,41 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
 
 
 if __name__ == "__main__":
-    log.info("Starting tagger with pagination...")
+    log.info("Starting tagger with stable pagination snapshot...")
     json_input = json.loads(sys.stdin.read())
     stash = StashInterface(json_input["server_connection"])
 
     config = stash.get_configuration().get("plugins", {})
-    settings = {
-        "SkipTags": "e621_tagged, e621_tag_failed",
-        "ExcludeOrganized": False
-    }
+    settings = {"SkipTags": "e621_tagged, e621_tag_failed", "ExcludeOrganized": False}
     settings.update(config.get("e621_tagger", {}))
 
+    # ensure e621 tags exist and get their ids
     e621_tagged = get_or_create_tag(stash, "e621_tagged")
     e621_failed = get_or_create_tag(stash, "e621_tag_failed")
 
-    skip_tags = [t.strip() for t in settings["SkipTags"].split(",") if t.strip()]
-    skip_tags = [st for st in skip_tags]
-    skip_tags.extend([e621_tagged["id"], e621_failed["id"]])
-
-    log.info("Fetching images in pages...")
-    for idx, image in enumerate(get_all_images(stash, skip_tags, settings["ExcludeOrganized"], per_page=100), start=1):
-        current_tags = [t["name"] for t in image.get("tags", [])]
-        if any(t in current_tags for t in skip_tags):
+    # resolve skip tag NAMES from settings to tag IDs
+    skip_tag_names = [n.strip() for n in settings["SkipTags"].split(",") if n.strip()]
+    skip_tag_ids = []
+    for name in skip_tag_names:
+        found = stash.find_tags(f={"name": {"value": name, "modifier": "EQUALS"}})
+        if found:
+            skip_tag_ids.append(found[0]["id"])
+    # always include the internal e621 tags (ensure ints)
+    skip_tag_ids.extend([e621_tagged["id"], e621_failed["id"]])
+
+    log.info("Fetching images in pages (stable snapshot)...")
+    images = get_all_images(
+        stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=10
+    )
+    total = len(images) or 1
+
+    for idx, image in enumerate(images, start=1):
+        progress = idx / total
+        log.progress(progress)
+
+        current_tag_ids = [t["id"] for t in image.get("tags", [])]
+        if any(tid in current_tag_ids for tid in skip_tag_ids):
             log.info(f"Skipping image {image['id']} - contains skip tag")
             continue
 
-        log.progress(idx)
         scrape_image(stash, image["id"])

From c3639cc1cfd14a7604f5ed220246ff533737cecc Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Thu, 11 Sep 2025 21:14:12 +0200
Subject: [PATCH 05/10] e621_tagger - add scene support

---
 plugins/e621_tagger/e621_tagger.py | 251 +++++++++++++++++++++++------
 1 file changed, 201 insertions(+), 50 deletions(-)

diff --git a/plugins/e621_tagger/e621_tagger.py b/plugins/e621_tagger/e621_tagger.py
index 18754433..d80e1a83 100644
--- a/plugins/e621_tagger/e621_tagger.py
+++ b/plugins/e621_tagger/e621_tagger.py
@@ -6,17 +6,18 @@
 import requests
 import stashapi.log as log
 from stashapi.stashapp import StashInterface
+from typing import List
+
+
+MD5_RE = re.compile(r"^[a-f0-9]{32}$")
 
 
 def get_all_images(
     client: StashInterface,
-    skip_tag_ids: list[int],
+    skip_tag_ids: List[int],
     exclude_organized: bool,
     per_page: int = 100,
-) -> list[dict]:
-    """
-    Fetch all images (returns a stable list snapshot). Uses numeric tag IDs in skip_tag_ids.
-    """
+) -> List[dict]:
     page = 1
     all_images = []
     while True:
@@ -43,25 +44,80 @@ def get_all_images(
         if not images:
             break
 
-        log.info(f"Fetched page {page} with {len(images)} images")
+        log.info(f"Fetched image page {page} with {len(images)} images")
         all_images.extend(images)
         page += 1
 
     return all_images
 
 
-def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> None:
-    image = stash.find_image(image_id)
-    if any(t["name"] == "e621_tagged" for t in image.get("tags", [])):
-        return
+def get_all_scenes(
+    client: StashInterface,
+    skip_tag_ids: List[int],
+    exclude_organized: bool,
+    per_page: int = 100,
+) -> List[dict]:
+    page = 1
+    all_scenes = []
+    while True:
+        scene_filter = {}
+        pagination = {
+            "page": page,
+            "per_page": per_page,
+            "sort": "created_at",
+            "direction": "ASC",
+        }
+
+        if skip_tag_ids:
+            scene_filter["tags"] = {
+                "value": [],
+                "excludes": skip_tag_ids,
+                "modifier": "INCLUDES_ALL",
+                "depth": -1,
+            }
+
+        if exclude_organized:
+            scene_filter["organized"] = False
+
+        scenes = client.find_scenes(f=scene_filter, filter=pagination)
+        if not scenes:
+            break
+
+        log.info(f"Fetched scene page {page} with {len(scenes)} scenes")
+        all_scenes.extend(scenes)
+        page += 1
+
+    return all_scenes
+
 
-    if any(t["name"] == "e621_tag_failed" for t in image.get("tags", [])):
+def process_e621_post_for_item(
+    stash: StashInterface, item_type: str, item_id: str, item_md5: str
+) -> None:
+    """
+    item_type: "image" or "scene"
+    Update the corresponding object on success, or tag as failed on API error.
+    """
+    # Fetch latest object to check tags
+    if item_type == "image":
+        obj = stash.find_image(item_id)
+        already_tagged = any(t["name"] == "e621_tagged" for t in obj.get("tags", []))
+        already_failed = any(
+            t["name"] == "e621_tag_failed" for t in obj.get("tags", [])
+        )
+    else:
+        obj = stash.find_scene(item_id)
+        already_tagged = any(t["name"] == "e621_tagged" for t in obj.get("tags", []))
+        already_failed = any(
+            t["name"] == "e621_tag_failed" for t in obj.get("tags", [])
+        )
+
+    if already_tagged or already_failed:
         return
 
     try:
         time.sleep(0.5)
         response = requests.get(
-            f"https://e621.net/posts.json?md5={image_md5}",
+            f"https://e621.net/posts.json?md5={item_md5}",
             headers={"User-Agent": "Stash-e621-Tagger/1.0"},
             timeout=10,
         )
@@ -70,8 +126,11 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
     except Exception as e:
         log.error(f"Marking as failed. e621 API error: {str(e)}")
         e621_tag_failed = get_or_create_tag(stash, "e621_tag_failed")
-        fail_ids = [e621_tag_failed["id"]] + [t["id"] for t in image.get("tags", [])]
-        stash.update_image({"id": image_id, "tag_ids": list(set(fail_ids))})
+        fail_ids = [e621_tag_failed["id"]] + [t["id"] for t in obj.get("tags", [])]
+        if item_type == "image":
+            stash.update_image({"id": item_id, "tag_ids": list(set(fail_ids))})
+        else:
+            stash.update_scene({"id": item_id, "tag_ids": list(set(fail_ids))})
         return
 
     if not post_data:
@@ -81,7 +140,7 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
     post_url = f"https://e621.net/posts/{post_data['id']}"
 
     tag_ids = [e621_tag["id"]]
-    for cat in ["general", "species", "character", "artist", "copyright"]:
+    for cat in ["general", "species", "character", "artist", "copyright", "meta"]:
         for tag in post_data.get("tags", {}).get(cat, []):
             clean_tag = tag.strip()
             if not clean_tag:
@@ -102,17 +161,20 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
         performer_ids.append(perf["id"])
 
     try:
-        stash.update_image(
-            {
-                "id": image_id,
-                "organized": True,
-                "urls": [post_url],
-                "tag_ids": list(set(tag_ids)),
-                "studio_id": studio_id,
-                "performer_ids": performer_ids,
-            }
-        )
-        log.info(f"Image updated: {image_id}")
+        update_payload = {
+            "id": item_id,
+            "organized": True,
+            "urls": [post_url],
+            "tag_ids": list(set(tag_ids)),
+            "studio_id": studio_id,
+            "performer_ids": performer_ids,
+        }
+        if item_type == "image":
+            stash.update_image(update_payload)
+            log.info(f"Image updated: {item_id}")
+        else:
+            stash.update_scene(update_payload)
+            log.info(f"Scene updated: {item_id}")
     except Exception as e:
         log.error(f"Update failed: {str(e)}")
 
@@ -173,29 +235,92 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
         return
 
     file_data = image["visual_files"][0]
-    filename = file_data["basename"]
-    filename_md5 = filename.split(".")[0]
+    filename = file_data.get("basename", "")
+    filename_md5 = filename.split(".")[0] if filename else ""
 
-    if re.match(r"^[a-f0-9]{32}$", filename_md5):
+    if MD5_RE.match(filename_md5):
         final_md5 = filename_md5
-        log.info(f"Using filename MD5: {final_md5}")
+        log.info(f"Using filename MD5 for image: {final_md5}")
+    else:
+        # try if API provided checksum/md5 field on image
+        if image.get("checksum"):
+            final_md5 = image["checksum"]
+            log.info(f"Using image checksum: {final_md5}")
+        elif image.get("md5"):
+            final_md5 = image["md5"]
+            log.info(f"Using image md5: {final_md5}")
+        else:
+            try:
+                md5_hash = hashlib.md5()
+                with open(file_data["path"], "rb") as f:
+                    for chunk in iter(lambda: f.read(65536), b""):
+                        md5_hash.update(chunk)
+                final_md5 = md5_hash.hexdigest()
+                log.info(f"Generated content MD5 for image: {final_md5}")
+            except Exception as e:
+                log.error(f"Failed to generate MD5 for image: {str(e)}")
+                return
+
+    process_e621_post_for_item(client, "image", image_id, final_md5)
+
+
+def scrape_scene(client: StashInterface, scene_id: str) -> None:
+    """
+    Attempt to find a stable MD5 for a scene/video:
+      - prefer scene.checksum or scene.md5
+      - then files[0].checksum
+      - then files[0].basename parsed for md5
+      - fallback: compute MD5 from files[0].path
+    """
+    scene = client.find_scene(scene_id)
+    if not scene:
+        return
+
+    final_md5 = None
+
+    # direct fields
+    if scene.get("checksum") and MD5_RE.match(scene.get("checksum")):
+        final_md5 = scene.get("checksum")
+        log.info(f"Using scene checksum: {final_md5}")
+    elif scene.get("md5") and MD5_RE.match(scene.get("md5")):
+        final_md5 = scene.get("md5")
+        log.info(f"Using scene md5: {final_md5}")
     else:
-        try:
-            md5_hash = hashlib.md5()
-            with open(file_data["path"], "rb") as f:
-                for chunk in iter(lambda: f.read(65536), b""):
-                    md5_hash.update(chunk)
-            final_md5 = md5_hash.hexdigest()
-            log.info(f"Generated content MD5: {final_md5}")
-        except Exception as e:
-            log.error(f"Failed to generate MD5: {str(e)}")
+        files = scene.get("files") or scene.get("scene_files") or []
+        if files:
+            file_data = files[0]
+            # try file-level checksum
+            if file_data.get("checksum") and MD5_RE.match(file_data.get("checksum")):
+                final_md5 = file_data.get("checksum")
+                log.info(f"Using file checksum for scene: {final_md5}")
+            else:
+                basename = file_data.get("basename", "")
+                filename_md5 = basename.split(".")[0] if basename else ""
+                if MD5_RE.match(filename_md5):
+                    final_md5 = filename_md5
+                    log.info(f"Using filename MD5 for scene: {final_md5}")
+                else:
+                    # attempt to compute
+                    try:
+                        md5_hash = hashlib.md5()
+                        with open(file_data["path"], "rb") as f:
+                            for chunk in iter(lambda: f.read(65536), b""):
+                                md5_hash.update(chunk)
+                        final_md5 = md5_hash.hexdigest()
+                        log.info(f"Generated content MD5 for scene: {final_md5}")
+                    except Exception as e:
+                        log.error(f"Failed to generate MD5 for scene: {str(e)}")
+                        return
+        else:
+            log.error(f"No files found for scene {scene_id}; cannot compute md5")
             return
 
-    process_e621_post(client, image_id, final_md5)
+    if final_md5:
+        process_e621_post_for_item(client, "scene", scene_id, final_md5)
 
 
 if __name__ == "__main__":
-    log.info("Starting tagger with stable pagination snapshot...")
+    log.info("Starting tagger with stable pagination snapshot (images + scenes)...")
     json_input = json.loads(sys.stdin.read())
     stash = StashInterface(json_input["server_connection"])
 
@@ -209,7 +334,7 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
 
     # resolve skip tag NAMES from settings to tag IDs
     skip_tag_names = [n.strip() for n in settings["SkipTags"].split(",") if n.strip()]
-    skip_tag_ids = []
+    skip_tag_ids: List[int] = []
     for name in skip_tag_names:
         found = stash.find_tags(f={"name": {"value": name, "modifier": "EQUALS"}})
         if found:
@@ -219,17 +344,43 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
 
     log.info("Fetching images in pages (stable snapshot)...")
     images = get_all_images(
-        stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=10
+        stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=50
+    )
+    log.info("Fetching scenes in pages (stable snapshot)...")
+    scenes = get_all_scenes(
+        stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=50
     )
-    total = len(images) or 1
 
-    for idx, image in enumerate(images, start=1):
-        progress = idx / total
-        log.progress(progress)
+    # build unified list with type so we can preserve progress and skipping logic
+    unified = []
+    for img in images:
+        unified.append({"type": "image", "obj": img})
+    for sc in scenes:
+        unified.append({"type": "scene", "obj": sc})
+
+    total = len(unified) or 1
+    for idx, entry in enumerate(unified, start=1):
+        # report start-of-item progress (0..1). avoid sending 1.0 until the very end.
+        log.progress(float(idx - 1) / float(total))
+
+        item_type = entry["type"]
+        item = entry["obj"]
+        item_id = item["id"]
 
-        current_tag_ids = [t["id"] for t in image.get("tags", [])]
+        current_tag_ids = [t["id"] for t in item.get("tags", [])]
         if any(tid in current_tag_ids for tid in skip_tag_ids):
-            log.info(f"Skipping image {image['id']} - contains skip tag")
+            log.info(f"Skipping {item_type} {item_id} - contains skip tag")
+            # reflect the skipped item as completed
+            log.progress(float(idx) / float(total))
             continue
 
-        scrape_image(stash, image["id"])
+        if item_type == "image":
+            scrape_image(stash, item_id)
+        else:
+            scrape_scene(stash, item_id)
+
+        # mark this item done
+        log.progress(float(idx) / float(total))
+
+    # ensure UI shows complete when finished
+    log.progress(1.0)

From d478cd9ddee4b32df09df64b5df122427a3b34b3 Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Thu, 11 Sep 2025 21:24:05 +0200
Subject: [PATCH 06/10] e621_tagger.py - added meta tags, removed character
 tags (they are already set as performers)

---
 plugins/e621_tagger/e621_tagger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/e621_tagger/e621_tagger.py b/plugins/e621_tagger/e621_tagger.py
index d80e1a83..3cdac16f 100644
--- a/plugins/e621_tagger/e621_tagger.py
+++ b/plugins/e621_tagger/e621_tagger.py
@@ -140,7 +140,7 @@ def process_e621_post_for_item(
     post_url = f"https://e621.net/posts/{post_data['id']}"
 
     tag_ids = [e621_tag["id"]]
-    for cat in ["general", "species", "character", "artist", "copyright", "meta"]:
+    for cat in ["general", "species", "artist", "copyright", "meta"]:
         for tag in post_data.get("tags", {}).get(cat, []):
             clean_tag = tag.strip()
             if not clean_tag:

From d18638a82ca17f8ceda42c57fb157f8d6af0468f Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Thu, 11 Sep 2025 21:24:36 +0200
Subject: [PATCH 07/10] e621_tagger.yml - bump version to 0.3

---
 plugins/e621_tagger/e621_tagger.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/e621_tagger/e621_tagger.yml b/plugins/e621_tagger/e621_tagger.yml
index 9b8ecc28..727921d1 100644
--- a/plugins/e621_tagger/e621_tagger.yml
+++ b/plugins/e621_tagger/e621_tagger.yml
@@ -1,6 +1,6 @@
 name: e621_tagger
 description: Finding images and videos on e621 and tagging them.
-version: 0.2
+version: 0.3
 url: https://github.com/stashapp/CommunityScripts/
 exec:
   - python

From 12f2ab4339e36561060298032ae126af57278466 Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Thu, 11 Sep 2025 21:53:16 +0200
Subject: [PATCH 08/10] e621_tagger.py fixed pagination

---
 plugins/e621_tagger/e621_tagger.py | 150 +++++++++++++----------------
 1 file changed, 65 insertions(+), 85 deletions(-)

diff --git a/plugins/e621_tagger/e621_tagger.py b/plugins/e621_tagger/e621_tagger.py
index 3cdac16f..1472ea2e 100644
--- a/plugins/e621_tagger/e621_tagger.py
+++ b/plugins/e621_tagger/e621_tagger.py
@@ -4,99 +4,97 @@
 import json
 import time
 import requests
+import itertools
 import stashapi.log as log
 from stashapi.stashapp import StashInterface
 from typing import List
 
-
 MD5_RE = re.compile(r"^[a-f0-9]{32}$")
 
 
-def get_all_images(
+def _build_filter(skip_tag_ids, exclude_organized):
+    f = {}
+    if skip_tag_ids:
+        f["tags"] = {
+            "value": [],
+            "excludes": skip_tag_ids,
+            "modifier": "INCLUDES_ALL",
+            "depth": -1,
+        }
+    if exclude_organized:
+        f["organized"] = False
+    return f
+
+
+def count_images(
+    client: StashInterface, skip_tag_ids: list, exclude_organized: bool
+) -> int:
+    image_filter = _build_filter(skip_tag_ids, exclude_organized)
+    pagination = {"page": 1, "per_page": 0, "sort": "created_at", "direction": "ASC"}
+    total, _ = client.find_images(f=image_filter, filter=pagination, get_count=True)
+    return total
+
+
+def count_scenes(
+    client: StashInterface, skip_tag_ids: list, exclude_organized: bool
+) -> int:
+    scene_filter = _build_filter(skip_tag_ids, exclude_organized)
+    pagination = {"page": 1, "per_page": 0, "sort": "created_at", "direction": "ASC"}
+    total, _ = client.find_scenes(f=scene_filter, filter=pagination, get_count=True)
+    return total
+
+
+def stream_images(
     client: StashInterface,
     skip_tag_ids: List[int],
     exclude_organized: bool,
     per_page: int = 100,
-) -> List[dict]:
+):
     page = 1
-    all_images = []
+    base_filter = _build_filter(skip_tag_ids, exclude_organized)
     while True:
-        image_filter = {}
         pagination = {
             "page": page,
             "per_page": per_page,
             "sort": "created_at",
             "direction": "ASC",
         }
-
-        if skip_tag_ids:
-            image_filter["tags"] = {
-                "value": [],
-                "excludes": skip_tag_ids,
-                "modifier": "INCLUDES_ALL",
-                "depth": -1,
-            }
-
-        if exclude_organized:
-            image_filter["organized"] = False
-
-        images = client.find_images(f=image_filter, filter=pagination)
+        images = client.find_images(f=base_filter, filter=pagination)
         if not images:
             break
-
         log.info(f"Fetched image page {page} with {len(images)} images")
-        all_images.extend(images)
+        for img in images:
+            yield ("image", img)
         page += 1
 
-    return all_images
-
 
-def get_all_scenes(
+def stream_scenes(
     client: StashInterface,
     skip_tag_ids: List[int],
     exclude_organized: bool,
     per_page: int = 100,
-) -> List[dict]:
+):
     page = 1
-    all_scenes = []
+    base_filter = _build_filter(skip_tag_ids, exclude_organized)
     while True:
-        scene_filter = {}
         pagination = {
             "page": page,
             "per_page": per_page,
             "sort": "created_at",
             "direction": "ASC",
         }
-
-        if skip_tag_ids:
-            scene_filter["tags"] = {
-                "value": [],
-                "excludes": skip_tag_ids,
-                "modifier": "INCLUDES_ALL",
-                "depth": -1,
-            }
-
-        if exclude_organized:
-            scene_filter["organized"] = False
-
-        scenes = client.find_scenes(f=scene_filter, filter=pagination)
+        scenes = client.find_scenes(f=base_filter, filter=pagination)
         if not scenes:
             break
-
         log.info(f"Fetched scene page {page} with {len(scenes)} scenes")
-        all_scenes.extend(scenes)
+        for sc in scenes:
+            yield ("scene", sc)
         page += 1
 
-    return all_scenes
-
 
 def process_e621_post_for_item(
     stash: StashInterface, item_type: str, item_id: str, item_md5: str
 ) -> None:
-    """
-    item_type: "image" or "scene"
-    Update the corresponding object on success, or tag as failed on API error.
-    """
     # Fetch latest object to check tags
     if item_type == "image":
         obj = stash.find_image(item_id)
@@ -242,7 +240,6 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
         final_md5 = filename_md5
         log.info(f"Using filename MD5 for image: {final_md5}")
     else:
-        # try if API provided checksum/md5 field on image
         if image.get("checksum"):
             final_md5 = image["checksum"]
             log.info(f"Using image checksum: {final_md5}")
@@ -265,20 +262,12 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
 
 
 def scrape_scene(client: StashInterface, scene_id: str) -> None:
-    """
-    Attempt to find a stable MD5 for a scene/video:
-      - prefer scene.checksum or scene.md5
-      - then files[0].checksum
-      - then files[0].basename parsed for md5
-      - fallback: compute MD5 from files[0].path
-    """
     scene = client.find_scene(scene_id)
     if not scene:
         return
 
     final_md5 = None
 
-    # direct fields
     if scene.get("checksum") and MD5_RE.match(scene.get("checksum")):
         final_md5 = scene.get("checksum")
         log.info(f"Using scene checksum: {final_md5}")
@@ -289,7 +278,6 @@ def scrape_scene(client: StashInterface, scene_id: str) -> None:
         files = scene.get("files") or scene.get("scene_files") or []
         if files:
             file_data = files[0]
-            # try file-level checksum
             if file_data.get("checksum") and MD5_RE.match(file_data.get("checksum")):
                 final_md5 = file_data.get("checksum")
                 log.info(f"Using file checksum for scene: {final_md5}")
@@ -300,7 +288,6 @@ def scrape_scene(client: StashInterface, scene_id: str) -> None:
                     final_md5 = filename_md5
                     log.info(f"Using filename MD5 for scene: {final_md5}")
                 else:
-                    # attempt to compute
                     try:
                         md5_hash = hashlib.md5()
                         with open(file_data["path"], "rb") as f:
@@ -320,7 +307,7 @@ def scrape_scene(client: StashInterface, scene_id: str) -> None:
 
 
 if __name__ == "__main__":
-    log.info("Starting tagger with stable pagination snapshot (images + scenes)...")
+    log.info("Starting tagger with stable pagination snapshot (streamed)...")
     json_input = json.loads(sys.stdin.read())
     stash = StashInterface(json_input["server_connection"])
 
@@ -328,49 +315,44 @@ def scrape_scene(client: StashInterface, scene_id: str) -> None:
     settings = {"SkipTags": "e621_tagged, e621_tag_failed", "ExcludeOrganized": False}
     settings.update(config.get("e621_tagger", {}))
 
-    # ensure e621 tags exist and get their ids
     e621_tagged = get_or_create_tag(stash, "e621_tagged")
     e621_failed = get_or_create_tag(stash, "e621_tag_failed")
 
-    # resolve skip tag NAMES from settings to tag IDs
     skip_tag_names = [n.strip() for n in settings["SkipTags"].split(",") if n.strip()]
     skip_tag_ids: List[int] = []
     for name in skip_tag_names:
         found = stash.find_tags(f={"name": {"value": name, "modifier": "EQUALS"}})
         if found:
             skip_tag_ids.append(found[0]["id"])
-    # always include the internal e621 tags (ensure ints)
     skip_tag_ids.extend([e621_tagged["id"], e621_failed["id"]])
 
-    log.info("Fetching images in pages (stable snapshot)...")
-    images = get_all_images(
-        stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=50
-    )
-    log.info("Fetching scenes in pages (stable snapshot)...")
-    scenes = get_all_scenes(
-        stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=50
-    )
+    per_page = 50
+
+    log.info("Counting images (no storage)...")
+    num_images = count_images(stash, skip_tag_ids, settings["ExcludeOrganized"])
+    log.info("Counting scenes (no storage)...")
+    num_scenes = count_scenes(stash, skip_tag_ids, settings["ExcludeOrganized"])
+
+    total = (num_images + num_scenes) or 1
 
-    # build unified list with type so we can preserve progress and skipping logic
-    unified = []
-    for img in images:
-        unified.append({"type": "image", "obj": img})
-    for sc in scenes:
-        unified.append({"type": "scene", "obj": sc})
+    log.info(f"Total items (images + scenes): {total}")
 
-    total = len(unified) or 1
-    for idx, entry in enumerate(unified, start=1):
-        # report start-of-item progress (0..1). avoid sending 1.0 until the very end.
+    stream = itertools.chain(
+        stream_images(
+            stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=per_page
+        ),
+        stream_scenes(
+            stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=per_page
+        ),
+    )
+
+    for idx, (item_type, item) in enumerate(stream, start=1):
         log.progress(float(idx - 1) / float(total))
 
-        item_type = entry["type"]
-        item = entry["obj"]
         item_id = item["id"]
-
         current_tag_ids = [t["id"] for t in item.get("tags", [])]
         if any(tid in current_tag_ids for tid in skip_tag_ids):
             log.info(f"Skipping {item_type} {item_id} - contains skip tag")
-            # reflect the skipped item as completed
             log.progress(float(idx) / float(total))
             continue
 
@@ -379,8 +361,6 @@ def scrape_scene(client: StashInterface, scene_id: str) -> None:
         else:
             scrape_scene(stash, item_id)
 
-        # mark this item done
         log.progress(float(idx) / float(total))
 
-    # ensure UI shows complete when finished
     log.progress(1.0)

From e6e13d5c435834e0a561397d422010e2e2bf6ed0 Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Fri, 12 Sep 2025 02:31:52 +0200
Subject: [PATCH 09/10] e621_tagger.py - fix pagination (again)

---
 plugins/e621_tagger/e621_tagger.py | 191 ++++++++++++++++++++++-------
 1 file changed, 147 insertions(+), 44 deletions(-)

diff --git a/plugins/e621_tagger/e621_tagger.py b/plugins/e621_tagger/e621_tagger.py
index 1472ea2e..5f7306cb 100644
--- a/plugins/e621_tagger/e621_tagger.py
+++ b/plugins/e621_tagger/e621_tagger.py
@@ -7,7 +7,7 @@
 import itertools
 import stashapi.log as log
 from stashapi.stashapp import StashInterface
-from typing import List
+from typing import List, Optional, Tuple
 
 MD5_RE = re.compile(r"^[a-f0-9]{32}$")
 
@@ -94,7 +94,11 @@ def stream_scenes(
 
 def process_e621_post_for_item(
     stash: StashInterface, item_type: str, item_id: str, item_md5: str
-) -> None:
+) -> bool:
+    """
+    CHANGED: return boolean indicating whether the item was updated/marked (True) or left untouched (False).
+    This lets the caller (main loop) increment progress only when an item actually changed state.
+    """
     # Fetch latest object to check tags
     if item_type == "image":
         obj = stash.find_image(item_id)
@@ -110,7 +114,7 @@ def process_e621_post_for_item(
         )
 
     if already_tagged or already_failed:
-        return
+        return False  # nothing to do
 
     try:
         time.sleep(0.5)
@@ -125,14 +129,19 @@ def process_e621_post_for_item(
         log.error(f"Marking as failed. e621 API error: {str(e)}")
         e621_tag_failed = get_or_create_tag(stash, "e621_tag_failed")
         fail_ids = [e621_tag_failed["id"]] + [t["id"] for t in obj.get("tags", [])]
-        if item_type == "image":
-            stash.update_image({"id": item_id, "tag_ids": list(set(fail_ids))})
-        else:
-            stash.update_scene({"id": item_id, "tag_ids": list(set(fail_ids))})
-        return
+        try:
+            if item_type == "image":
+                stash.update_image({"id": item_id, "tag_ids": list(set(fail_ids))})
+            else:
+                stash.update_scene({"id": item_id, "tag_ids": list(set(fail_ids))})
+            return True
+        except Exception as e2:
+            log.error(f"Failed to mark as failed: {str(e2)}")
+            return False
 
     if not post_data:
-        return
+        # not found on e621: leave untouched so it can be retried later (or user may decide to mark failed)
+        return False
 
     e621_tag = get_or_create_tag(stash, "e621_tagged")
     post_url = f"https://e621.net/posts/{post_data['id']}"
@@ -173,8 +182,10 @@ def process_e621_post_for_item(
         else:
             stash.update_scene(update_payload)
             log.info(f"Scene updated: {item_id}")
+        return True
     except Exception as e:
         log.error(f"Update failed: {str(e)}")
+        return False
 
 
 def get_or_create_tag(stash: StashInterface, tag_name: str) -> dict:
@@ -227,10 +238,13 @@ def get_or_create_performer(stash: StashInterface, name: str) -> dict:
     return performers[0] if performers else stash.create_performer({"name": name})
 
 
-def scrape_image(client: StashInterface, image_id: str) -> None:
+def scrape_image(client: StashInterface, image_id: str) -> bool:
+    """
+    PAGINATION: return True if item was updated/marked (so main loop can count progress).
+    """
     image = client.find_image(image_id)
     if not image or not image.get("visual_files"):
-        return
+        return False
 
     file_data = image["visual_files"][0]
     filename = file_data.get("basename", "")
@@ -256,15 +270,18 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
                 log.info(f"Generated content MD5 for image: {final_md5}")
             except Exception as e:
                 log.error(f"Failed to generate MD5 for image: {str(e)}")
-                return
+                return False
 
-    process_e621_post_for_item(client, "image", image_id, final_md5)
+    return process_e621_post_for_item(client, "image", image_id, final_md5)
 
 
-def scrape_scene(client: StashInterface, scene_id: str) -> None:
+def scrape_scene(client: StashInterface, scene_id: str) -> bool:
+    """
+    PAGINATION: return True if item was updated/marked (so main loop can count progress).
+    """
     scene = client.find_scene(scene_id)
     if not scene:
-        return
+        return False
 
     final_md5 = None
 
@@ -297,17 +314,16 @@ def scrape_scene(client: StashInterface, scene_id: str) -> None:
                         log.info(f"Generated content MD5 for scene: {final_md5}")
                     except Exception as e:
                         log.error(f"Failed to generate MD5 for scene: {str(e)}")
-                        return
+                        return False
         else:
             log.error(f"No files found for scene {scene_id}; cannot compute md5")
-            return
+            return False
 
-    if final_md5:
-        process_e621_post_for_item(client, "scene", scene_id, final_md5)
+    return process_e621_post_for_item(client, "scene", scene_id, final_md5)
 
 
 if __name__ == "__main__":
-    log.info("Starting tagger with stable pagination snapshot (streamed)...")
+    log.info("Starting tagger with scanning passes until no work left...")
     json_input = json.loads(sys.stdin.read())
     stash = StashInterface(json_input["server_connection"])
 
@@ -337,30 +353,117 @@ def scrape_scene(client: StashInterface, scene_id: str) -> None:
 
     log.info(f"Total items (images + scenes): {total}")
 
-    stream = itertools.chain(
-        stream_images(
-            stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=per_page
-        ),
-        stream_scenes(
-            stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=per_page
-        ),
-    )
-
-    for idx, (item_type, item) in enumerate(stream, start=1):
-        log.progress(float(idx - 1) / float(total))
-
-        item_id = item["id"]
-        current_tag_ids = [t["id"] for t in item.get("tags", [])]
-        if any(tid in current_tag_ids for tid in skip_tag_ids):
-            log.info(f"Skipping {item_type} {item_id} - contains skip tag")
-            log.progress(float(idx) / float(total))
-            continue
-
-        if item_type == "image":
-            scrape_image(stash, item_id)
-        else:
-            scrape_scene(stash, item_id)
+    processed_count = 0
+    pass_num = 0
+    # Loop passes until a full pass processes zero items.
+    while True:
+        pass_num += 1
+        log.info(f"Starting scanning pass #{pass_num}")
+        pass_processed = 0
+
+        # Scan images by pages
+        page = 1
+        while True:
+            pagination = {
+                "page": page,
+                "per_page": per_page,
+                "sort": "created_at",
+                "direction": "ASC",
+            }
+            images = stash.find_images(f=_build_filter(skip_tag_ids, settings["ExcludeOrganized"]), filter=pagination)
+            log.info(f"[pass {pass_num}] fetched image page {page}, count={len(images)}")
+            if not images:
+                break
+            for img in images:
+                item_id = img.get("id")
+                if not item_id:
+                    log.error(f"[pass {pass_num}] image without id on page {page}")
+                    continue
+
+                # Defensive fetch of current tags to avoid race conditions
+                current = stash.find_image(item_id)
+                current_tag_ids = [t["id"] for t in current.get("tags", [])]
+                if any(tid in current_tag_ids for tid in skip_tag_ids):
+                    # Shouldn't usually happen because filter excluded them, but handle gracefully.
+                    log.info(f"[pass {pass_num}] skipping image {item_id} - now has skip tag")
+                    processed_count += 1
+                    pass_processed += 1
+                    log.progress(float(processed_count) / float(total))
+                    continue
+
+                # Attempt to process; scrape_image now returns True if it updated/marked the item.
+                try:
+                    updated = scrape_image(stash, item_id)
+                except Exception as e:
+                    log.error(f"[pass {pass_num}] scrape_image exception for {item_id}: {str(e)}")
+                    updated = False
+
+                if updated:
+                    processed_count += 1
+                    pass_processed += 1
+                    log.info(f"[pass {pass_num}] processed image {item_id} (processed_count={processed_count})")
+                    log.progress(float(processed_count) / float(total))
+                # If not updated, it will remain in future passes. Continue scanning.
+
+            # If fewer than per_page results, we're at the end of current snapshot
+            if len(images) < per_page:
+                break
+            page += 1
+
+        # Scan scenes by pages
+        page = 1
+        while True:
+            pagination = {
+                "page": page,
+                "per_page": per_page,
+                "sort": "created_at",
+                "direction": "ASC",
+            }
+            scenes = stash.find_scenes(f=_build_filter(skip_tag_ids, settings["ExcludeOrganized"]), filter=pagination)
+            log.info(f"[pass {pass_num}] fetched scene page {page}, count={len(scenes)}")
+            if not scenes:
+                break
+            for sc in scenes:
+                item_id = sc.get("id")
+                if not item_id:
+                    log.error(f"[pass {pass_num}] scene without id on page {page}")
+                    continue
+
+                # Defensive fetch
+                current = stash.find_scene(item_id)
+                current_tag_ids = [t["id"] for t in current.get("tags", [])]
+                if any(tid in current_tag_ids for tid in skip_tag_ids):
+                    log.info(f"[pass {pass_num}] skipping scene {item_id} - now has skip tag")
+                    processed_count += 1
+                    pass_processed += 1
+                    log.progress(float(processed_count) / float(total))
+                    continue
+
+                try:
+                    updated = scrape_scene(stash, item_id)
+                except Exception as e:
+                    log.error(f"[pass {pass_num}] scrape_scene exception for {item_id}: {str(e)}")
+                    updated = False
+
+                if updated:
+                    processed_count += 1
+                    pass_processed += 1
+                    log.info(f"[pass {pass_num}] processed scene {item_id} (processed_count={processed_count})")
+                    log.progress(float(processed_count) / float(total))
+
+            if len(scenes) < per_page:
+                break
+            page += 1
+
+        log.info(f"Pass #{pass_num} finished. items processed this pass: {pass_processed}")
+
+        # If no items processed in a full pass, we're done
+        if pass_processed == 0:
+            log.info("No items processed in last pass; finishing scan.")
+            break
 
-        log.progress(float(idx) / float(total))
+        # Small sleep to avoid hammering API and to let the DB settle between passes
+        time.sleep(0.2)
 
+    # ensure progress finished
     log.progress(1.0)

From f435f079a82499fec4f0f03cd3a04f2377495a8b Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Fri, 12 Sep 2025 02:32:09 +0200
Subject: [PATCH 10/10] e621_tagger.yml - bump version to 0.4 (pagination fix)

---
 plugins/e621_tagger/e621_tagger.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/e621_tagger/e621_tagger.yml b/plugins/e621_tagger/e621_tagger.yml
index 727921d1..911080ff 100644
--- a/plugins/e621_tagger/e621_tagger.yml
+++ b/plugins/e621_tagger/e621_tagger.yml
@@ -1,6 +1,6 @@
 name: e621_tagger
 description: Finding images and videos on e621 and tagging them.
-version: 0.3
+version: 0.4
 url: https://github.com/stashapp/CommunityScripts/
 exec:
   - python