## Merge Multiple Label Studio Annotator Exports — ARI3129 Assignment 2025/26


This notebook combines 2–4 Label Studio JSON files (each from a different annotator) and 2–4 ZIPs of images into a single unified dataset.

Output:
- `Merger/Individuals/` — folder containing each annotator's original uploaded JSON and ZIP files (saved as `input_{name}.json` and `images_{name}.zip`)
- `Merger/merged_input.json` — merged Label Studio task array (all annotations combined)
- `Merger/merged_images.zip` — flattened, deduplicated set of all unique images

## 1) Environment and Dependency Setup

This section ensures that all required Python packages are available before the notebook is run.  
If any dependency such as `ipywidgets` or `IPython` is missing, it is automatically installed using `pip`.

In [None]:
from __future__ import annotations

# --- Standard library ---
import importlib
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List, Tuple, Set
from collections import defaultdict
from datetime import datetime, timezone
import hashlib
import json
import os
import shutil
import zipfile

# --- Package bootstrap helper ---
def ensure_package(pkg: str, import_name: str | None = None):
    """Import a module, installing its distribution with pip if it is missing.

    Args:
        pkg: Name passed to ``pip install`` when the first import fails.
        import_name: Dotted module name to import; defaults to ``pkg``.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with an error.
        ImportError: If the module still cannot be imported after installing.
    """
    module_name = import_name or pkg
    try:
        return importlib.import_module(module_name)
    except ImportError:
        print(f"Installing missing package: {pkg}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        # The import system caches directory listings; refresh them so a
        # package installed while this interpreter is running can be found.
        importlib.invalidate_caches()
        return importlib.import_module(module_name)

# --- Ensure and import third-party packages ---
# The first argument is the pip distribution name, the second the module to
# import: IPython is published on PyPI as "ipython", not "IPython.display".
ensure_package("ipywidgets")
ensure_package("ipython", "IPython.display")

import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output

print("Environment ready: required dependencies installed and imported.")

## 2) Consolidate Members’ Exports

This section merges multiple team members’ **Label Studio JSON** files and their **images ZIPs** into a single, consolidated set for downstream conversion.

**How to use**
1. Set **Team size** (2–4) and click **Set team size**.  
2. For each member, enter **Full name**, upload **Annotation JSON** and **Images ZIP**.  
3. Click **Merge to merged_input.json + merged_images.zip**.  
4. Check the output panel for paths, counts, and any validation errors.

<div style="border:3px solid #c00; background:#fff7f7; padding:12px; border-radius:6px">
    <strong style="color:#000000">⚠️ IMPORTANT</strong>
    <ul>
        <li style="color:#000000">If a white box appears with no buttons or widgets, restart the kernel and re-run the notebook cells one by one.</li>
        <li style="color:#000000">Ensure that you upload each team member's <strong>JSON</strong> and <strong>ZIP</strong> pair corresponding to their own Label Studio annotations and images.</li>
    </ul>
</div>


In [None]:
# ---- Paths ----
# All output is written below "<current working directory>/Merger".
ROOT_DIR = Path.cwd()
MERGER_DIR = ROOT_DIR / "Merger"
INDIV_DIR = MERGER_DIR / "Individuals"  # per-member copies of the uploads
IMG_WORK = MERGER_DIR / "Temp"  # temporary extraction workspace
for d in (MERGER_DIR, INDIV_DIR, IMG_WORK):
    d.mkdir(parents=True, exist_ok=True)

# File suffixes (lower-case) that are treated as images.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

# ---- Widgets (step 1: team size) ----
team_size_input = widgets.BoundedIntText(
    description="Team size",
    value=2,
    min=2,
    max=4,
    step=1,
)
build_team_btn = widgets.Button(button_style="info", description="Set team size")
team_box = widgets.VBox(
    children=[
        widgets.HTML(value="<b>Step 1)</b> Enter number of team members (2–4)."),
        team_size_input,
        build_team_btn,
    ]
)

# Placeholder containers (populated after the team size is set)
members_ui_box = widgets.VBox(children=[])
run_btn = widgets.Button(
    icon="check",
    button_style="success",
    description="Merge to merged_input.json + merged_images.zip",
)
out_box = widgets.Output(
    layout=widgets.Layout(border="1px solid #ddd", padding="8px"),
)

# Render the whole form: team size, per-member uploads, merge button, log.
display(
    widgets.VBox(
        children=[
            team_box,
            widgets.HTML(value="<hr>"),
            widgets.HTML(value="<b>Step 2)</b> Provide full name and uploads for each member."),
            members_ui_box,
            widgets.HTML(value="<hr>"),
            run_btn,
            out_box,
        ]
    )
)

# ---- Helpers ----
def _safe_name(s: str) -> str:
    """Return *s* stripped, with every disallowed character replaced by ``_``.

    Alphanumeric characters, spaces, underscores, hyphens and dots are kept
    as they are. A falsy input (``None`` or ``""``) yields ``""``.
    """
    allowed_punctuation = (" ", "_", "-", ".")
    cleaned = []
    for ch in (s or "").strip():
        keep = ch.isalnum() or ch in allowed_punctuation
        cleaned.append(ch if keep else "_")
    return "".join(cleaned)

def _save_fileupload_single(u: widgets.FileUpload, dest_path: Path) -> Path:
    """Write the first file held by the upload widget *u* to *dest_path*.

    ``u.value`` is accepted in two shapes: a dict mapping filename to a record
    (ipywidgets 7) or a tuple/list of records (ipywidgets 8). In both cases
    the record's ``"content"`` entry is written in binary mode.

    Returns:
        ``dest_path``.

    Raises:
        RuntimeError: If nothing was uploaded or ``u.value`` has another type.
    """
    uploaded = u.value
    if not uploaded:
        raise RuntimeError("No file uploaded.")
    if not isinstance(uploaded, (dict, tuple, list)):
        raise RuntimeError(f"Unexpected FileUpload.value type: {type(uploaded)}")

    # dict -> ipywidgets 7 layout, sequence -> ipywidgets 8 layout
    if isinstance(uploaded, dict):
        record = next(iter(uploaded.values()))
    else:
        record = uploaded[0]

    with dest_path.open("wb") as sink:
        sink.write(record["content"])
    return dest_path

def _load_ls_array(p: Path) -> List[Dict[str, Any]]:
    """Parse the UTF-8 JSON file *p* and return it if it is a list.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    payload = json.loads(p.read_text(encoding="utf-8"))
    if isinstance(payload, list):
        return payload
    raise ValueError(f"{p.name} is not a Label Studio task array.")

def _task_key(task: Dict[str, Any]) -> str:
    """Return the string used to group tasks that refer to the same image.

    A truthy ``file_upload`` value wins; otherwise the final path component
    of ``data.image`` is used, falling back to the raw image string when that
    component is empty.
    """
    uploaded_name = task.get("file_upload")
    if uploaded_name:
        return str(uploaded_name)

    data = task.get("data") or {}
    image_ref = data.get("image", "")
    basename = Path(image_ref).name
    return basename if basename else str(image_ref)

def _merge_ls_tasks(task_lists: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Merge several task arrays into one, collapsing tasks with the same key.

    Tasks are grouped by ``_task_key``. For each group, the first task with
    the most annotations supplies the task-level fields, and the annotations
    of every task in the group are concatenated in input order. Annotation
    ids are renumbered from 1 within each merged task, and any result entry
    without an id receives a generated ``res_<annotation id>_<hash>`` id.

    Args:
        task_lists: One task array per annotator export.

    Returns:
        A new list of merged tasks; the input tasks are not modified.
    """
    grouped: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for task_list in task_lists:
        for task in task_list:
            grouped[_task_key(task)].append(task)

    merged: List[Dict[str, Any]] = []
    for tasks in grouped.values():
        # "annotations" may be absent or explicitly null; count both as empty
        # so the comparison never calls len() on None. max() returns the first
        # of equally ranked tasks, i.e. the earliest one in input order.
        richest = max(tasks, key=lambda t: len(t.get("annotations") or []))
        base = json.loads(json.dumps(richest))  # deep copy of JSON data

        combined = [ann for task in tasks for ann in (task.get("annotations") or [])]

        renumbered: List[Dict[str, Any]] = []
        for ann_id, ann in enumerate(combined, start=1):
            ann_copy = json.loads(json.dumps(ann))
            ann_copy["id"] = ann_id
            for result in ann_copy.get("result") or []:
                if not result.get("id"):
                    digest = hashlib.md5(str(result).encode()).hexdigest()[:6]
                    result["id"] = f"res_{ann_id}_{digest}"
            renumbered.append(ann_copy)

        base["annotations"] = renumbered
        merged.append(base)
    return merged

def _sha1_of_file(p: Path, buf_size: int = 1 << 20) -> str:
    """Return the hexadecimal SHA-1 digest of the file at *p*.

    The file is read in chunks of *buf_size* bytes rather than all at once.
    """
    digest = hashlib.sha1()
    with p.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file)
        for chunk in iter(lambda: stream.read(buf_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def _extract_and_flatten_zips(zip_paths: List[Path], dest_dir: Path) -> Tuple[int, int]:
    """Extract image files from several ZIPs into one flat, deduplicated folder.

    ``dest_dir`` is deleted and recreated. Each archive is unpacked into its
    own temporary subfolder, so entries with the same path in different
    archives cannot overwrite each other before they are compared. Files are
    then copied into ``dest_dir`` without their folder structure:

    * files whose suffix is not in ``IMAGE_EXTENSIONS`` are ignored, as are
      ``._*`` and ``DS_Store`` entries;
    * a file whose content (SHA-1) was already copied is skipped;
    * a file with new content but an already used name gets the first eight
      hash characters appended to its stem.

    Args:
        zip_paths: Archives to unpack, processed in the given order.
        dest_dir: Folder that receives the flattened images.

    Returns:
        ``(written, skipped)``: number of images copied and number of
        duplicate images skipped.
    """
    if dest_dir.exists():
        shutil.rmtree(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)

    seen_hash: Dict[str, str] = {}  # content hash -> file name in dest_dir
    written = 0
    skipped = 0

    tmp_root = dest_dir.parent / "_unzips_tmp"
    if tmp_root.exists():
        shutil.rmtree(tmp_root)
    tmp_root.mkdir(parents=True, exist_ok=True)

    try:
        for index, zp in enumerate(zip_paths):
            # One subfolder per archive: two archives holding different images
            # at the same relative path must both survive extraction.
            extract_dir = tmp_root / f"zip_{index:03d}"
            extract_dir.mkdir()
            with zipfile.ZipFile(zp, "r") as zf:
                zf.extractall(extract_dir)

        # Sorted traversal (archive order, then path) makes it deterministic
        # which of two clashing files keeps the unsuffixed name.
        for p in sorted(tmp_root.rglob("*")):
            if not p.is_file():
                continue
            if p.name.startswith("._") or "DS_Store" in p.name:
                continue
            if p.suffix.lower() not in IMAGE_EXTENSIONS:
                continue

            sha1 = _sha1_of_file(p)
            if sha1 in seen_hash:
                skipped += 1
                continue

            dst = dest_dir / p.name
            if dst.exists():  # different content, same name
                dst = dest_dir / f"{p.stem}_{sha1[:8]}{p.suffix}"
            shutil.copy2(p, dst)
            seen_hash[sha1] = dst.name
            written += 1
    finally:
        # Always remove the scratch folder, even when extraction fails.
        shutil.rmtree(tmp_root, ignore_errors=True)

    return written, skipped

def _zip_dir(src_dir: Path, out_zip: Path):
    """Create *out_zip* from every file below *src_dir*, replacing any old archive.

    Entries are stored deflated, in sorted path order, under their path
    relative to *src_dir*. Directories themselves are not stored.
    """
    if out_zip.exists():
        out_zip.unlink()
    with zipfile.ZipFile(out_zip, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        entries = sorted(src_dir.rglob("*"))
        for entry in filter(Path.is_file, entries):
            archive.write(entry, arcname=entry.relative_to(src_dir))

# ---- Build dynamic per-member UI ----
# Widget handles of every member row; refilled each time the team size is set.
member_rows: List[Dict[str, Any]] = []


def _build_member_rows(_):
    """Rebuild the per-member input rows for the currently selected team size.

    Clears the existing rows, then creates a name field, a JSON upload and a
    ZIP upload for each member and records them in ``member_rows``. The
    argument (the clicked button) is ignored.
    """
    members_ui_box.children = []
    member_rows.clear()

    team_size = int(team_size_input.value)
    if not 2 <= team_size <= 4:
        with out_box:
            print("Team size must be between 2 and 4.")
        return

    row_boxes = []
    for number in range(1, team_size + 1):
        name_field = widgets.Text(description=f"Member {number}:", placeholder="Full name")
        json_upload = widgets.FileUpload(accept=".json", multiple=False, description="Upload JSON")
        zip_upload = widgets.FileUpload(accept=".zip", multiple=False, description="Upload ZIP")

        json_line = widgets.HBox([widgets.HTML("Annotation JSON:"), json_upload])
        zip_line = widgets.HBox([widgets.HTML("Images ZIP:"), zip_upload])
        row_boxes.append(
            widgets.VBox([
                widgets.HTML(f"<b>Member {number}</b>"),
                name_field,
                json_line,
                zip_line,
                widgets.HTML("<hr>"),
            ])
        )
        member_rows.append(
            {"name_widget": name_field, "json_upl": json_upload, "zip_upl": zip_upload}
        )
    members_ui_box.children = row_boxes


build_team_btn.on_click(_build_member_rows)

# ---- Main action ----
def _run_merge(_):
    """Validate the member inputs, then write the merged JSON and images ZIP.

    For every member row the uploads are saved to ``INDIV_DIR`` as
    ``input_<name>.json`` and ``images_<name>.zip``. The JSON files are merged
    into ``merged_input.json`` and the images into ``merged_images.zip``, both
    in ``MERGER_DIR``. Progress and errors are shown in ``out_box``; no
    exception leaves this callback. The argument (the clicked button) is
    ignored.
    """
    with out_box:
        clear_output()
        try:
            if not member_rows:
                raise RuntimeError("Set team size and fill member sections first.")

            saved_jsons, saved_zips = [], []
            used_names = set()

            for entry in member_rows:
                full_name = entry["name_widget"].value.strip()
                if not full_name:
                    raise RuntimeError("Each member must have a full name.")
                safe_full = _safe_name(full_name).replace(" ", "_")

                # Two members mapping to the same file name would overwrite
                # each other's uploads and merge one JSON twice. Compare
                # case-insensitively because some filesystems ignore case.
                name_key = safe_full.casefold()
                if name_key in used_names:
                    raise RuntimeError(
                        f"Duplicate member name: {full_name}. Each member needs a distinct name."
                    )
                used_names.add(name_key)

                json_upl, zip_upl = entry["json_upl"], entry["zip_upl"]
                if not json_upl.value:
                    raise RuntimeError(f"No JSON uploaded for {full_name}.")
                if not zip_upl.value:
                    raise RuntimeError(f"No ZIP uploaded for {full_name}.")

                json_path = INDIV_DIR / f"input_{safe_full}.json"
                zip_path = INDIV_DIR / f"images_{safe_full}.zip"
                _save_fileupload_single(json_upl, json_path)
                _save_fileupload_single(zip_upl, zip_path)
                saved_jsons.append(json_path)
                saved_zips.append(zip_path)

            task_lists = [_load_ls_array(p) for p in saved_jsons]
            merged_tasks = _merge_ls_tasks(task_lists)
            merged_input_path = MERGER_DIR / "merged_input.json"
            with merged_input_path.open("w", encoding="utf-8") as f:
                json.dump(merged_tasks, f, ensure_ascii=False, indent=2)

            merged_zip_path = MERGER_DIR / "merged_images.zip"
            try:
                written, skipped = _extract_and_flatten_zips(saved_zips, IMG_WORK)
                _zip_dir(IMG_WORK, merged_zip_path)
            finally:
                # Remove the temporary folder whether or not the merge worked.
                shutil.rmtree(IMG_WORK, ignore_errors=True)

            display(Markdown("**Merge complete.**"))
            print(f"Saved individuals in: {INDIV_DIR}")
            print(f"Merged JSON: {merged_input_path}")
            print(f"Merged images ZIP: {merged_zip_path}")
            print(f"Images written: {written} | Duplicates skipped: {skipped}")
            print(f"Merged tasks: {len(merged_tasks)}")

        except Exception as exc:
            # Top-level UI boundary: report the problem in the output panel.
            display(Markdown(f"**Error:** {exc}"))


run_btn.on_click(_run_merge)

## Next Step

Once `merged_input.json` and `merged_images.zip` have been generated in the `Merger/` directory, proceed to the conversion and validation notebook (`LS2COCO.ipynb`), which converts the merged Label Studio JSON export into COCO format, auto-matches image files, enriches attributes, and prepares the dataset for training.