In [1]:
from __future__ import annotations

import argparse
import concurrent.futures as futures
import hashlib
import os
import re
import shutil
import sys
import threading
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Tuple, Optional, Union

In [2]:
# Optional EXIF readers. Each flag records whether the import succeeded so that
# extract_capture_datetime() can skip a route whose library is unavailable.
try:
    from PIL import Image  # type: ignore
    PIL_OK = True
except Exception:
    PIL_OK = False

try:
    import piexif  # type: ignore
    PIEXIF_OK = True
except Exception:
    PIEXIF_OK = False

# Patterns and defaults
# NOTE: despite the "GLOB" name this is a compiled regular expression. It matches
# a whole directory name of the form "SM_" + one or more ASCII letters/digits,
# case-insensitively.
SM_DIR_GLOB = re.compile(r"^SM_[A-Za-z0-9]+$", re.IGNORECASE)

# Locks and global counter
progress_lock = threading.Lock()  # guards the `done` progress counter in main()
dest_lock = threading.Lock()      # held in main() while reserving a destination path and copying/moving
counter_lock = threading.Lock()   # guards _global_counter in get_next_counter()
_global_counter = 0  # will be incremented to produce 000001, 000002, ...

In [3]:
def ts() -> str:
    """Return the current local time as a bracketed log prefix, e.g. ``[20251102 02:58:23]``."""
    stamp = datetime.now().strftime("%Y%m%d %H:%M:%S")
    return "[" + stamp + "]"

def safe_makedirs(path: str) -> None:
    """Create directory *path* (and missing parents); an existing directory is not an error."""
    os.makedirs(name=path, exist_ok=True)

def is_wanted_ext(path: str, allowed_exts: set[str]) -> bool:
    """Tell whether *path*'s extension (lower-cased, with leading dot) is in *allowed_exts*."""
    _, extension = os.path.splitext(path)
    return extension.lower() in allowed_exts

def parse_exif_to_datepart(raw: Union[str, bytes, None]) -> Optional[str]:
    """
    Normalise an EXIF-like date string so its date part reads 'YYYY-MM-DD'.

    Accepts 'YYYY:MM:DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS', 'YYYY/MM/DD HH:MM:SS'
    and their date-only forms, as ``str`` or ``bytes``. Only the separators of the
    leading date are rewritten; the time part (and anything after it) is returned
    untouched so that 'HH:MM:SS' stays parseable.

    Returns None for empty/blank input and for values that are neither ``str`` nor
    ``bytes``. The result is still just text: callers are expected to try
    ``datetime.strptime`` on it.
    """
    if not raw:
        return None
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8", "ignore")
    if not isinstance(raw, str):
        # Some EXIF readers hand back ints/tuples for malformed tags.
        return None
    # EXIF ASCII fields are NUL-terminated and sometimes NUL/space padded.
    raw = raw.strip().strip("\x00 \t\r\n\x0b\x0c")
    if not raw:
        return None

    # Rewrite only the separators of a leading Y<sep>M<sep>D date. Replacing
    # "the first two colons" blindly would corrupt the time when the date
    # already uses '-' or '/'.
    m = re.match(r"(\d{4})[:/-](\d{1,2})[:/-](\d{1,2})", raw)
    if m:
        return f"{m.group(1)}-{m.group(2)}-{m.group(3)}{raw[m.end():]}"

    # Unrecognised layout: keep the historical best-effort substitution.
    return raw.replace(":", "-", 2).replace("/", "-")

def _parse_exif_datetime(raw: Union[str, bytes, None]) -> Optional[datetime]:
    """Parse one raw EXIF date value into a naive datetime; None if it cannot be parsed."""
    try:
        txt = parse_exif_to_datepart(raw)
    except Exception:
        # A malformed tag value must not prevent the remaining tags from being tried.
        return None
    if not txt:
        return None
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y:%m:%d %H:%M:%S", "%Y:%m:%d"):
        try:
            return datetime.strptime(txt, fmt)
        except (ValueError, TypeError):
            continue
    return None


def extract_capture_datetime(path: str) -> Tuple[Optional[datetime], str]:
    """
    Determine when the image at *path* was captured.

    Sources are tried in this order and the first parseable value wins:
      1. Pillow (if importable): DateTimeOriginal (36867), DateTime (306, labelled
         "EXIF:ModifyDate"), DateTimeDigitized (36868).
      2. piexif (if importable): the same three tags.
      3. The file's modification time.

    Returns ``(datetime_obj, source_label)``. The datetime is naive. If even the
    filesystem lookup fails the result is ``(None, "unknown")``.
    """
    # PIL route
    if PIL_OK:
        try:
            with Image.open(path) as im:
                exif = im.getexif()
                if exif:
                    # Pillow >= 8.2 keeps the Exif sub-IFD (pointer tag 0x8769) separate
                    # from the top-level IFD0 mapping, so DateTimeOriginal/Digitized are
                    # only reachable through get_ifd(). Older versions flatten them into
                    # the top level and may lack get_ifd() altogether.
                    try:
                        exif_ifd = exif.get_ifd(0x8769) or {}
                    except Exception:
                        exif_ifd = {}
                    for tag, label in ((36867, "EXIF:DateTimeOriginal"),
                                       (306, "EXIF:ModifyDate"),
                                       (36868, "EXIF:DateTimeDigitized")):
                        raw = exif.get(tag) or exif_ifd.get(tag)
                        dt = _parse_exif_datetime(raw)
                        if dt is not None:
                            return dt, label
        except Exception:
            # Unreadable / non-image file: fall through to the next route.
            pass

    # piexif route
    if PIEXIF_OK:
        try:
            exif_dict = piexif.load(path)
            for ifd, tag, label in (("Exif", piexif.ExifIFD.DateTimeOriginal, "piexif:DateTimeOriginal"),
                                    ("0th", piexif.ImageIFD.DateTime, "piexif:DateTime"),
                                    ("Exif", piexif.ExifIFD.DateTimeDigitized, "piexif:DateTimeDigitized")):
                try:
                    raw = (exif_dict.get(ifd) or {}).get(tag)
                except Exception:
                    raw = None
                dt = _parse_exif_datetime(raw)
                if dt is not None:
                    return dt, label
        except Exception:
            # piexif could not read the file: fall through to the mtime fallback.
            pass

    # Filesystem mtime fallback
    try:
        mtime = os.path.getmtime(path)
        return datetime.fromtimestamp(mtime), "FS:mtime"
    except (OSError, OverflowError, ValueError):
        return None, "unknown"

def round_to_nearest_minute(dt: datetime) -> datetime:
    """
    Round *dt* to the nearest whole minute, with exact half minutes rounding up.

    This mirrors the spreadsheet expression ROUND(x*24*60)/(24*60): Excel's ROUND
    rounds halves away from zero, whereas Python's built-in ``round`` rounds halves
    to the nearest even number. The computation is done with datetime arithmetic
    only, so the result does not depend on the machine's timezone or DST rules,
    and any ``tzinfo`` on *dt* is carried over unchanged.
    """
    # Everything below the minute boundary.
    remainder = timedelta(seconds=dt.second, microseconds=dt.microsecond)
    rounded = dt - remainder
    if remainder >= timedelta(seconds=30):
        rounded += timedelta(minutes=1)
    return rounded

def apply_excel_like_offset(orig_dt: datetime, sm_bucket: str) -> datetime:
    """
    Shift *orig_dt* by the per-bucket correction and round to the nearest minute.

    The corrections (expressed in days) reproduce this spreadsheet formula:
    IF(A2="SM_1",0.05833333,
       IF(A2="SM_2",IF(YEAR(D2)=2017,1866.502083,0),
       IF(A2="SM_3",IF(YEAR(D2)=2017,1866.606944,0),
       IF(A2="SM_4",IF(YEAR(D2)=2017,1866.636111,0),
       IF(A2="SM_5",IF(YEAR(D2)=2017,1866.634028,0),0)))))

    The bucket name is compared case-insensitively; an unknown or empty bucket
    gets no shift. SM_1 is always shifted, SM_2..SM_5 only when the original
    year is 2017.
    """
    # Corrections that apply only to timestamps whose year is 2017.
    only_in_2017 = {
        "SM_2": 1866.502083,
        "SM_3": 1866.606944,
        "SM_4": 1866.636111,
        "SM_5": 1866.634028,
    }
    key = (sm_bucket or "").upper()
    year = orig_dt.year

    if key == "SM_1":
        days = 0.05833333
    elif key in only_in_2017 and year == 2017:
        days = only_in_2017[key]
    else:
        days = 0.0

    shifted = orig_dt + timedelta(days=days)
    # The spreadsheet rounds the result to whole minutes.
    return round_to_nearest_minute(shifted)

def short_hash_from_meta(path: str, n: int = 6) -> str:
    """
    Return the first *n* hex characters of a SHA-1 over "<path>|<size>|<mtime>".

    Size and mtime are read from the filesystem; if either lookup fails both are
    taken as 0. This is a cheap disambiguator only (the file content is not read).
    """
    try:
        stat_pair = (os.path.getsize(path), os.path.getmtime(path))
    except Exception:
        stat_pair = (0, 0)
    size, mtime = stat_pair
    fingerprint = f"{path}|{size}|{mtime}"
    digest = hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()
    return digest[:n]

def sha1_of_file(path: str, blocksize: int = 1 << 20) -> str:
    """Return the hex SHA-1 of the file at *path*, reading it in *blocksize*-byte chunks."""
    digest = hashlib.sha1()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest()

def get_next_counter() -> str:
    """
    Increment the module-wide counter under ``counter_lock`` and return the new
    value as a zero-padded string of at least six digits ("000001", "000002", ...).
    """
    global _global_counter
    with counter_lock:
        issued = _global_counter + 1
        _global_counter = issued
    # `issued` is a local snapshot, so formatting can happen outside the lock.
    return f"{issued:06d}"

def build_dest_filename(src: str, dt: datetime, sm_bucket: str, counter_str: str, mode: str = "rich") -> str:
    """
    Build the destination file name (name only, no directory) for *src*.

    *sm_bucket* is used verbatim as the prefix; no additional "SM_" is added.

    - mode == "rich" (default): <bucket>_<original_stem>_<YYYYMMDD_HHMMSS>__<counter><ext>,
      where the timestamp is taken from *dt*.
    - mode == "simple":         <bucket>_<original_stem>__<counter><ext>
      (*dt* is not used).

    *counter_str* is inserted as given; callers pass a zero-padded running number.
    In both modes the original extension stays at the very end of the name.
    """
    p = Path(src)
    if mode == "simple":
        # The counter goes before the suffix; appending it after p.name would
        # produce names such as "IMG_0141.JPG__000001" with no usable extension.
        return f"{sm_bucket}_{p.stem}__{counter_str}{p.suffix}"
    t = dt.strftime("%Y%m%d_%H%M%S")
    return f"{sm_bucket}_{p.stem}_{t}__{counter_str}{p.suffix}"

def unique_dest_path(dest_path: str) -> str:
    """
    Return *dest_path* if nothing exists there yet; otherwise return the first of
    "<root>_1<ext>", "<root>_2<ext>", ... that does not exist.

    The check is a plain existence test, so callers that need this to be race-free
    must serialise the call together with the subsequent write.
    """
    if not os.path.exists(dest_path):
        return dest_path

    stem, suffix = os.path.splitext(dest_path)
    n = 1
    candidate = f"{stem}_{n}{suffix}"
    while os.path.exists(candidate):
        n += 1
        candidate = f"{stem}_{n}{suffix}"
    return candidate

def discover_jobs(in_dir: str, allowed_exts: set[str], only_sm: bool, excludes: set[str]) -> List[Tuple[str, str]]:
    """
    Walk *in_dir* and return a list of ``(abs_path, sm_bucket)`` for every file whose
    extension is in *allowed_exts*.

    - sm_bucket is the OUTERMOST ancestor directory (the one closest to *in_dir*)
      whose name matches SM_DIR_GLOB; files with no such ancestor get 'SM_Unknown'.
    - If only_sm=True, files without a matching ancestor are left out.
    - Any directory whose name is in *excludes* is skipped together with its whole
      subtree (exact, case-sensitive name comparison).

    The order of the result follows os.walk and is therefore filesystem-dependent.
    """
    jobs: List[Tuple[str, str]] = []
    in_dir_abs = os.path.abspath(in_dir)

    for root, dirs, files in os.walk(in_dir_abs):
        # Prune excluded directories in place so os.walk never descends into them.
        # The outcome matches filtering on path components afterwards, without
        # listing subtrees that would be thrown away anyway.
        if excludes:
            dirs[:] = [d for d in dirs if d not in excludes]

        rel = os.path.relpath(root, in_dir_abs)
        parts = [] if rel == "." else rel.split(os.sep)

        # First match while scanning from in_dir downwards == outermost SM_* directory.
        sm_top = next((p for p in parts if SM_DIR_GLOB.match(p)), None)

        if only_sm and not sm_top:
            continue

        bucket = sm_top or "SM_Unknown"
        jobs.extend(
            (os.path.join(root, fn), bucket)
            for fn in files
            if is_wanted_ext(fn, allowed_exts)
        )
    return jobs

In [4]:
def main():
    """
    Command-line entry point: copy or move images into
    ``<outDir>/<bucket>/<YYYYMMDD>/`` with names derived from the corrected
    capture time.

    Steps:
      1. Parse arguments from ``sys.argv``.
      2. Discover candidate files under ``--inDir``; anything already located
         inside ``--outDir`` is left out so a re-run never re-ingests its own output.
      3. For each file (on a thread pool): read the capture time, apply the
         bucket offset and minute rounding, build the destination name, and
         copy/move it.

    ``--dry-run`` performs no filesystem writes at all (no directories are created).
    ``--dedupe`` skips a file when a file with identical content (SHA-1) already
    exists anywhere under ``--outDir`` or was already written during this run.
    """
    parser = argparse.ArgumentParser(
        description="Copy/move images into a single output folder using EXIF date/time with Excel-style corrections."
    )
    parser.add_argument("--inDir", required=True, help="Input root (e.g., '.' if you're already inside 'Camera Trap Photos')")
    parser.add_argument("--outDir", required=True, help="Output root (images are written to <outDir>/<bucket>/<YYYYMMDD>/)")
    parser.add_argument("--workers", type=int, default=2, help="Max worker threads (1-8 recommended). Default: 2")
    parser.add_argument("--dry-run", action="store_true", help="Show actions without copying/moving")
    parser.add_argument("--move", action="store_true", help="Move files instead of copying")
    parser.add_argument("--only-sm", action="store_true", help="Restrict to files under folders named like SM_*")
    parser.add_argument("--exts", default="jpg,jpeg", help="Comma-separated list of extensions (default: jpg,jpeg)")
    parser.add_argument("--exclude", action="append", default=[], help="Directory name to exclude (can be repeated)")
    parser.add_argument("--list-sample", type=int, default=10, help="How many sample files to print before processing")
    parser.add_argument("--rename", choices=["rich", "simple"], default="rich",
                        help="Filename mode: 'rich' = SM_bucket + stem + timestamp + counter (default), 'simple' = keep original name + counter")
    parser.add_argument("--dedupe", action="store_true",
                        help="Skip copying if an identical file already exists under the output root (content SHA-1; slower)")
    parser.add_argument("--quiet", action="store_true", help="Reduce per-file logging")
    args = parser.parse_args()

    workers = max(1, min(8, args.workers))
    in_dir = args.inDir
    out_dir = args.outDir
    rename_mode = args.rename
    do_move = args.move
    do_dedupe = args.dedupe
    dry_run = args.dry_run
    quiet = args.quiet

    # Normalize extensions to lower-case with exactly one leading dot.
    exts = {("." + e.strip().lower().lstrip(".")) for e in args.exts.split(",") if e.strip()}
    excludes = set(args.exclude or [])

    start = time.perf_counter()

    jobs = discover_jobs(in_dir, exts, args.only_sm, excludes)

    # The output root may live inside the input root. Files already written there
    # by an earlier run must not be picked up as new inputs.
    out_abs = os.path.normcase(os.path.abspath(out_dir))

    def inside_out_dir(path: str) -> bool:
        candidate = os.path.normcase(os.path.abspath(path))
        try:
            return os.path.commonpath([candidate, out_abs]) == out_abs
        except ValueError:
            # Raised e.g. for paths on different drives: cannot be nested.
            return False

    discovered = len(jobs)
    jobs = [job for job in jobs if not inside_out_dir(job[0])]
    skipped_outputs = discovered - len(jobs)
    total = len(jobs)

    print(f"{ts()} Scan summary:")
    print(f"{ts()}   inDir: {os.path.abspath(in_dir)}")
    print(f"{ts()}   outDir: {os.path.abspath(out_dir)}")
    print(f"{ts()}   only-sm: {args.only_sm}")
    print(f"{ts()}   allowed extensions: {sorted(exts)}")
    if excludes:
        print(f"{ts()}   excludes: {sorted(excludes)}")
    if skipped_outputs:
        print(f"{ts()}   ignored (already inside outDir): {skipped_outputs}")
    print(f"{ts()}   found files: {total}")

    for i, (p, sm) in enumerate(jobs[:max(0, args.list_sample)]):
        print(f"{ts()}   sample[{i+1}]: {p} (bucket: {sm})")

    if total == 0:
        print(f"{ts()} No files found matching extensions under the given rules.")
        return

    # Content hashes of everything already present under out_dir, built once up
    # front. Files land in <out_dir>/<bucket>/<date>/, so the whole tree is
    # indexed rather than just the top level. Only read/updated under dest_lock
    # once the workers are running.
    seen_hashes = set()
    if do_dedupe and not dry_run and os.path.isdir(out_dir):
        for root, _dirs, files in os.walk(out_dir):
            for fn in files:
                try:
                    seen_hashes.add(sha1_of_file(os.path.join(root, fn)))
                except OSError:
                    # Unreadable existing file: it simply cannot serve as a dedupe match.
                    pass

    # Ensure the output root exists (a dry run must not touch the filesystem).
    if not dry_run:
        safe_makedirs(out_dir)

    done = 0

    def report_progress() -> None:
        """Count one finished job and print progress according to --quiet."""
        nonlocal done
        with progress_lock:
            done += 1
            pct = int(done * 100 / total)
            if not quiet or done % 100 == 0 or done == total:
                print(f"{ts()} progress: {done}/{total} ({pct}%)")

    def process_one(job: Tuple[str, str]) -> None:
        src, sm_top = job

        if not quiet:
            print(f"{ts()} found: {src}")

        orig_dt, source = extract_capture_datetime(src)
        if not orig_dt:
            print(f"{ts()} WARNING: could not extract any datetime, skipping: {src}")
            report_progress()
            return

        if not quiet:
            print(f"{ts()} metadata: {orig_dt.isoformat(sep=' ')} ({source})")

        # Apply Excel-like offset and rounding to minute
        try:
            adjusted_dt = apply_excel_like_offset(orig_dt, sm_top)
        except Exception as e:
            print(f"{ts()} ERROR applying offset for {src}: {e}")
            adjusted_dt = orig_dt

        if not quiet:
            print(f"{ts()} adjusted datetime: {adjusted_dt.isoformat(sep=' ')}")

        # Hash the source outside the lock; the membership test happens under it.
        src_hash = None
        if do_dedupe and not dry_run:
            try:
                src_hash = sha1_of_file(src)
            except OSError as e:
                # Proceed without dedupe for this file if hashing fails.
                print(f"{ts()} WARNING: could not hash '{src}' for dedupe, processing anyway: {e}")

        # Reserve counter and build destination filename
        counter_str = get_next_counter()
        base_name = build_dest_filename(src, adjusted_dt, sm_top, counter_str, mode=rename_mode)

        # Destination layout: <out_dir>/<bucket>/<YYYYMMDD>/<file>
        date_folder = adjusted_dt.strftime("%Y%m%d")
        dest_subdir = os.path.join(out_dir, sm_top, date_folder)
        dest = os.path.join(dest_subdir, base_name)

        # Dedupe check, path reservation and the copy/move itself happen under one
        # lock so two workers can neither pick the same path nor both write the
        # same content.
        with dest_lock:
            if src_hash is not None and src_hash in seen_hashes:
                if not quiet:
                    print(f"{ts()} duplicate detected (content match), skipping: {src}")
            else:
                dest_final = unique_dest_path(dest)
                if not quiet:
                    verb = "moving" if do_move else "copying"
                    print(f"{ts()} {verb} to: {dest_final}")
                if not dry_run:
                    try:
                        safe_makedirs(dest_subdir)
                        if do_move:
                            shutil.move(src, dest_final)
                        else:
                            shutil.copy2(src, dest_final)
                    except Exception as e:
                        # Best effort per file: report and carry on with the rest.
                        print(f"{ts()} ERROR {'moving' if do_move else 'copying'} '{src}' -> '{dest_final}': {e}")
                    else:
                        # Record the hash only once the file really exists in out_dir.
                        if src_hash is not None:
                            seen_hashes.add(src_hash)

        report_progress()

    print(f"{ts()} Starting with {workers} worker(s).")
    with futures.ThreadPoolExecutor(max_workers=workers) as ex:
        futs = [ex.submit(process_one, job) for job in jobs]
        for f in futures.as_completed(futs):
            exc = f.exception()
            if exc:
                print(f"{ts()} ERROR: {exc}", file=sys.stderr)

    elapsed = time.perf_counter() - start
    hrs = int(elapsed // 3600)
    mins = int((elapsed % 3600) // 60)
    secs = int(elapsed % 60)
    print(f"{ts()} DONE. Total time spent: {hrs:02d}:{mins:02d}:{secs:02d}")

In [5]:
!python -c "import os; print('cwd=', os.getcwd()); print('abs inDir=', os.path.abspath('.'))"


cwd= /home/hice1/ssinha348
abs inDir= /home/hice1/ssinha348


In [6]:
# Notebook driver: main() parses sys.argv, so the command line is emulated here.
# The first element stands in for the program name.
# NOTE(review): outDir lives inside inDir. When re-running, consider adding
# "--exclude", "Processed_Images" so earlier outputs are not scanned as inputs.
sys.argv = [
    "Images_directories_correction_with_capture_time_and_unique_ID_v4.ipynb",
    "--inDir", "/home/hice1/ssinha348/scratch/stonemt_cameratrap/Camera Trap Photos",
    "--outDir", "/home/hice1/ssinha348/scratch/stonemt_cameratrap/Camera Trap Photos/Processed_Images",
    "--workers", "2",
#     "--move",   (disabled: files are copied, sources stay in place)
    "--rename", "rich",
    "--exts", "jpg,jpeg",
    "--quiet"
]
main()

[20251102 02:58:23] Scan summary:
[20251102 02:58:23]   inDir: /home/hice1/ssinha348/scratch/stonemt_cameratrap/Camera Trap Photos
[20251102 02:58:23]   outDir: /home/hice1/ssinha348/scratch/stonemt_cameratrap/Camera Trap Photos/Processed_Images
[20251102 02:58:23]   only-sm: False
[20251102 02:58:23]   allowed extensions: ['.jpeg', '.jpg']
[20251102 02:58:23]   found files: 56472
[20251102 02:58:23]   sample[1]: /home/hice1/ssinha348/scratch/stonemt_cameratrap/Camera Trap Photos/SM_1/2022/Jul 27/IMG_0141.JPG (bucket: SM_1)
[20251102 02:58:23]   sample[2]: /home/hice1/ssinha348/scratch/stonemt_cameratrap/Camera Trap Photos/SM_1/2022/Jul 27/IMG_0792.JPG (bucket: SM_1)
[20251102 02:58:23]   sample[3]: /home/hice1/ssinha348/scratch/stonemt_cameratrap/Camera Trap Photos/SM_1/2022/Jul 27/IMG_1029.JPG (bucket: SM_1)
[20251102 02:58:23]   sample[4]: /home/hice1/ssinha348/scratch/stonemt_cameratrap/Camera Trap Photos/SM_1/2022/Jul 27/IMG_0872.JPG (bucket: SM_1)
[20251102 02:58:23]   sample[5]:

[20251102 03:02:18] progress: 13600/56472 (24%)
[20251102 03:02:20] progress: 13700/56472 (24%)
[20251102 03:02:22] progress: 13800/56472 (24%)
[20251102 03:02:23] progress: 13900/56472 (24%)
[20251102 03:02:25] progress: 14000/56472 (24%)
[20251102 03:02:28] progress: 14100/56472 (24%)
[20251102 03:02:30] progress: 14200/56472 (25%)
[20251102 03:02:32] progress: 14300/56472 (25%)
[20251102 03:02:34] progress: 14400/56472 (25%)
[20251102 03:02:36] progress: 14500/56472 (25%)
[20251102 03:02:38] progress: 14600/56472 (25%)
[20251102 03:02:40] progress: 14700/56472 (26%)
[20251102 03:02:42] progress: 14800/56472 (26%)
[20251102 03:02:44] progress: 14900/56472 (26%)
[20251102 03:02:46] progress: 15000/56472 (26%)
[20251102 03:02:48] progress: 15100/56472 (26%)
[20251102 03:02:50] progress: 15200/56472 (26%)
[20251102 03:02:51] progress: 15300/56472 (27%)
[20251102 03:02:53] progress: 15400/56472 (27%)
[20251102 03:02:55] progress: 15500/56472 (27%)
[20251102 03:02:57] progress: 15600/5647

[20251102 03:08:42] progress: 30700/56472 (54%)
[20251102 03:08:44] progress: 30800/56472 (54%)
[20251102 03:08:47] progress: 30900/56472 (54%)
[20251102 03:08:50] progress: 31000/56472 (54%)
[20251102 03:08:53] progress: 31100/56472 (55%)
[20251102 03:08:56] progress: 31200/56472 (55%)
[20251102 03:08:58] progress: 31300/56472 (55%)
[20251102 03:09:01] progress: 31400/56472 (55%)
[20251102 03:09:04] progress: 31500/56472 (55%)
[20251102 03:09:07] progress: 31600/56472 (55%)
[20251102 03:09:10] progress: 31700/56472 (56%)
[20251102 03:09:12] progress: 31800/56472 (56%)
[20251102 03:09:15] progress: 31900/56472 (56%)
[20251102 03:09:18] progress: 32000/56472 (56%)
[20251102 03:09:20] progress: 32100/56472 (56%)
[20251102 03:09:23] progress: 32200/56472 (57%)
[20251102 03:09:26] progress: 32300/56472 (57%)
[20251102 03:09:28] progress: 32400/56472 (57%)
[20251102 03:09:31] progress: 32500/56472 (57%)
[20251102 03:09:34] progress: 32600/56472 (57%)
[20251102 03:09:36] progress: 32700/5647

[20251102 03:16:30] progress: 47800/56472 (84%)
[20251102 03:16:33] progress: 47900/56472 (84%)
[20251102 03:16:36] progress: 48000/56472 (84%)
[20251102 03:16:39] progress: 48100/56472 (85%)
[20251102 03:16:42] progress: 48200/56472 (85%)
[20251102 03:16:44] progress: 48300/56472 (85%)
[20251102 03:16:46] progress: 48400/56472 (85%)
[20251102 03:16:48] progress: 48500/56472 (85%)
[20251102 03:16:50] progress: 48600/56472 (86%)
[20251102 03:16:52] progress: 48700/56472 (86%)
[20251102 03:16:54] progress: 48800/56472 (86%)
[20251102 03:16:56] progress: 48900/56472 (86%)
[20251102 03:16:58] progress: 49000/56472 (86%)
[20251102 03:17:01] progress: 49100/56472 (86%)
[20251102 03:17:04] progress: 49200/56472 (87%)
[20251102 03:17:07] progress: 49300/56472 (87%)
[20251102 03:17:10] progress: 49400/56472 (87%)
[20251102 03:17:13] progress: 49500/56472 (87%)
[20251102 03:17:16] progress: 49600/56472 (87%)
[20251102 03:17:19] progress: 49700/56472 (88%)
[20251102 03:17:22] progress: 49800/5647