# In [None]:
# Date source: EXIF capture date (for photos), falling back to the file's modified date (for videos and other files).

#Duplicate Detection:

#First check (file size + capture date)

#If same → confirm with SHA256 hash

#If hash matches → duplicate → delete.

#If hash differs → keep both (different content).

#Organizes into folders: Year/Month structure.

# Files are moved (not copied), so the originals no longer remain in the source folder.

#Optional CSV log with actions taken.


import os
import shutil
import csv
import calendar
import hashlib
from datetime import datetime
from PIL import Image
from PIL.ExifTags import TAGS

def get_media_date(file_path):
    """Return the capture date of a media file as a naive ``datetime``.

    The EXIF ``DateTimeOriginal`` tag is used when the file is an image that
    carries one. In every other case (videos, images without EXIF, files that
    cannot be opened as an image, malformed date strings) the file's
    last-modified time is returned instead.

    Args:
        file_path: Path of the media file to inspect.

    Returns:
        A ``datetime`` parsed from EXIF, or built from the file's mtime.

    Raises:
        OSError: If the fallback cannot read the file's modified time
            (for example, the file does not exist).
    """
    try:
        # Image.open keeps the underlying file open; the context manager
        # releases the handle before the caller moves or deletes the file.
        with Image.open(file_path) as img:
            exif_data = img._getexif()
        if exif_data:
            for tag, value in exif_data.items():
                if TAGS.get(tag) == "DateTimeOriginal":
                    return datetime.strptime(value, "%Y:%m:%d %H:%M:%S")
    except Exception:
        # Deliberate best effort: any failure to read or parse EXIF simply
        # means "use the modified time" rather than aborting the whole run.
        pass
    return datetime.fromtimestamp(os.path.getmtime(file_path))


def file_hash(file_path, chunk_size=8192):
    """Return the SHA256 hex digest of a file's contents.

    The file is read in pieces of ``chunk_size`` bytes so that large media
    files never have to be held in memory at once.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()


def _resolve_destination(src_path, dest_folder, filename, size, digest):
    """Pick the path inside dest_folder where src_path should end up.

    Returns a ``(dest_path, status)`` pair, where status is one of:
      "in_place"  - src_path already is the file at dest_path
      "duplicate" - a different file with identical content sits at dest_path
      "move"      - dest_path is unused and src_path can be moved there
    """
    stem, ext = os.path.splitext(filename)
    candidate = os.path.join(dest_folder, filename)
    counter = 0
    # lexists (not exists) so a dangling symlink also counts as "name taken".
    while os.path.lexists(candidate):
        if os.path.isfile(candidate):
            if os.path.samefile(candidate, src_path):
                return candidate, "in_place"
            # Cheap size comparison first; hash only when sizes agree.
            if os.path.getsize(candidate) == size and file_hash(candidate) == digest:
                return candidate, "duplicate"
        # Name is taken by different content: try name_1.ext, name_2.ext, ...
        counter += 1
        candidate = os.path.join(dest_folder, f"{stem}_{counter}{ext}")
    return candidate, "move"


def organize_media_with_hash(source_dir, destination_dir, log_file=None):
    """Move media files into ``destination_dir/<Year>/<MM_MonthName>/`` and drop duplicates.

    Walks ``source_dir`` recursively (skipping well-known system folders),
    and for every file with a known image/video extension:

    * determines its capture date via ``get_media_date``;
    * treats it as a duplicate, and deletes it, only when a file with the
      same size, same capture date and same SHA256 hash is already kept;
    * otherwise moves it into the Year/Month folder for its capture date.
      If the target name is already taken by a file with different content,
      the new file is stored as ``name_1.ext``, ``name_2.ext``, ... so both
      are kept; if the file already there has identical content the source
      is removed as a duplicate.

    It is safe for ``destination_dir`` to be inside (or equal to)
    ``source_dir``: a kept copy that the walk reaches again is recognised
    as the same file and left untouched.

    Args:
        source_dir: Directory tree to scan.
        destination_dir: Root of the organized output; created if missing.
        log_file: Optional path of a CSV file listing every move/removal.

    Returns:
        None. A summary is printed; per-file failures are printed and the
        run continues with the next file.
    """
    extensions = (
        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp',  # Images
        '.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv', '.webm', '.mt4'  # Videos + mt4
    )
    system_markers = (
        'windows', 'program files', 'appdata', '$recycle.bin', 'system volume information'
    )

    os.makedirs(destination_dir, exist_ok=True)
    moved_count, duplicate_count = 0, 0
    # key=(size, capture_date) -> {sha256: path of the kept copy}. A dict per
    # key (rather than one hash) so several different files sharing size+date
    # are all remembered.
    seen_files = {}

    log_data = []

    for root, dirs, files in os.walk(source_dir):
        root_lower = root.lower()
        if any(marker in root_lower for marker in system_markers):
            # Every descendant path contains the same marker, so prune the
            # subtree instead of visiting and skipping each folder.
            dirs[:] = []
            continue

        for file in files:
            if not file.lower().endswith(extensions):
                continue
            src_path = os.path.join(root, file)

            try:
                size = os.path.getsize(src_path)
                capture_date = get_media_date(src_path)
                sig_key = (size, capture_date.strftime("%Y-%m-%d %H:%M:%S"))
                filehash = file_hash(src_path)
                kept_by_hash = seen_files.setdefault(sig_key, {})

                kept_path = kept_by_hash.get(filehash)
                if kept_path is not None and os.path.exists(kept_path):
                    if os.path.samefile(src_path, kept_path):
                        # The walk reached a copy we already placed; it is
                        # the kept file itself, not a duplicate of it.
                        continue
                    status = "duplicate"
                else:
                    # Organize into Year/Month folders
                    year, month = capture_date.year, capture_date.month
                    month_folder = f"{month:02d}_{calendar.month_name[month]}"
                    dest_folder = os.path.join(destination_dir, str(year), month_folder)
                    os.makedirs(dest_folder, exist_ok=True)

                    dest_path, status = _resolve_destination(
                        src_path, dest_folder, file, size, filehash
                    )
                    if status != "move":
                        # A copy with this content already lives at dest_path.
                        kept_by_hash[filehash] = dest_path

                if status == "in_place":
                    continue

                if status == "duplicate":
                    duplicate_count += 1
                    print(f"⏩ Duplicate removed: {src_path}")
                    os.remove(src_path)
                    if log_file:
                        log_data.append([src_path, "DUPLICATE - Removed", size, capture_date])
                    continue

                shutil.move(src_path, dest_path)
                moved_count += 1
                # Register only after the move succeeded, so a failed move
                # never causes a later copy to be deleted as its duplicate.
                kept_by_hash[filehash] = dest_path

                if log_file:
                    log_data.append([src_path, dest_path, size, capture_date])

            except Exception as e:
                print(f"❌ Failed to process {src_path}: {e}")

    if log_file:
        with open(log_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Original Path", "New Path / Status", "Size (bytes)", "Capture Date"])
            writer.writerows(log_data)

    print(f"\n✅ Total media files moved: {moved_count}")
    print(f"⚠️  Total duplicates removed: {duplicate_count}")
    print(f"📂 Organized media is in: {destination_dir}")
    if log_file:
        print(f"📝 Log file saved at: {log_file}")


# ------------------- MENU -------------------
if __name__ == "__main__":
    # Interactive entry point: ask for the folders and the log preference,
    # then run the organizer once.
    print("📦 Media Organizer with Duplicate Removal (Size + Date + Hash)\n")
    # Strip whitespace first: quotes around a pasted path are only removed by
    # strip('"') when they really are the first/last characters.
    source = input("Enter the SOURCE directory to scan: ").strip().strip('"')
    dest = input("Enter the DESTINATION directory to save organized files: ").strip().strip('"')

    # Fail early with a clear message instead of silently scanning nothing
    # or crashing inside os.makedirs("").
    if not os.path.isdir(source):
        raise SystemExit(f"❌ Source directory not found: {source}")
    if not dest:
        raise SystemExit("❌ No destination directory given.")

    log_choice = input("Do you want to generate a CSV log? (y/n): ").strip().lower()
    log_file = None
    if log_choice in ('y', 'yes'):
        log_file = os.path.join(dest, "media_log.csv")

    organize_media_with_hash(source, dest, log_file)
