STEP 1: 모든 파일 복사 및 이름 변경
python
코드 복사


# import

In [6]:
import os
import re
import json
import shutil
import hashlib
import datetime
import filetype
import subprocess
from pathlib import Path
from typing import Optional, Tuple

# image/video
import imghdr
import cv2
from PIL import Image
from tqdm import tqdm

EXIFTOOL_PATH = "./exiftool-13.11_64/exiftool.exe"


# Move all files to one folder

In [7]:
def copy_files_with_folder_info(input_folder: str, output_folder: str) -> list:
    """Copy every file under ``input_folder`` into the flat ``output_folder``.

    Each copy is named ``<stem>_folder-<relative dir><suffix>``, where the
    relative directory (relative to ``input_folder``) has its path separators
    replaced by ``-``. When that name is already taken, `` (1)``, `` (2)``, ...
    is appended to the same base stem until a free name is found.

    Args:
        input_folder: Root of the tree to copy from (walked recursively).
        output_folder: Destination folder; created if missing.
            NOTE(review): if this folder lives inside ``input_folder`` the
            walk will also see the copies - keep the two trees separate.

    Returns:
        Destination paths (as strings) of the files that were copied.
        Files that fail to copy are reported on stdout and left out.
    """
    copied_files = []
    destination_dir = Path(output_folder)
    os.makedirs(destination_dir, exist_ok=True)

    # Count files up front so the progress bar has a total.
    total_files = sum(len(files) for _, _, files in os.walk(input_folder))

    with tqdm(total=total_files, desc="Copying files", unit="file") as pbar:
        for root, _dirs, files in os.walk(input_folder):
            # Encode the source folder in the file name; it is loop-invariant
            # for all files of this directory.
            relative_path = os.path.relpath(root, input_folder)
            folder_info = relative_path.replace(os.sep, "-")

            for file in files:
                src_path = os.path.join(root, file)
                source_name = Path(file)
                base_stem = f"{source_name.stem}_folder-{folder_info}"
                suffix = source_name.suffix

                # Always derive candidates from the same base stem so the
                # counter does not pile up as "name (1) (2)".
                dest_path = destination_dir / f"{base_stem}{suffix}"
                counter = 1
                while dest_path.exists():
                    dest_path = destination_dir / f"{base_stem} ({counter}){suffix}"
                    counter += 1

                try:
                    shutil.copy2(src_path, dest_path)
                    copied_files.append(str(dest_path))
                except Exception as e:
                    print(f"[ERROR] Failed to copy {src_path} to {dest_path}: {e}")

                pbar.update(1)

    return copied_files


# Delete useless files (dot-prefixed files and 0-byte files)

In [8]:
def filter_files(directory: str):
    """Delete empty and dot-prefixed files directly inside ``directory``.

    Only the immediate children are inspected (no recursion); sub-directories
    are left alone. A short summary of processed / deleted / failed counts is
    printed at the end.

    Args:
        directory: Folder whose files should be cleaned up.
    """
    target_dir = Path(directory)

    if not target_dir.is_dir():
        print(f"[ERROR] {directory} is not a valid directory.")
        return

    counts = {"processed": 0, "deleted": 0, "failed": 0}

    for entry in target_dir.iterdir():
        try:
            if not entry.is_file():
                continue
            counts["processed"] += 1

            is_empty = entry.stat().st_size == 0
            if not (is_empty or entry.name.startswith(('.', '._'))):
                continue

            try:
                entry.unlink()
            except Exception as e:
                counts["failed"] += 1
                print(f"[ERROR] Failed to delete {entry}: {e}")
            else:
                counts["deleted"] += 1
                #print(f"[DELETE] Removed {entry}")
        except Exception as e:
            print(f"[ERROR] Failed to process {entry}: {e}")

    print("\nProcessing Summary:")
    print(f"Total files processed: {counts['processed']}")
    print(f"Files successfully deleted: {counts['deleted']}")
    print(f"Files failed to delete: {counts['failed']}")



# Fix image extension if mismatch

In [9]:

def fix_image_extension_if_mismatch(file_path: str) -> str:
    """Rename a file whose extension does not match its detected content type.

    Two checks run in order:

    1. A ``.jpg``/``.jpeg`` file that parses as UTF-8 JSON is renamed to
       ``.json`` (this check exists because ``filetype`` does not recognise
       JSON).
    2. Otherwise ``filetype.guess`` is consulted and, when the detected
       extension differs from the lower-cased current one, the file is
       renamed to the detected extension.

    When the target name already exists, `` (1)``, `` (2)``, ... is appended
    to the stem.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The new path if the file was renamed, otherwise ``file_path``
        unchanged (also on any error, which is reported on stdout).
    """

    def _unused_sibling(source: Path, extension: str) -> Path:
        # First free name next to `source` carrying `extension`.
        candidate = source.with_suffix(extension)
        attempt = 1
        while candidate.exists():
            candidate = source.with_name(f"{source.stem} ({attempt}){extension}")
            attempt += 1
        return candidate

    try:
        source = Path(file_path)
        current_ext = source.suffix.lower()

        # Step 1: a "JPEG" that is really a JSON document.
        if current_ext in (".jpg", ".jpeg"):
            try:
                with open(file_path, 'r', encoding='utf-8') as handle:
                    json.load(handle)
            except Exception:
                parses_as_json = False
            else:
                parses_as_json = True

            if parses_as_json:
                target = _unused_sibling(source, ".json")
                try:
                    source.rename(target)
                except Exception as e:
                    print(f"[ERROR] Rename failed (JPG->JSON): {file_path} -> {target}, {e}")
                    return file_path
                return str(target)

        # Step 2: content-based detection through filetype.
        kind = filetype.guess(file_path)
        if kind is None:
            # Unknown content: keep the extension as it is.
            return file_path

        detected_ext = f".{kind.extension}"
        if current_ext == detected_ext:
            return file_path

        target = _unused_sibling(source, detected_ext)
        try:
            source.rename(target)
        except Exception as e:
            print(f"[ERROR] Rename failed using filetype: {file_path} -> {target}, {e}")
            return file_path
        return str(target)

    except FileNotFoundError:
        print(f"[ERROR] File not found: {file_path}")
        return file_path
    except Exception as e:
        print(f"[ERROR] An error occurred with file (filetype): {file_path}, {e}")
        return file_path


# Get Date from Json


In [10]:
def get_datetime_from_json_sidecar(json_file_path: str) -> Optional[datetime.datetime]:
    """Read ``photoTakenTime.timestamp`` from a JSON sidecar file.

    Args:
        json_file_path: Path to the sidecar JSON document.

    Returns:
        The timestamp converted with ``datetime.fromtimestamp`` (a naive
        local-time value), or ``None`` when the field is missing/empty or the
        file cannot be read or parsed (the error is printed).
    """
    try:
        import json
        with open(json_file_path, "r", encoding="utf-8") as handle:
            sidecar = json.load(handle)

        taken_time = sidecar.get("photoTakenTime", {})
        raw_timestamp = taken_time.get("timestamp")
        if not raw_timestamp:
            return None
        return datetime.datetime.fromtimestamp(int(raw_timestamp))
    except Exception as e:
        print(f"[ERROR] get_datetime_from_json_sidecar: {e}")
        return None


# Date from file name

In [11]:

# File-name patterns, tried in priority order. Every capture group holds
# digits only, so the concatenated groups always read as YYYYMMDD or
# YYYYMMDDHHMMSS.
_FILENAME_DATETIME_PATTERNS = tuple(re.compile(pattern) for pattern in (
    # 1) Screenshot style: "YYYY-MM-DD HH.MM.SS"
    r"(\d{4})-(\d{2})-(\d{2})\s+(\d{2})\.(\d{2})\.(\d{2})",
    # 2) KakaoTalk_Photo style: "YYYY-MM-DD-HH-MM-SS"
    r"(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})",
    # 3) "YYYYMMDD_HHMMSS" (also covers "KakaoTalk_YYYYMMDD_HHMMSS")
    r"(\d{8})_(\d{6})",
    # 4) Documents / receipts: "YYYY-MM-DD_HHMMSS"
    r"(\d{4})-(\d{2})-(\d{2})_(\d{6})",
    # 5) Date only: "YYYY-MM-DD" or "YYYY_MM_DD"
    r"(\d{4})[-_](\d{2})[-_](\d{2})",
    # 6) Date only: "YYYYMMDD" (time is set to 00:00:00)
    r"(\d{4})(\d{2})(\d{2})",
))

# Last resort: the "_folder-YYYY" / "_folder-YYYYMM" tag embedded in the name.
_FOLDER_DATE_PATTERN = re.compile(r"_folder-(\d{4})(\d{2})?")


def _datetime_from_digits(digits: str) -> Optional[datetime.datetime]:
    """Build a datetime from 'YYYYMMDD' or 'YYYYMMDDHHMMSS'; None if invalid."""
    parts = [int(digits[0:4]), int(digits[4:6]), int(digits[6:8])]
    parts.extend(int(digits[i:i + 2]) for i in range(8, len(digits), 2))
    try:
        return datetime.datetime(*parts)
    except ValueError:
        return None


def extract_datetime_from_filename(filename: str, folder_path: str) -> Optional[datetime.datetime]:
    """Infer a date/time from a file name.

    The known name patterns are tried in priority order. A match that does
    not form a valid calendar date (for example an arbitrary 8-digit number)
    is skipped and the search continues, instead of aborting the whole
    lookup. As a last resort a ``_folder-YYYY`` or ``_folder-YYYYMM`` tag in
    the name yields the first day of that month (January when only the year
    is present or the month is not valid).

    Args:
        filename: File name to inspect.
        folder_path: Unused; kept so existing callers keep working.

    Returns:
        The inferred datetime, or ``None`` when nothing usable is found.
    """
    if not isinstance(filename, str):
        return None

    for pattern in _FILENAME_DATETIME_PATTERNS:
        for match in pattern.finditer(filename):
            parsed = _datetime_from_digits("".join(match.groups()))
            if parsed is not None:
                return parsed

    folder_match = _FOLDER_DATE_PATTERN.search(filename)
    if folder_match:
        year = int(folder_match.group(1))
        month_text = folder_match.group(2)
        # The day is unknown, so it is pinned to the 1st.
        for month in ((int(month_text), 1) if month_text else (1,)):
            try:
                return datetime.datetime(year, month, 1)
            except ValueError:
                continue

    return None


# Update metadata with datetime

In [12]:
def update_metadata_with_datetime(file_path: str, dt: datetime.datetime) -> Optional[str]:
    """Write ``dt`` into the file's date tags using ExifTool (``-AllDates``).

    The file is temporarily renamed to a random 8-character name in the same
    folder while ExifTool runs, then renamed back.
    NOTE(review): presumably this avoids ExifTool trouble with non-ASCII file
    names - confirm.

    Args:
        file_path: File to update. Backslashes are normalised to ``/``.
        dt: Date/time to write. ``None`` means "nothing to do".

    Returns:
        ``file_path`` unchanged when ``dt`` is ``None``; the normalised path
        when ExifTool succeeded; ``None`` when ExifTool reported an error or
        an exception occurred (the file is renamed back in both cases when
        possible).
    """
    # Local import: the name generator needs it and the surrounding cell
    # does not import a random-name helper.
    import uuid

    if dt is None:
        return file_path

    exiftool_datetime = dt.strftime("%Y:%m:%d %H:%M:%S")
    file_path_fixed = file_path.replace("\\", "/")
    original_filename = os.path.basename(file_path_fixed)
    original_dir = os.path.dirname(file_path_fixed)

    # Pick a temporary name that is not in use in the same directory.
    while True:
        temp_file_path = os.path.join(original_dir, uuid.uuid4().hex[:8])
        if not os.path.exists(temp_file_path):
            break

    try:
        os.rename(file_path_fixed, temp_file_path)
        cmd = [
            EXIFTOOL_PATH,
            f"-AllDates={exiftool_datetime}",
            "-overwrite_original",
            temp_file_path,
        ]
        # errors="replace" keeps undecodable ExifTool output from raising.
        proc = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
        # Restore the original name whether or not ExifTool succeeded.
        os.rename(temp_file_path, file_path_fixed)
    except Exception as e:
        print(f"{original_filename} 파일 처리 중 일반 에러 발생: {e}")
        # The file may still carry the temporary name; put it back if so.
        if os.path.exists(temp_file_path) and not os.path.exists(file_path_fixed):
            os.rename(temp_file_path, file_path_fixed)
        return None

    if proc.returncode != 0:
        print(f"{original_filename} 파일에서 ExifTool 에러 발생: {proc.stderr.strip()}")
        return None
    return file_path_fixed

# Copy all files to a single folder
You can skip this step; it only exists to keep your original files safe.

In [None]:
# Pause so the input/output folders configured below can be double-checked
# before any file is copied or modified.
input("Stop here and double-check your input/output dirs (press Enter)")

In [None]:
# Source tree to flatten and the folder that receives the copies.
# NOTE(review): machine-specific absolute paths - adjust before running.
input_folder = "G:/1BASE/2PHOTO"
output_folder = "G:/1BASE/2PHOTO Total"

In [13]:
# Flatten input_folder into output_folder; copied_files keeps the destination paths.
copied_files = copy_files_with_folder_info(input_folder, output_folder)

NameError: name 'input_folder' is not defined

# Remove 0-byte files

In [None]:
# Drop zero-byte and dot-prefixed files from the flattened folder.
filter_files(output_folder)

In [None]:
# 초기 통계 변수


# VALID_EXTENSIONS = {
#     # 이미지 파일 확장자
#     ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".heic", ".tiff", ".webp",
#     # 동영상 파일 확장자
#     ".mp4", ".mov", ".avi", ".mkv", ".mts", ".wmv", ".3gp", ".mpg", ".mpeg"
# }

# 처리 함수
def _has_embedded_capture_date(file_path: Path) -> bool:
    """Return True when ExifTool prints a DateTimeOriginal or MediaCreateDate value."""
    # One ExifTool launch for both tags; -s3 prints only the values, so any
    # non-empty output means at least one of the tags is present.
    cmd = [EXIFTOOL_PATH, "-DateTimeOriginal", "-MediaCreateDate", "-s3", str(file_path)]
    proc = subprocess.run(cmd, capture_output=True, text=True, errors='replace')
    return bool(proc.stdout.strip())


def _sidecar_candidates(current: Path, original: Path) -> list:
    """List the JSON sidecar paths that may belong to a media file.

    Covers ``<media>.json`` and the name a ``<media>.json`` sidecar receives
    once ``_folder-<info>`` has been inserted before its ``.json`` suffix
    (``IMG.jpg.json`` -> ``IMG.jpg_folder-<info>.json``), for both the
    current name and the name the file had before its extension was fixed.
    """
    candidates = []
    for media in (current, original):
        candidates.append(Path(str(media) + ".json"))
        base, marker, folder_info = media.stem.rpartition("_folder-")
        if marker:
            candidates.append(
                media.with_name(f"{base}{media.suffix}_folder-{folder_info}.json")
            )
    return candidates


def process_folder(folder_path: str):
    """Fix extensions and fill in missing capture dates for a folder tree.

    For every non-JSON file below ``folder_path``:

    1. the extension is corrected via ``fix_image_extension_if_mismatch``;
    2. files that already carry DateTimeOriginal or MediaCreateDate are
       left alone;
    3. otherwise a date is taken from the JSON sidecar if one is found,
       else from the file name, and written with
       ``update_metadata_with_datetime``.

    A summary of the counters is printed at the end.

    Args:
        folder_path: Folder to process (searched recursively).
    """
    folder = Path(folder_path)
    if not folder.is_dir():
        print(f"[ERROR] The path {folder_path} is not a valid directory.")
        return

    extension_corrections = 0
    json_attempts = 0
    json_success = 0
    filename_attempts = 0
    filename_success = 0
    metadata_updates = 0
    total_files_processed = 0

    # Snapshot the file list first: files are renamed while processing, and
    # renaming inside a lazily evaluated directory walk can revisit or miss
    # entries. A list also gives the progress bar a total.
    files = [entry for entry in folder.rglob("*") if entry.is_file()]

    for original_path in tqdm(files, desc="Processing metadata", unit="file"):
        total_files_processed += 1

        if original_path.suffix.lower() == ".json":
            continue  # sidecars are only read, never modified

        old_path = str(original_path)
        new_path = fix_image_extension_if_mismatch(old_path)
        if new_path != old_path:
            extension_corrections += 1
        file_path = Path(new_path)

        if file_path.suffix.lower() == ".json":
            continue  # the "image" turned out to be JSON: nothing to date

        try:
            if _has_embedded_capture_date(file_path):
                continue
        except Exception as e:
            print(f"Failed reading metadata for {file_path}: {e}")
            continue

        # 1) Date from the JSON sidecar, if any candidate exists.
        dt = None
        json_path = next(
            (c for c in _sidecar_candidates(file_path, original_path) if c.is_file()),
            None,
        )
        if json_path is not None:
            json_attempts += 1
            dt = get_datetime_from_json_sidecar(str(json_path))
            if dt:
                json_success += 1

        # 2) Fall back to the file name.
        if dt is None:
            filename_attempts += 1
            dt = extract_datetime_from_filename(file_path.name, str(file_path.parent))
            if dt:
                filename_success += 1

        # 3) Write the date; failures are reported by the callee.
        if dt:
            update_metadata_with_datetime(str(file_path), dt)
            metadata_updates += 1

    print("[INFO] Processing complete.")
    print(f"Total files processed: {total_files_processed}")
    print(f"Extension corrections made: {extension_corrections}")
    print(f"JSON attempts: {json_attempts}, JSON successes: {json_success}")
    print(f"Filename extraction attempts: {filename_attempts}, Filename extraction successes: {filename_success}")
    print(f"Metadata updates attempted: {metadata_updates}")

# Real Run

In [None]:
# Fix extensions and fill in missing capture dates for everything in output_folder.
process_folder(output_folder)

In [None]:
def remove_folder_info(folder_path: str) -> None:
    """Strip the ``_folder-...`` tag from names directly inside a folder.

    Everything from the first ``_folder-`` to the end of the stem is removed;
    the suffix is kept. If the cleaned name is already taken, ``_1``, ``_2``,
    ... is appended to the stem until a free name is found.

    Args:
        folder_path: Directory whose entries should be renamed.
    """
    folder = Path(folder_path)

    # Materialise the listing before renaming anything.
    entries = list(folder.iterdir())
    for entry in tqdm(entries, desc="Processing Files"):
        if "_folder-" not in entry.name:
            continue

        extension = entry.suffix
        cleaned_stem = re.sub(r"_folder-.*", "", entry.stem)
        target = folder / (cleaned_stem + extension)

        attempt = 1
        while target.exists():
            target = folder / f"{cleaned_stem}_{attempt}{extension}"
            attempt += 1

        entry.rename(target)

# Remove folder from name

In [None]:
# Strip the "_folder-..." tags that the copy step added to the file names.
remove_folder_info(output_folder)

# duplicates

In [1]:
import os
import cv2
import numpy as np
import random
import hashlib
from PIL import Image
from IPython.display import clear_output
import matplotlib.pyplot as plt
import shutil
import uuid  # for random unique suffix if needed
from tqdm import tqdm

###############################################################################
# 1) LOADING FILES (이미지 + 동영상)
###############################################################################

def load_files(folder_path):
    """Collect image and video file paths below ``folder_path`` (recursive).

    A file qualifies when its lower-cased name ends with one of the supported
    extensions. Every extension includes its leading dot, so a name that
    merely ends in the letters "heic" is not picked up.

    Args:
        folder_path: Root folder to walk.

    Returns:
        List of matching paths, each joined from the walked directory and
        the file name.
    """
    supported_images = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.heic')
    supported_videos = ('.mp4', '.mov', '.avi', '.mkv', '.wmv', '.flv', '.webm')
    supported = supported_images + supported_videos

    file_paths = []
    for root, _, files in os.walk(folder_path):
        file_paths.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(supported)
        )
    return file_paths

def is_video_file(filepath):
    """Return True when the path's extension (case-insensitive) is a known video type."""
    _, extension = os.path.splitext(filepath)
    return extension.lower() in {'.mp4', '.mov', '.avi', '.mkv', '.wmv', '.flv', '.webm'}

def is_image_file(filepath):
    """Return True when the path's extension (case-insensitive) is a known image type."""
    _, extension = os.path.splitext(filepath)
    return extension.lower() in {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.heic'}

###############################################################################
# 2) IMAGE pHash & VIDEO SIGNATURE
###############################################################################

def _phash_bits(gray_image):
    """Return a 64-character '0'/'1' string hashing a grayscale image array.

    The image is shrunk to 8x8, transformed with a DCT, and each coefficient
    is compared against the median coefficient.
    """
    small = cv2.resize(gray_image, (8, 8), interpolation=cv2.INTER_AREA)
    dct_roi = cv2.dct(np.float32(small))[0:8, 0:8]
    median_val = np.median(dct_roi)
    return ''.join('1' if px > median_val else '0'
                   for row in dct_roi for px in row)


def compute_image_phash(image_path):
    """Compute a perceptual-style hash for an image using OpenCV.

    Args:
        image_path: Path of the image to hash.

    Returns:
        A 64-character bit string, or ``None`` when the image cannot be read
        or hashing fails.
    """
    try:
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            return None
        return _phash_bits(image)
    except Exception:
        # Best effort: an unreadable image simply gets no hash. Unlike a bare
        # `except:`, this still lets Ctrl-C interrupt a long scan.
        return None


def compute_video_signature(video_path):
    """Build a naive signature for a video.

    The signature consists of the file size in bytes, the duration in seconds
    (frame count / fps, or 0 when either is unavailable) and the hash of the
    middle frame (falling back to the first frame).

    Args:
        video_path: Path of the video file.

    Returns:
        ``{'size', 'duration', 'frame_phash'}`` (``frame_phash`` is ``None``
        when no frame could be read), or ``None`` when the video cannot be
        opened or any step fails.
    """
    try:
        file_size = os.path.getsize(video_path)

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                return None

            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
            duration = frame_count / fps if frame_count > 0 and fps > 0 else 0

            # Middle frame first ...
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_count // 2))
            ret, frame = cap.read()
            if not ret or frame is None:
                # ... and the first frame as fallback.
                cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                ret, frame = cap.read()
        finally:
            # Release the capture even when reading raised.
            cap.release()

        if frame is not None:
            phash = _phash_bits(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
        else:
            phash = None

        return {
            'size': file_size,
            'duration': duration,
            'frame_phash': phash
        }
    except Exception:
        return None

def compute_file_signature(filepath):
    """Build the comparison signature for one media file.

    Returns:
        Images: ``{'type': 'image', 'hash': <pHash>, 'size': <bytes>}``.
        Videos: ``{'type': 'video', 'size': <bytes>, 'duration': <float>,
        'frame_phash': <hash>}``, or ``None`` when no video signature could
        be computed.
        Anything else: ``None``.
    """
    if is_image_file(filepath):
        return {
            'type': 'image',
            'hash': compute_image_phash(filepath),
            'size': os.path.getsize(filepath),
        }

    if not is_video_file(filepath):
        return None

    video_info = compute_video_signature(filepath)
    if video_info is None:
        return None

    signature = {'type': 'video'}
    for key in ('size', 'duration', 'frame_phash'):
        signature[key] = video_info[key]
    return signature

###############################################################################
# 3) SHA-256 CHECK & SIMILARITY CALC
###############################################################################

def hamming_distance(h1, h2):
    if not h1 or not h2:
        return 64
    return sum(ch1 != ch2 for ch1, ch2 in zip(h1, h2))

def compute_sha256(file_path, chunk_size=65536):
    """Compute the SHA-256 hex digest of a file, reading it in chunks.

    Used to decide whether two files are bit-for-bit identical.

    Args:
        file_path: File to hash.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The hex digest, or ``None`` when the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    try:
        with open(file_path, 'rb') as f:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: f.read(chunk_size), b''):
                digest.update(chunk)
    except (OSError, ValueError):
        # Unreadable file or invalid path: report "no hash" instead of
        # swallowing every exception (a bare except also traps Ctrl-C).
        return None
    return digest.hexdigest()

def _closeness_percent(a, b):
    """Return 100 minus the relative difference of a and b in percent (0 if either is 0)."""
    if a == 0 or b == 0:
        return 0
    penalty = (abs(a - b) / max(a, b)) * 100
    closeness = 100 - penalty
    return 0 if closeness < 0 else closeness


def _byte_identical(path1, path2):
    """Return True when both files hash successfully to the same SHA-256."""
    sha1 = compute_sha256(path1)
    sha2 = compute_sha256(path2)
    return bool(sha1 and sha2 and sha1 == sha2)


def get_similarity(sig1, sig2, path1, path2, check_threshold):
    """Score how similar two files are, in percent.

    - Missing signatures or different types score 0.
    - Images: percentage of matching pHash bits.
    - Videos: mean of frame-hash similarity, size closeness and duration
      closeness.
    - When the score is below ``check_threshold`` but the file sizes are
      equal (and non-zero for videos), the files are hashed with SHA-256 and
      an exact match scores 100.
    """
    if not sig1 or not sig2:
        return 0.0

    kind = sig1['type']
    if kind != sig2['type']:
        return 0.0

    if kind == 'image':
        distance = hamming_distance(sig1['hash'], sig2['hash'])
        score = (64 - distance) / 64 * 100
        if score >= check_threshold:
            return score
        # Below the threshold: equal sizes may still mean identical bytes.
        if sig1['size'] == sig2['size'] and _byte_identical(path1, path2):
            return 100.0
        return score

    if kind == 'video':
        hash1 = sig1.get('frame_phash')
        hash2 = sig2.get('frame_phash')
        if hash1 and hash2:
            frame_sim = (64 - hamming_distance(hash1, hash2)) / 64 * 100
        else:
            frame_sim = 0

        size1, size2 = sig1['size'], sig2['size']
        dur1, dur2 = sig1.get('duration', 0), sig2.get('duration', 0)

        size_similarity = _closeness_percent(size1, size2)
        dur_similarity = _closeness_percent(dur1, dur2)
        score = (frame_sim + size_similarity + dur_similarity) / 3

        if score >= check_threshold:
            return score
        if size1 == size2 and size1 != 0 and _byte_identical(path1, path2):
            return 100.0
        return score

    return 0.0

###############################################################################
# 4) QUALITY + TIE-BREAK (NAME POLICY)
###############################################################################

def get_file_quality(sig):
    """Return a quality score for a signature.

    Images score their file size; videos score file size plus 1000 times the
    duration; a missing signature or any other type scores 0.
    """
    if not sig:
        return 0

    kind = sig['type']
    if kind == 'image':
        return sig['size']
    if kind == 'video':
        return sig['size'] + 1000 * sig.get('duration', 0)
    return 0

def is_original_name(filename):
    """Return True when the name shows no sign of being a copy.

    A name counts as a copy when it contains "copy" (case-insensitive) or
    "복사본", or when it contains both an opening and a closing parenthesis.
    """
    lowered = filename.lower()

    copy_markers = ("copy", "복사본")
    if any(marker in lowered for marker in copy_markers):
        return False

    has_parentheses = "(" in lowered and ")" in lowered
    return not has_parentheses

def pick_lower_quality_or_tiebreak(path1, path2, sig1, sig2):
    """Choose which of two duplicate files should be removed.

    The file with the lower quality score is returned. On a tie, the file
    whose name looks like a copy is returned; if both or neither look like
    copies, one is picked at random.
    """
    quality1 = get_file_quality(sig1)
    quality2 = get_file_quality(sig2)

    if quality1 < quality2:
        return path1
    if quality2 < quality1:
        return path2

    # Same quality: prefer keeping the file with the original-looking name.
    looks_original1 = is_original_name(os.path.basename(path1))
    looks_original2 = is_original_name(os.path.basename(path2))
    if looks_original1 and not looks_original2:
        return path2
    if looks_original2 and not looks_original1:
        return path1
    return random.choice([path1, path2])

###############################################################################
# 5) MOVING DUPLICATES INSTEAD OF DELETING
###############################################################################

def move_file_to_duplicates(file_path, duplicates_folder):
    """Move a file into ``duplicates_folder``, creating the folder if needed.

    When a file of the same name is already there, a short random hex suffix
    in parentheses is inserted before the extension.
    """
    if not os.path.exists(duplicates_folder):
        os.makedirs(duplicates_folder, exist_ok=True)

    base_name = os.path.basename(file_path)
    target = os.path.join(duplicates_folder, base_name)

    if os.path.exists(target):
        # Name clash: "<name> (<6 hex chars>)<ext>"
        stem, extension = os.path.splitext(base_name)
        unique_tag = uuid.uuid4().hex[:6]
        target = os.path.join(duplicates_folder, f"{stem} ({unique_tag}){extension}")

    shutil.move(file_path, target)
    print(f"Moved => {target}")

###############################################################################
# 6) MAIN LOGIC: DETECT & MOVE DUPLICATES
###############################################################################

def get_video_frame(video_path, fraction=0.5):
    """Read the frame at ``fraction`` (0..1) of the video's frame count, for display.

    Returns:
        The frame as read by OpenCV, or ``None`` when the video cannot be
        opened or the frame cannot be read.
    """
    capture = cv2.VideoCapture(video_path)
    if not capture.isOpened():
        return None

    total_frames = capture.get(cv2.CAP_PROP_FRAME_COUNT)
    capture.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * fraction))
    ok, frame = capture.read()
    capture.release()

    return frame if ok else None

def _plot_pair(pictures, titles):
    """Show two pictures next to each other; a missing picture leaves its axis empty."""
    _, axes = plt.subplots(1, 2, figsize=(8, 4))
    for axis, picture, title in zip(axes, pictures, titles):
        if picture is not None:
            axis.imshow(picture)
        axis.set_title(title)
        axis.axis('off')
    plt.show()


def show_files_side_by_side(path1, path2, sig1, sig2, similarity):
    """Display two candidate duplicates next to each other.

    Clears the previous cell output, prints the similarity and both file
    names, then shows either the two images or the middle frame of each
    video. Pairs of mixed or unknown types only get the text header.

    Args:
        path1, path2: Files to display.
        sig1, sig2: Their signatures (only ``'type'`` is read here).
        similarity: Similarity in percent, shown in the header.
    """
    clear_output(wait=True)

    print(f"Similarity = {similarity:.2f}%")
    print(f"File 1: {os.path.basename(path1)}  ({sig1['type']})")
    print(f"File 2: {os.path.basename(path2)}  ({sig2['type']})\n")

    titles = (os.path.basename(path1), os.path.basename(path2))

    if sig1['type'] == 'image' and sig2['type'] == 'image':
        # Context managers close the image files once they are drawn, so a
        # later move of either file is not blocked by an open handle.
        with Image.open(path1) as img1, Image.open(path2) as img2:
            _plot_pair((img1, img2), titles)

    elif sig1['type'] == 'video' and sig2['type'] == 'video':
        frames = []
        for path in (path1, path2):
            frame = get_video_frame(path, 0.5)
            # OpenCV delivers BGR; matplotlib expects RGB.
            frames.append(None if frame is None else cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        _plot_pair(frames, titles)

def detect_and_move_duplicates(folder_path, check_threshold=95.0, delete_threshold=99.0):
    """Find duplicate images/videos in a folder tree and move them aside.

    Steps:
      - collect all supported files and build a signature for each;
      - compare every pair once (O(n^2) comparisons);
      - similarity >= ``delete_threshold``: the lower-quality file is moved
        to ``<folder_path>/duplicated_photos`` automatically;
      - ``check_threshold`` <= similarity < ``delete_threshold``: both files
        are shown and the user decides (``y`` move one, ``n`` keep both,
        ``1`` stop the run; anything else counts as ``n``);
      - lower similarity: the pair is skipped.

    A summary of the run is printed at the end.

    Args:
        folder_path: Root folder to scan.
        check_threshold: Minimum similarity (percent) that prompts the user.
        delete_threshold: Minimum similarity (percent) that moves a file
            without asking.
    """
    all_files = load_files(folder_path)
    total_files = len(all_files)
    print("1/2")
    # Signatures are keyed by path; a moved file is removed from this dict.
    signatures = {}
    for f in tqdm(all_files):
        signatures[f] = compute_file_signature(f)

    stats = {
        'total_files': total_files,
        'pairs_compared': 0,
        'duplicates_auto_moved': 0,
        'duplicates_user_moved': 0,
        'pairs_prompted': 0,
        'pairs_skipped': 0,
        'pairs_under_threshold': 0
    }

    file_list = list(signatures.keys())
    user_stopped = False

    # Duplicates go to a "duplicated_photos" sub-folder inside folder_path.
    duplicates_folder = os.path.join(folder_path, "duplicated_photos")
    print("2/2")
    for i, path1 in enumerate(tqdm(file_list)):
        if user_stopped:
            break
        # j > i visits each unordered pair exactly once, so no bookkeeping
        # of already-checked pairs is needed.
        for j in range(i + 1, len(file_list)):
            if path1 not in signatures:
                # path1 itself was moved away: none of its remaining pairs
                # can be compared any more.
                break

            path2 = file_list[j]
            if path2 not in signatures:
                continue

            sig1 = signatures[path1]
            sig2 = signatures[path2]
            stats['pairs_compared'] += 1

            if not sig1 or not sig2:
                continue

            similarity = get_similarity(sig1, sig2, path1, path2, check_threshold)

            if similarity >= delete_threshold:
                to_move = pick_lower_quality_or_tiebreak(path1, path2, sig1, sig2)
                print(f"[AUTO] {similarity:.2f}% => Moving to duplicates folder: {to_move}")
                move_file_to_duplicates(to_move, duplicates_folder)
                signatures.pop(to_move, None)
                stats['duplicates_auto_moved'] += 1

            elif similarity >= check_threshold:
                stats['pairs_prompted'] += 1
                show_files_side_by_side(path1, path2, sig1, sig2, similarity)

                print("Options: [y] move one file, [n] keep both, [1] stop now")
                choice = input("Choice: ").strip().lower()

                if choice == 'y':
                    to_move = pick_lower_quality_or_tiebreak(path1, path2, sig1, sig2)
                    print(f"[USER] {similarity:.2f}% => Moving: {to_move}")
                    move_file_to_duplicates(to_move, duplicates_folder)
                    signatures.pop(to_move, None)
                    stats['duplicates_user_moved'] += 1
                elif choice == '1':
                    print("Stopping per user request...")
                    user_stopped = True
                    break
                else:
                    # 'n' and any unrecognised answer both keep the pair.
                    print("[SKIPPED] Kept both.\n")
                    stats['pairs_skipped'] += 1

            else:
                stats['pairs_under_threshold'] += 1

    total_moved = stats['duplicates_auto_moved'] + stats['duplicates_user_moved']
    files_remaining = total_files - total_moved
    duplicates_found = total_moved + stats['pairs_skipped']

    print("\n===== RUN SUMMARY =====")
    print(f"Total files scanned: {stats['total_files']}")
    print(f"Total pairs compared: {stats['pairs_compared']}")
    print(f"Pairs < {check_threshold}% similarity: {stats['pairs_under_threshold']}")
    print(f"Pairs prompted (≥ {check_threshold}% & < {delete_threshold}%): {stats['pairs_prompted']}")
    print(f"  - Duplicates user moved: {stats['duplicates_user_moved']}")
    print(f"  - Pairs user skipped (kept both): {stats['pairs_skipped']}")
    print(f"Pairs auto-moved (≥ {delete_threshold}%): {stats['duplicates_auto_moved']}")
    print(f"Total duplicates found (any ≥ {check_threshold}%): {duplicates_found}")
    print(f"Total files moved: {total_moved}")
    print(f"Files still remaining in original location: {files_remaining}")
    if user_stopped:
        print("User stopped before checking all pairs.")
    else:
        print("Completed all comparisons.")



In [None]:
def main():
    """Ask which folder to scan and run the duplicate detection on it.

    Pressing Enter without typing anything scans ``output_folder``; any
    other answer is used as the folder path.
    """
    entered = input(f"Folder to scan for duplicates [{output_folder}]: ").strip()
    # An empty answer falls back to the configured folder instead of
    # scanning "" (which finds no files).
    folder_path = entered or output_folder
    detect_and_move_duplicates(folder_path, check_threshold=96.86, delete_threshold=96.87)

main()  # Runs the duplicate scan as soon as this cell/module is executed
