# In [None]:
import os
import zipfile
import shutil
import json
import re
from pathlib import Path
from tqdm import tqdm
import tempfile
import chardet
from typing import Dict

# ----------------------------
# Function Definitions
# ----------------------------

def parse_gutenberg_header(header_string: str) -> Dict[str, str]:
    """
    Parses a Project Gutenberg header string to extract the title, author, and note.

    The header is collapsed onto a single line, a known "Project Gutenberg ..."
    prefix is stripped, notes are pulled out, an author introduced by "by" is
    pulled out, and whatever text remains is returned as the title. Every
    pattern is applied case-insensitively both when it is searched for and
    when it is removed, so a fragment that was recognised is never left
    behind in the title or author.

    Parameters:
        header_string (str): The header string to parse.

    Returns:
        Dict[str, str]: A dictionary with the keys 'title', 'author' and 'note'.
        'author' is "Unknown author" when no "by ..." fragment is found;
        'title' and 'note' are empty strings when nothing is found.
    """
    flags = re.IGNORECASE
    author = "Unknown author"
    note_parts = []

    # Step 1: Normalize the string by replacing line breaks with spaces
    normalized = ' '.join(header_string.splitlines())
    normalized = re.sub(r'\s+', ' ', normalized).strip()

    # Step 2: Remove the first known prefix that matches. Order matters: the
    # more specific spellings must be tried before the shorter ones.
    prefixes = (
        r"^The Project Gutenberg EBook of ",
        r"^The Project Gutenberg EBook ",
        r"^Project Gutenberg's ",
        r"^The Project Gutenberg Etext of ",
        r"^The Project Gutenberg Etext ",
        r"^Project Gutenberg Etext of",
        r"^\*\*\*The Project Gutenberg Etext of ",
        r"^\*\*The Project Gutenberg Etext of ",
        r"^The Project Gutenberg EBook: ",
        r"^Project Gutenberg Etext ",
        r"^Project Gutenberg ",
        r"^The Project Gutenberg eBook, ",
    )
    for prefix in prefixes:
        prefix_match = re.match(prefix, normalized, flags)
        if prefix_match:
            # The pattern is anchored, so dropping the matched span is all
            # that is needed. Only one prefix is assumed to apply.
            normalized = normalized[prefix_match.end():]
            break

    # Step 3: Extract notes
    # 3a. The first parenthesised fragment becomes the note; every
    #     parenthesised fragment is removed from the remaining text.
    parenthetical = re.search(r'\(([^)]+)\)', normalized)
    if parenthetical:
        note_parts.append(parenthetical.group(1).strip())
        normalized = re.sub(r'\([^)]*\)', '', normalized).strip()

    # 3b/3c. "#<n> in our series ..." and "Plagiarized from ..." fragments.
    #        Each is appended to the note and removed with the same pattern
    #        and the same flags that were used to find it.
    trailing_note_patterns = (
        r'#\d+\s+in our series(?: of [^,]+)?(?: by [^,]+)?',
        r'Plagiarized from [^,]+(?:, [^,]+)*',
    )
    for pattern in trailing_note_patterns:
        found = re.search(pattern, normalized, flags)
        if found:
            note_parts.append(found.group(0).strip())
            normalized = re.sub(pattern, '', normalized, flags=flags).strip()

    # Step 4: Extract author. Prefer ", by <author>"; fall back to a bare
    # " by <author>" without the preceding comma.
    for pattern in (r',\s*by\s+([^,]+)', r'\sby\s+([^,]+)'):
        found = re.search(pattern, normalized, flags)
        if found:
            author = found.group(1).strip()
            normalized = re.sub(pattern, '', normalized, flags=flags).strip()
            break

    # Step 5: The remaining text is the title
    title = normalized.strip().rstrip(',')

    # Final cleanup: Remove version info like ", v2" or ", v10"
    title = re.sub(r',\s*v\d+(\.\d+)?$', '', title, flags=flags).strip()

    return {
        'title': title,
        'author': author,
        'note': ' '.join(note_parts),
    }

def find_zip_files(source_dir: Path):
    """
    Collect every path below a directory whose name matches "*.zip".

    The search descends into all subdirectories of ``source_dir``.

    Parameters:
        source_dir (Path): Directory at which the search starts.

    Returns:
        List[Path]: The matching paths, in the order the filesystem walk
        yields them.
    """
    # "**/" in front of the pattern makes glob() recurse, which is what
    # rglob("*.zip") does.
    return list(source_dir.glob("**/*.zip"))

def load_results(results_file: Path):
    """
    Load existing results from the JSON file if it exists.

    The file is only accepted when it decodes as UTF-8, parses as JSON and
    its top-level value is a JSON object. Anything else is reported as a
    corrupted results file and an empty dictionary is returned, because
    callers index and assign into the returned value as a dict.

    Parameters:
        results_file (Path): Path to the results JSON file.

    Returns:
        Dict: A dictionary of existing results, or an empty dictionary when
        the file is missing or unusable.
    """
    if not results_file.exists():
        print("No existing results found. Starting fresh.")
        return {}

    try:
        with open(results_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Malformed JSON or bytes that are not valid UTF-8.
        print("Results file is corrupted. Starting fresh.")
        return {}

    if not isinstance(results, dict):
        # Valid JSON, but a list/scalar cannot serve as the results mapping.
        print("Results file is corrupted. Starting fresh.")
        return {}

    print(f"Loaded {len(results)} existing results.")
    return results

def save_results(results: Dict, results_file: Path):
    """
    Save the results dictionary to the JSON file.

    The JSON is first written to a sibling "<name>.tmp" file and then moved
    over ``results_file`` with ``os.replace``. A dump that fails part-way
    (for example on a value that is not JSON-serializable) therefore leaves
    the previously saved file untouched instead of truncating it.

    Errors are reported with a printed message and are not raised.

    Parameters:
        results (Dict): The results dictionary.
        results_file (Path): Path to the results JSON file.
    """
    results_file = Path(results_file)
    tmp_file = results_file.with_name(results_file.name + ".tmp")
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)
        # Swap the fully written file into place in one step.
        os.replace(tmp_file, results_file)
    except Exception as e:
        print(f"Error saving results to '{results_file}': {e}")
        # Best-effort cleanup of the partial temp file; it may not exist.
        try:
            tmp_file.unlink()
        except OSError:
            pass

def handle_duplicate(target_dir: Path, base_name: str, extension: str = ".zip"):
    """
    Pick a filename in ``target_dir`` that does not exist yet.

    "<base_name><extension>" is tried first; if it is taken,
    "<base_name> (1)<extension>", "<base_name> (2)<extension>", ... are
    tried in turn until a free name is found.

    Parameters:
        target_dir (Path): Directory the name must be unique in.
        base_name (str): Desired filename without extension.
        extension (str): Extension to append, including the leading dot.

    Returns:
        str: The first free filename (name only, without the directory).
    """
    suffix = ""
    attempt = 0
    while True:
        candidate = target_dir / f"{base_name}{suffix}{extension}"
        if not candidate.exists():
            return candidate.name
        attempt += 1
        suffix = f" ({attempt})"

def detect_encoding(file_path: Path):
    """
    Guess the text encoding of a file with chardet.

    Only the first 100000 bytes of the file are sampled. The guess is then
    mapped as follows:
      - 'ascii' or 'charmap' (any letter case) -> 'latin-1'
      - any other guess with confidence >= 0.5 -> that guess, unchanged
      - low-confidence guess, no guess, or any error -> 'utf-8'

    Parameters:
        file_path (Path): Path to the text file.

    Returns:
        str: The encoding name to open the file with.
    """
    fallback = 'utf-8'
    try:
        with open(file_path, 'rb') as handle:
            sample = handle.read(100000)
        guess = chardet.detect(sample)
        detected, certainty = guess['encoding'], guess['confidence']

        if not detected:
            return fallback
        # These two are treated as too narrow to be trusted as a final answer.
        if detected.lower() in ('ascii', 'charmap'):
            return 'latin-1'
        return detected if certainty >= 0.5 else fallback
    except Exception as e:
        print(f"Error detecting encoding for '{file_path}': {e}. Falling back to 'utf-8'.")
        return fallback

import re
import unicodedata

def sanitize_filename(name: str) -> str:
    """
    Remove or replace characters that are invalid in filenames.

    This function, in order:
    1. Reduces the text to ASCII (NFKD-decomposes it and drops whatever has
       no ASCII form, so accented letters lose their accents)
    2. Removes control characters (code points 0-31 and 127)
    3. Removes the characters / (slash), backslash, ?, %, *, :, |, ", <, >,
       ., the comma, ; and =
    4. Removes leading/trailing spaces
    5. Replaces the remaining spaces with underscores
    6. Limits the filename length to 255 characters
    7. Ensures the filename is not empty

    Parameters:
        name (str): The original filename.

    Returns:
        str: Sanitized filename, or "unnamed_file" if nothing is left.
    """
    # Normalize unicode characters
    name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')

    # Remove control characters
    name = ''.join(c for c in name if ord(c) > 31 and ord(c) != 127)

    # Remove reserved characters
    name = re.sub(r'[/\\?%*:|"<>.,;=]', '', name)

    # Trim BEFORE turning spaces into underscores; doing it afterwards can
    # never match, because by then no spaces are left to strip. Dots were
    # already removed above, so only spaces can remain at the edges.
    name = name.strip(' ')

    # Replace spaces with underscores
    name = name.replace(' ', '_')

    # Limit length to 255 characters
    name = name[:255]

    # Ensure the filename is not empty
    return name or "unnamed_file"

def process_zip_file(zip_path: Path, target_dir: Path, results: Dict, results_file: Path, save_every: int, save_counter: Dict):
    """
    Process a single zip file:
    - Extract it into a temporary directory (removed again on return)
    - Require exactly one .txt/.md/.text file inside it
    - Require a line reading exactly 'Language: English' within the first
      10000 characters of that file
    - Parse the first paragraph (text up to the first blank line) with
      parse_gutenberg_header
    - Copy the zip into target_dir as "<author> -- <title>.zip"
    - Record the entry in results
    - Save results every 'save_every' items

    The entry is stored under ``str(zip_path)`` (the source path), so a
    caller can test ``str(path) in results`` to find out whether a source
    archive has already been handled. The name of the copied file is kept
    in the entry's 'filename' field.

    Parameters:
        zip_path (Path): Path to the zip file.
        target_dir (Path): Path to the target directory.
        results (Dict): The results dictionary; updated in place on success.
        results_file (Path): Path to the results JSON file.
        save_every (int): Number of items after which to save results.
        save_counter (Dict): Holds the key 'count', the number of items
            recorded since the last save; updated in place.

    Returns:
        bool: True if the zip was copied and recorded, False if it was
        skipped or an error occurred (errors are printed, not raised).
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(tmpdirname)
        except zipfile.BadZipFile:
            print(f"Skipping '{zip_path}': Not a valid zip file.")
            return False
        except Exception as e:
            print(f"Error extracting '{zip_path}': {e}")
            return False

        text_files = [
            f for f in Path(tmpdirname).rglob("*")
            if f.is_file() and f.suffix.lower() in ('.txt', '.md', '.text')
        ]

        # Archives with zero or several text files are skipped silently.
        if len(text_files) != 1:
            return False

        text_file = text_files[0]

        # Detect encoding
        encoding = detect_encoding(text_file)
        try:
            # errors='replace' keeps a slightly wrong encoding guess from
            # aborting the read; only the header is inspected anyway.
            with open(text_file, 'r', encoding=encoding, errors='replace') as f:
                content = f.read()
        except Exception as e:
            print(f"Error reading '{text_file}': {e}")
            return False

        # Only books that declare themselves as English are kept.
        if 'Language: English' not in content[:10000].split('\n'):
            return False

        # The header is everything before the first blank line.
        header_string = content.split("\n\n", 1)[0]
        if header_string.strip() == '**This is a COPYRIGHTED Project Gutenberg Etext, Details Below**':
            # Copyrighted etexts are skipped.
            return False
        details = parse_gutenberg_header(header_string)

        if not details.get('title'):
            print(f"Skipping '{zip_path}': Title not found in details.")
            print(f"Extracted Top Section:\n{header_string}\n{'-'*40}")
            return False  # Not a success

        author = details.get('author', 'Unknown Author').strip()
        title = details.get('title', 'Untitled').strip()
        note = details.get('note', '').strip()

        # Sanitize author and title for filenames
        author_sanitized = sanitize_filename(author)
        title_sanitized = sanitize_filename(title)
        base_name = f"{author_sanitized} -- {title_sanitized}"
        new_filename = handle_duplicate(target_dir, base_name)
        new_zip_path = target_dir / new_filename

        try:
            # Copy and rename the zip file
            shutil.copy2(zip_path, new_zip_path)
        except Exception as e:
            print(f"Error copying '{zip_path}' to '{new_zip_path}': {e}")
            return False

        # Update results, keyed by the source path so that already handled
        # archives can be recognised on a later run.
        results[str(zip_path)] = {
            'title': title,
            'author': author,
            'note': note,
            'filename': new_filename
        }

        # Increment save counter and save if needed
        save_counter['count'] += 1
        if save_counter['count'] >= save_every:
            save_results(results, results_file)
            save_counter['count'] = 0  # Reset counter

        return True  # Success

def main(source_dir=None, target_dir=None, results_file=None, save_every: int = 10):
    """
    Main function to organize book zip files.

    Finds every zip file below ``source_dir``, skips those whose path string
    is already a key in the loaded results, and hands the rest to
    process_zip_file one by one while showing a progress bar. Results that
    have not been written yet are saved when the loop ends, including when
    it ends because of an exception such as KeyboardInterrupt.

    Parameters:
        source_dir: Directory to scan for zip files.
            Defaults to "I:/gutenberg".
        target_dir: Directory the renamed copies are written to; created if
            missing. Defaults to "I:/gutenberg_processed".
        results_file: Path of the JSON index. Defaults to
            "_gutenberg_index.json" inside ``target_dir``.
        save_every (int): Number of items to process before saving.
    """
    # Configuration
    source_dir = Path("I:/gutenberg" if source_dir is None else source_dir).resolve()
    target_dir = Path("I:/gutenberg_processed" if target_dir is None else target_dir).resolve()
    if results_file is None:
        results_file = target_dir / "_gutenberg_index.json"
    results_file = Path(results_file).resolve()

    if not source_dir.is_dir():
        print(f"Source directory '{source_dir}' does not exist or is not a directory.")
        return

    target_dir.mkdir(parents=True, exist_ok=True)

    # Load existing results for resuming
    results = load_results(results_file)

    # Find all zip files
    zip_files = find_zip_files(source_dir)
    total_files = len(zip_files)
    print(f"Found {total_files} zip files in '{source_dir}'.")

    # Filter out zip files whose path is already present in the results
    zip_files_to_process = [zf for zf in zip_files if str(zf) not in results]
    remaining = len(zip_files_to_process)
    print(f"{remaining} zip files to process.")

    if remaining == 0:
        print("No new zip files to process.")
        return

    # Counter dictionary shared with process_zip_file to track the number of
    # items recorded since the last save
    save_counter = {'count': 0}
    try:
        with tqdm(total=remaining, desc="Processing Zip Files", unit="zip") as pbar:
            for zip_path in zip_files_to_process:
                try:
                    process_zip_file(zip_path, target_dir, results, results_file, save_every, save_counter)
                except Exception as e:
                    # One bad archive must not stop the whole run.
                    print(f"Unexpected error processing '{zip_path}': {e}")
                pbar.update(1)
    finally:
        # Flush whatever has not been written yet, even if the run was
        # interrupted before reaching the end of the list.
        if save_counter['count'] > 0:
            save_results(results, results_file)

    print("All zip files processed.")

# ----------------------------
# Entry Point
# ----------------------------

# Run only when executed as a script (or as a notebook cell, where __name__
# is also "__main__"); importing this module must not start processing.
if __name__ == "__main__":
    main()
