In [1]:
import json
import re

# Load the author selections file. The file on disk is the *romance* top-50
# selection; the variable keeps its historical name `fantasy_authors` because
# the rest of the script refers to it. It is later iterated with .items() as
# (author id, author name) pairs.
# The encoding is given explicitly (as for the index below) so that non-ASCII
# author names decode the same way on every platform.
with open('data/romance_author_selections_top_50.json', 'r', encoding="utf-8") as f:
    fantasy_authors = json.load(f)

# Load the Gutenberg index file. It is later iterated with .items(), and the
# 'author' entry of each value is searched.
with open('_gutenberg_index.json', 'r', encoding="utf-8") as f:
    gutenberg_index = json.load(f)

# Helper function to create a regex pattern from an author's name
def create_author_regex(author_name):
    """Build a regex source string that loosely matches an author's name.

    The name is split on whitespace. Tokens made only of initials, whether
    standalone ("C.") or fused ("C.S.", "J.R.R."), are expanded so each
    initial may appear with or without its period. Every other token is
    matched literally. All pieces are joined with optional whitespace, so
    "C.S. Lewis" yields ``C\\.?\\s*S\\.?\\s*Lewis``, which matches
    "C.S. Lewis", "C. S. Lewis", "CS Lewis" and "C S Lewis".

    Args:
        author_name: The author's name as a string.

    Returns:
        An uncompiled regex pattern string. The caller chooses the flags
        (e.g. ``re.IGNORECASE``) and the match mode.
    """
    # One or more "<letter>." groups with nothing else in the token.
    # [^\W\d_] is "any Unicode letter" (word char that is not digit/underscore).
    initials_token = r'(?:[^\W\d_]\.)+'

    parts = re.split(r'\s+', author_name)  # Split the name by whitespace
    regex_parts = []

    for part in parts:
        if re.fullmatch(initials_token, part):
            # The token alternates letter, '.', letter, '.', ... so every
            # other character is an initial. Emit each one separately so the
            # final join puts optional whitespace between them as well.
            regex_parts.extend(
                re.escape(initial) + r'\.?' for initial in part[::2]
            )
        else:
            regex_parts.append(re.escape(part))  # Escape normal words

    return r'\s*'.join(regex_parts)  # Join parts with optional spaces

# Main function to find books by authors
def find_books_by_authors(fantasy_authors, gutenberg_index):
    """Collect, for each selected author, the index entries credited to them.

    Each author name is turned into a case-insensitive pattern with
    ``create_author_regex`` and searched for in the ``'author'`` entry of
    every book in the index.

    Args:
        fantasy_authors: Mapping whose values are author-name strings. The
            keys are not used.
        gutenberg_index: Mapping whose values are per-book dicts. The keys
            are not used.

    Returns:
        A dict mapping each author name, exactly as given, to the list of
        matching book dicts. Authors with no match are omitted.
    """
    result = {}

    for author_name in fantasy_authors.values():
        # A blank name would compile to an empty pattern, and an empty
        # pattern matches every book in the index.
        if not author_name.strip():
            continue

        # Compile once per author; the pattern is reused for every book.
        author_regex = re.compile(create_author_regex(author_name), re.IGNORECASE)

        matching_books = []
        for book_info in gutenberg_index.values():
            # Entries with a missing, null or non-string author are skipped
            # so that one bad entry does not abort the whole scan.
            author_field = book_info.get('author')
            if isinstance(author_field, str) and author_regex.search(author_field):
                matching_books.append(book_info)

        # Store the result in the dict indexed by the exact author name
        if matching_books:
            result[author_name] = matching_books

    return result

# Run the author lookup over the loaded selections and the Gutenberg index
author_books = find_books_by_authors(fantasy_authors, gutenberg_index)

# Serialize the author -> books mapping as 4-space-indented JSON and save it
with open("data/romance_author_selections_top_50_all_books.json", "w") as f:
    f.write(json.dumps(author_books, indent=4))
