In [None]:
# %%
import google.generativeai as genai
import time
import threading
import json
import os
from pathlib import Path
from tqdm import tqdm
import tempfile
import chardet
import zipfile
import shutil
import concurrent.futures
import re
import openai
import requests

# %%
# Initialize the thread lock.
# Shared by every worker thread: it is held around each read-modify-write of
# the JSON files on disk (the Gemini cost ledger and the scenes results file)
# so concurrent workers do not overwrite each other's updates.
lock = threading.Lock()

# %%
# Configuration for Gemini
# Most recently created Gemini client (kept for code that reads this global).
gemini_model = None
# One client per model name, so a call with a different `model` does not
# silently reuse whichever model happened to be created first.
_gemini_models = {}


def run_gemini_query(prompt, history, completion_tokens, temp, model):
    """Send `prompt` to a Gemini model and return the stripped response text.

    Args:
        prompt: Full prompt text passed to `generate_content`.
        history: Unused; kept so existing callers keep working.
        completion_tokens: Value passed as `max_output_tokens`.
        temp: Sampling temperature.
        model: Gemini model name, e.g. "gemini-1.5-pro-002".

    Returns:
        The response text with surrounding whitespace removed, or None when
        the request fails with a non-rate-limit error (this includes responses
        whose `.text` accessor raises because no content part was returned),
        or when all 30 attempts are used up.

    Side effects:
        Adds the estimated cost of each successful call to
        'gemini_costs.json' (keyed by model name) while holding `lock`.
    """
    global gemini_model

    # Loop-invariant, so build it once rather than on every retry.
    safety_settings = [
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
    ]

    for tries in range(1, 31):
        try:
            client = _gemini_models.get(model)
            if client is None:
                # Read the key from the environment instead of hard-coding it
                # in the notebook; falls back to the previous empty value.
                genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
                client = genai.GenerativeModel(model)
                _gemini_models[model] = client
                gemini_model = client

            response = client.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    candidate_count=1,
                    max_output_tokens=completion_tokens,
                    temperature=temp
                ),
                safety_settings=safety_settings
            )

            input_toks = response.usage_metadata.prompt_token_count
            output_toks = response.usage_metadata.candidates_token_count
            # NOTE(review): per-million-token rates are hard-coded and applied
            # to every model; presumably gemini-1.5-pro pricing -- confirm.
            cost_usd = input_toks / 1_000_000 * 1.25 + output_toks / 1_000_000 * 5

            # Update the cost ledger under the lock so concurrent workers do
            # not lose each other's increments.
            with lock:
                if os.path.exists('gemini_costs.json'):
                    with open('gemini_costs.json', 'r') as f:
                        costs = json.load(f)
                else:
                    costs = {}
                costs[model] = costs.get(model, 0) + cost_usd
                with open('gemini_costs.json', 'w') as f:
                    json.dump(costs, f)

            # `response.text` raises when the candidate has no content part;
            # that lands in the except branch below and yields None.
            inference = response.text.strip() if response.text else None

            if inference:
                return inference

            # Empty answer: wait briefly and try again.
            print('Error: message is empty')
            time.sleep(5)

        except Exception as e:
            print("Request failed.")
            print(e)
            # Only rate-limit errors are worth retrying. Match on the status
            # prefix rather than one exact message so other 429 wordings are
            # retried as well.
            if not str(e).startswith('429'):
                return None
            print('retrying', tries)
            time.sleep(20)

    return None


# %%
# Function to parse scenes from text
def parse_scenes_from_text(text: str):
    """Extract the body of every `## SCENE <n>:` section from a model reply.

    A section starts right after a `## SCENE <n>:` header line and runs up to
    the next such header or the end of the text. Each body is returned with
    surrounding whitespace stripped, in order of appearance.

    Returns an empty list when no section is found or when `text` cannot be
    searched (for example, when it is not a string).
    """
    header_and_body = r'## SCENE \d+:\n(.+?)(?=(?:## SCENE \d+:|$))'
    try:
        found = re.finditer(header_and_body, text, flags=re.DOTALL)
        return [hit.group(1).strip() for hit in found]
    except Exception:
        return []

# %%
# Selection prompt template.
# The literal `<TEXT>` placeholder is replaced with a segment of the book
# before the prompt is sent. The requested output layout (`## SCENE <n>:`
# headers, each followed by the scene text) is what parse_scenes_from_text
# looks for, so the header format here and the regex there must stay in sync.
selection_prompt = """
[TEXT START]
<TEXT>
[TEXT END]

Your task is to examine the above **public domain** work for evocative, interesting, well-written scenes.

Instructions:

- Choose 3 long scenes of approx 1000-1500 words, and output them in their entirety.
- You must also repair paragraphs by restoring sentences within a paragraph that are broken by a newline.
- Double newline between paragraphs.
- Do not include chapter names.

Output in this format:

## THOUGHT PROCESS:

<thought process for the scene selection>

## SCENE 1:

<the text of approx 1000-1500 words>

## SCENE 2:

<the text of approx 1000-1500 words>

## SCENE 3:

<the text of approx 1000-1500 words>


--

Output precisely in this format. Do not add any additional commentary or explanations.
"""

# %%
# Function to detect file encoding
def detect_encoding(file_path: Path):
    """Guess the text encoding of `file_path` from its first 100 KB.

    Returns 'latin-1' when chardet reports 'ascii' or 'charmap', the detected
    encoding when chardet's confidence is at least 0.5, and 'utf-8' in every
    other case, including any error while reading or detecting.
    """
    fallback = 'utf-8'
    try:
        with open(file_path, 'rb') as handle:
            sample = handle.read(100000)  # first 100KB is enough for a guess
        guess = chardet.detect(sample)
        name, certainty = guess['encoding'], guess['confidence']
        if not name:
            return fallback
        if name.lower() in ('ascii', 'charmap'):
            return 'latin-1'
        return name if certainty >= 0.5 else fallback
    except Exception as e:
        print(f"Error detecting encoding for '{file_path}': {e}. Falling back to 'utf-8'.")
        return 'utf-8'

# %%
# Worker function to process a single file
def _load_scene_store(results_path):
    """Return the JSON object saved at `results_path`, or {} if the file is absent.

    Callers are expected to hold `lock` while calling this and while writing
    the file back, so the read-modify-write is not interleaved across threads.
    """
    if os.path.exists(results_path):
        with open(results_path, 'r') as f:
            return json.load(f)
    return {}


def _strip_gutenberg_footer(content, marker='End of the Project Gutenberg'):
    """Cut `content` at the first `marker`; return `(text, marker_found)`.

    When the marker is absent the text is returned unchanged.
    """
    cut = content.find(marker)
    # str.find returns -1 (which is truthy) when the marker is missing, so the
    # result must be compared against -1 rather than tested for truthiness.
    if cut == -1:
        return content, False
    return content[:cut].strip(), True


def _split_into_segments(content, num_segments=4, max_seg_length=400000):
    """Split `content` into `num_segments` consecutive pieces of near-equal size.

    The last piece absorbs the remainder so no trailing characters are lost.
    Each piece is then truncated to `max_seg_length` characters, so text
    beyond that limit inside a piece is dropped.
    """
    step = len(content) // num_segments
    if step == 0:
        # Too short to split; hand back the whole text (if any) as one piece.
        return [content[:max_seg_length]] if content else []
    bounds = [i * step for i in range(num_segments)] + [len(content)]
    return [content[start:end][:max_seg_length]
            for start, end in zip(bounds, bounds[1:])]


# Worker function to process a single file
def process_file(key, selection, results_path, selection_prompt):
    """Download one book, ask Gemini for scenes per segment, and save them.

    Args:
        key: Identifier of the book; used as the key in the results JSON.
        selection: Mapping that contains the download address under "text_url".
        results_path: Path of the JSON file holding `{key: [scene, ...]}`.
        selection_prompt: Prompt template containing a `<TEXT>` placeholder.

    Returns:
        None. Returns early, writing nothing, when `key` is already present
        in the results file or the downloaded text is under 10000 characters.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # Check if file already processed
    with lock:
        if key in _load_scene_store(results_path):
            print(f"Skipping '{key}': already processed.")
            return

    # A timeout keeps a stalled server from blocking this worker forever;
    # raise_for_status keeps an HTTP error page from being treated as a book.
    response = requests.get(selection["text_url"], timeout=60)
    response.raise_for_status()

    content = response.text

    if len(content) < 10000:
        print(f"Skipping '{key}': content too small!")
        return

    content, footer_found = _strip_gutenberg_footer(content)
    if not footer_found:
        print('!! not found')

    segments = _split_into_segments(content, num_segments=4, max_seg_length=400000)

    # Collect scenes from all segments
    all_scenes = []
    for i, segment in enumerate(segments):
        prompt = selection_prompt
        if i == 0:
            prompt += "\nThe first scene you select should be the very start of the book. The other two can be selected from anywhere."

        prompt = prompt.replace('<TEXT>', segment)

        result = run_gemini_query(prompt, [], 16000, 0, "gemini-1.5-pro-002")

        if result:
            this_scenes = parse_scenes_from_text(result)
            print(len(this_scenes))
            all_scenes.extend(this_scenes)
        else:
            print(f"Failed to get scenes for '{key}' segment {i+1}.")

    # Update the results file under the lock. The file is re-read here because
    # other workers may have written to it while this one was querying.
    with lock:
        scenes = _load_scene_store(results_path)
        scenes.setdefault(key, []).extend(all_scenes)
        with open(results_path, 'w') as f:
            json.dump(scenes, f, indent=2)


In [8]:
import json

# %%
# Main function to orchestrate multithreading
def main():
    """Run scene selection for every remaining book using a thread pool.

    Loads the fiction selections, removes every key that also appears in the
    fantasy, sci-fi, or adventure selection files, then submits one
    `process_file` task per remaining entry. Submissions are spaced five
    seconds apart. Worker exceptions are printed, not re-raised.
    """
    # Input/output paths for this run; change both to process another list.
    selections_path = 'data/site/site_fiction_final_selections.json'
    results_path = "data/site/processing/site_scenes_fiction.json"
    num_threads = 32

    # Selection files whose keys are removed from this run's work list
    # (presumably because those books are handled under another genre).
    exclusion_paths = [
        'data/site/site_fantasy_final_selections.json',
        'data/site/site_scifi_final_selections.json',
        'data/site/site_adventure_final_selections.json',
    ]

    # Load selections
    with open(selections_path, 'r') as f:
        selections = json.load(f)

    for exclusion_path in exclusion_paths:
        with open(exclusion_path, 'r') as f:
            already_selected = json.load(f)
        for k in already_selected:
            # pop with a default ignores keys that are not in this list,
            # without a bare `except` that would hide unrelated errors.
            selections.pop(k, None)

    # The context manager closes the progress bar even if something below raises.
    with tqdm(total=len(selections), desc="Processing files") as progress:

        # Done-callback: advance the bar and report any worker exception.
        def update_progress(future):
            progress.update(1)
            exception = future.exception()
            if exception:
                print(f"Error occurred: {exception}")

        # Use ThreadPoolExecutor for multithreading
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = []
            for k, v in selections.items():
                future = executor.submit(process_file, k, v, results_path, selection_prompt)
                future.add_done_callback(update_progress)
                futures.append(future)
                # Stagger submissions so workers do not all start at once.
                time.sleep(5)

            # Wait for all futures to complete
            concurrent.futures.wait(futures)

    print("Processing completed.")


In [9]:
# Run the whole scene-selection pipeline when this cell is executed.
main()

Processing files:   0%|          | 0/64 [00:00<?, ?it/s]

Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'Mary Wollstonecraft Shelley -- Frankenstein; Or, The Modern Prometheus' segment 1.
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'Charlotte Perkins Gilman -- The Yellow Wallpaper' segment 1.
3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google

Processing files:   2%|▏         | 1/64 [07:56<8:20:43, 476.89s/it]

3
3


Processing files:   3%|▎         | 2/64 [08:10<3:31:11, 204.38s/it]

Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'Washington Irving -- The Legend of Sleepy Hollow' segment 4.
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'Charles Dickens -- A Christmas Carol in Prose; Being a Ghost Story of Christmas' segment 2.
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai

Processing files:   5%|▍         | 3/64 [09:25<2:27:30, 145.09s/it]

3
3
3
3
3
3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'F. Scott Fitzgerald -- The Great Gatsby' segment 3.
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Herman Melville -- Moby Dick; Or, The Whale' segment 3.


Processing files:   6%|▋         | 4/64 [10:31<1:54:12, 114.20s/it]

3
3
3
3
3
3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Mark Twain -- Adventures of Huckleberry Finn' segment 2.
3
3
3


Processing files:   8%|▊         | 5/64 [11:25<1:30:57, 92.49s/it] 

3
3
3
3
3
3
3


Processing files:   9%|▉         | 6/64 [12:30<1:20:18, 83.08s/it]

3


Processing files:  11%|█         | 7/64 [12:36<54:46, 57.67s/it]  

3
3


Processing files:  12%|█▎        | 8/64 [12:44<39:19, 42.14s/it]

Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Joseph Conrad -- Heart of Darkness' segment 4.
3


Processing files:  14%|█▍        | 9/64 [12:46<26:57, 29.42s/it]

Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'Franz Kafka -- Metamorphosis' segment 4.


Processing files:  16%|█▌        | 10/64 [12:56<21:08, 23.49s/it]

3


Processing files:  17%|█▋        | 11/64 [12:59<15:10, 17.18s/it]

3
3


Processing files:  19%|█▉        | 12/64 [13:02<11:14, 12.97s/it]

3


Processing files:  20%|██        | 13/64 [13:13<10:26, 12.27s/it]

3
3
3
3
3


Processing files:  22%|██▏       | 14/64 [14:00<18:59, 22.79s/it]

3
3
3


Processing files:  23%|██▎       | 15/64 [14:08<14:51, 18.20s/it]

3
3


Processing files:  25%|██▌       | 16/64 [14:36<17:03, 21.32s/it]

Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Mark Twain -- The Adventures of Tom Sawyer, Complete' segment 4.
2


Processing files:  27%|██▋       | 17/64 [14:43<13:17, 16.98s/it]

3
3
3


Processing files:  28%|██▊       | 18/64 [15:03<13:39, 17.81s/it]

3


Processing files:  30%|██▉       | 19/64 [15:07<10:18, 13.75s/it]

3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'H. G. Wells -- The Red Room' segment 1.


Processing files:  31%|███▏      | 20/64 [15:37<13:37, 18.58s/it]

3


Processing files:  33%|███▎      | 21/64 [15:44<10:44, 14.99s/it]

3
3


Processing files:  34%|███▍      | 22/64 [15:54<09:33, 13.65s/it]

Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Mark Twain -- Adventures of Huckleberry Finn' segment 4.
3
3
3


Processing files:  36%|███▌      | 23/64 [16:17<11:12, 16.41s/it]

3
3
3


Processing files:  38%|███▊      | 24/64 [16:37<11:41, 17.54s/it]

3
3
3


Processing files:  39%|███▉      | 25/64 [16:46<09:44, 14.98s/it]

3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'Edgar Allan Poe -- The Fall of the House of Usher' segment 1.


Processing files:  41%|████      | 26/64 [17:10<11:07, 17.57s/it]

3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'W. W. Jacobs -- The Monkey's Paw' segment 1.
3
3
3
3


Processing files:  42%|████▏     | 27/64 [18:11<18:50, 30.57s/it]

3


Processing files:  44%|████▍     | 28/64 [18:20<14:29, 24.16s/it]

3
3


Processing files:  45%|████▌     | 29/64 [18:21<10:00, 17.16s/it]

3
3
3
3
3
3
3
3
3


Processing files:  47%|████▋     | 30/64 [18:43<10:37, 18.74s/it]

3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'Edgar Allan Poe -- The Masque of the Red Death' segment 1.
3
2
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'Edgar Allan Poe -- The Cask of Amontillado' segment 2.
3
3
3


Processing files:  48%|████▊     | 31/64 [20:02<20:11, 36.71s/it]

3


Processing files:  50%|█████     | 32/64 [20:09<14:56, 28.01s/it]

3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Ernest Hemingway -- The Sun Also Rises' segment 2.
3
3


Processing files:  52%|█████▏    | 33/64 [20:52<16:44, 32.39s/it]

3
3


Processing files:  53%|█████▎    | 34/64 [20:56<11:57, 23.92s/it]

3
3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Evgenii Ivanovich Zamiatin -- We' segment 1.
3
3
3
3


Processing files:  55%|█████▍    | 35/64 [21:50<15:54, 32.90s/it]

3
3
3
3


Processing files:  56%|█████▋    | 36/64 [22:25<15:42, 33.65s/it]

3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'Edgar Allan Poe -- The Fall of the House of Usher' segment 2.
3


Processing files:  58%|█████▊    | 37/64 [22:44<13:05, 29.08s/it]

3


Processing files:  59%|█████▉    | 38/64 [22:52<09:48, 22.65s/it]

3


Processing files:  61%|██████    | 39/64 [23:00<07:43, 18.54s/it]

3


Processing files:  62%|██████▎   | 40/64 [23:04<05:37, 14.06s/it]

3
3
3
3


Processing files:  64%|██████▍   | 41/64 [23:21<05:40, 14.79s/it]

3
3
3
3
3
3


Processing files:  66%|██████▌   | 42/64 [23:41<06:03, 16.52s/it]

3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Failed to get scenes for 'W. W. Jacobs -- The Monkey's Paw' segment 3.
3
3


Processing files:  67%|██████▋   | 43/64 [24:11<07:08, 20.43s/it]

3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Algernon Blackwood -- The Wendigo' segment 1.
3
3


Processing files:  69%|██████▉   | 44/64 [25:05<10:10, 30.52s/it]

3
3


Processing files:  70%|███████   | 45/64 [25:09<07:11, 22.74s/it]

3
3
3
3


Processing files:  72%|███████▏  | 46/64 [25:46<08:03, 26.85s/it]

3


Processing files:  73%|███████▎  | 47/64 [25:55<06:04, 21.43s/it]

3
3


Processing files:  75%|███████▌  | 48/64 [26:06<04:54, 18.42s/it]

3


Processing files:  77%|███████▋  | 49/64 [26:17<04:01, 16.09s/it]

3
3
3
3
3
3


Processing files:  78%|███████▊  | 50/64 [26:54<05:13, 22.38s/it]

3
3


Processing files:  80%|███████▉  | 51/64 [27:34<06:00, 27.74s/it]

3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Evgenii Ivanovich Zamiatin -- We' segment 3.
3


Processing files:  81%|████████▏ | 52/64 [28:05<05:44, 28.72s/it]

3


Processing files:  83%|████████▎ | 53/64 [28:15<04:13, 23.09s/it]

3
3
3
3


Processing files:  84%|████████▍ | 54/64 [29:06<05:15, 31.59s/it]

3
3


Processing files:  86%|████████▌ | 55/64 [30:04<05:54, 39.34s/it]

3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'E. M. Forster -- A passage to India' segment 3.
3
3
3
3


Processing files:  88%|████████▊ | 56/64 [30:53<05:39, 42.40s/it]

3


Processing files:  89%|████████▉ | 57/64 [31:06<03:54, 33.46s/it]

3


Processing files:  91%|█████████ | 58/64 [31:07<02:22, 23.68s/it]

3


Processing files:  92%|█████████▏| 59/64 [31:44<02:18, 27.71s/it]

3


Processing files:  94%|█████████▍| 60/64 [32:11<01:49, 27.47s/it]

3


Processing files:  95%|█████████▌| 61/64 [32:17<01:03, 21.22s/it]

3


Processing files:  97%|█████████▋| 62/64 [32:42<00:44, 22.30s/it]

3


Processing files:  98%|█████████▊| 63/64 [32:57<00:20, 20.02s/it]

3


Processing files: 100%|██████████| 64/64 [33:10<00:00, 31.10s/it]

3
Processing completed.



