In [None]:
# %%
import google.generativeai as genai
import time
import threading
import json
import os
from pathlib import Path
from tqdm import tqdm
import tempfile
import chardet
import zipfile
import shutil
import concurrent.futures
import re
import openai

# %%
# Module-wide lock shared by the worker threads. It serializes the
# read-modify-write cycles on the JSON files this file updates
# ('gemini_costs.json' and the results file) so that concurrent workers do
# not overwrite each other's changes.
lock = threading.Lock()

# %%
# Configuration for Gemini
gemini_model = None  # most recently used model handle (kept for backward compatibility)
_gemini_models = {}  # model name -> GenerativeModel, so each name gets its own handle


def run_gemini_query(prompt, history, completion_tokens, temp, model):
    """Send a single prompt to Gemini and return the stripped reply text.

    Args:
        prompt: Full prompt text sent as the request content.
        history: Unused by this function; accepted so the signature lines up
            with ``run_openai_query``.
        completion_tokens: Upper bound passed as ``max_output_tokens``.
        temp: Sampling temperature.
        model: Gemini model name. A separate client handle is cached per
            name, so calls with different names never share a handle.

    Returns:
        The reply text with surrounding whitespace removed, or ``None`` when
        the request failed or produced no text. On failure the function
        prints the error and sleeps 5 seconds before returning.

    Side effects:
        Adds the estimated cost of the call to ``gemini_costs.json`` (keyed
        by model name) while holding the module-level ``lock``.
    """
    global gemini_model
    try:
        handle = _gemini_models.get(model)
        if handle is None:
            if not _gemini_models:
                # First use in this process: set up the SDK once.
                genai.configure(api_key="")
            handle = genai.GenerativeModel(model)
            _gemini_models[model] = handle
        gemini_model = handle

        safety_settings = [
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        ]

        response = handle.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                candidate_count=1,
                max_output_tokens=completion_tokens,
                temperature=temp
            ),
            safety_settings=safety_settings
        )

        input_toks = response.usage_metadata.prompt_token_count
        output_toks = response.usage_metadata.candidates_token_count
        # NOTE(review): per-million-token prices are hard-coded and applied to
        # every model name alike -- confirm they match the model in use.
        cost_usd = input_toks / 1_000_000 * 1.25 + output_toks / 1_000_000 * 5

        # Update the cost ledger under the lock so concurrent workers do not
        # lose each other's increments.
        with lock:
            if os.path.exists('gemini_costs.json'):
                with open('gemini_costs.json', 'r') as f:
                    costs = json.load(f)
            else:
                costs = {}
            costs[model] = costs.get(model, 0) + cost_usd
            with open('gemini_costs.json', 'w') as f:
                json.dump(costs, f)

        # `response.text` raises when the response carries no parts (for
        # example a blocked prompt or candidate); that case is reported by
        # the `except` branch below.
        inference = response.text.strip() if response.text else None

        if inference:
            return inference
        else:
            print('Error: message is empty')
            time.sleep(5)

    except Exception as e:
        print("Request failed.")
        print(e)
        # Brief pause so a failing worker does not immediately fire the next request.
        time.sleep(5)

    return None

# %%
# Configuration for OpenAI
# NOTE(review): the key is blank in this copy of the file; presumably a real
# key has to be filled in before the client can make authenticated requests.
api_key = ""
# Module-level client instance. `run_openai_query` takes its client as an
# explicit parameter rather than reading this global.
openai_client = openai.OpenAI(api_key=api_key)

def run_openai_query(prompt, history, completion_tokens, temp, model, openai_client):
    """Send a chat-completion request and return the stripped reply text.

    Args:
        prompt: Text of the new user message.
        history: Earlier chat messages (list of ``{"role", "content"}``
            dicts) placed before the prompt. ``None`` is treated as empty.
        completion_tokens: Upper bound passed as ``max_tokens``.
        temp: Sampling temperature.
        model: Model name passed to the API.
        openai_client: Client object exposing ``chat.completions.create``.

    Returns:
        The reply text with surrounding whitespace removed, or ``None`` when
        the request failed or the reply was empty. In both failure cases the
        function prints a message and sleeps 5 seconds before returning.
    """
    try:
        # Build a new list so the caller's history is never mutated.
        messages = list(history or []) + [{"role": "user", "content": prompt}]

        response = openai_client.chat.completions.create(
            model=model,
            temperature=temp,
            max_tokens=completion_tokens,
            messages=messages,
        )
        # `content` may be None; treat that the same as an empty reply
        # instead of letting `.strip()` raise.
        content = (response.choices[0].message.content or "").strip()

        if content:
            return content

        print('Error: message is empty')
        time.sleep(5)

    except Exception as e:
        print("Request failed.")
        print(e)
        time.sleep(5)

    return None

# %%
# Function to parse scenes from text
# A "## SCENE <n>:" header line followed by the scene body, captured lazily
# up to the next scene header or the end of the text. Trailing spaces/tabs
# and a CRLF line ending after the header are tolerated. Compiled once at
# import time rather than on every call.
_SCENE_PATTERN = re.compile(
    r'## SCENE \d+:[ \t]*\r?\n(.+?)(?=(?:## SCENE \d+:|$))',
    re.DOTALL,
)


def parse_scenes_from_text(text: str):
    """Extract the body of every ``## SCENE <n>:`` section from *text*.

    Anything before the first scene header (such as a
    ``## THOUGHT PROCESS:`` section) is ignored.

    Args:
        text: Model output in the scene-selection format.

    Returns:
        A list of scene bodies in order of appearance, each stripped of
        surrounding whitespace. Returns an empty list when no scene header
        is present or when *text* is not a string.
    """
    if not isinstance(text, str):
        return []
    return [scene.strip() for scene in _SCENE_PATTERN.findall(text)]

# %%
# Selection prompt template
# The literal placeholder <TEXT> is replaced with a segment of the book before
# the prompt is sent. The "## SCENE <n>:" headings requested below are the
# markers that parse_scenes_from_text() looks for in the reply.
selection_prompt = """
[TEXT START]
<TEXT>
[TEXT END]

Your task is to examine the above **public domain** work for evocative, interesting, well-written scenes.

Instructions:

- Choose 3 long scenes of approx 1000-1500 words, and output them in their entirety.
- You must also repair paragraphs by restoring sentences within a paragraph that are broken by a newline.
- Double newline between paragraphs.
- Do not include chapter names.

Output in this format:

## THOUGHT PROCESS:

<thought process for the scene selection>

## SCENE 1:

<the text of approx 1000-1500 words>

## SCENE 2:

<the text of approx 1000-1500 words>

## SCENE 3:

<the text of approx 1000-1500 words>


--

Output precisely in this format. Do not add any additional commentary or explanations.
"""

# %%
# Function to detect file encoding
def detect_encoding(file_path: Path):
    """Guess the text encoding of *file_path* from its first 100,000 bytes.

    Returns 'latin-1' when the detector reports 'ascii' or 'charmap', the
    detected name when its confidence is at least 0.5, and 'utf-8' in every
    other case -- including any error while reading or detecting, which is
    printed rather than raised.
    """
    try:
        with open(file_path, 'rb') as raw:
            sample = raw.read(100000)  # only the head of the file is sampled
        guess = chardet.detect(sample)
        name = guess['encoding']
        score = guess['confidence']
        if not name:
            return 'utf-8'  # nothing detected
        if name.lower() in ('ascii', 'charmap'):
            return 'latin-1'
        return name if score >= 0.5 else 'utf-8'
    except Exception as err:
        print(f"Error detecting encoding for '{file_path}': {err}. Falling back to 'utf-8'.")
        return 'utf-8'  # Fallback encoding

# %%
# Worker function to process a single file
def _load_results(results_path):
    """Return the mapping stored in the results JSON file, or {} if it does not exist."""
    if os.path.exists(results_path):
        with open(results_path, 'r') as f:
            return json.load(f)
    return {}


def _read_book_text(zip_path):
    """Unzip *zip_path* into a temp dir and return its single text file's content, or None."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(tmpdirname)
        except zipfile.BadZipFile:
            print(f"Skipping '{zip_path}': Not a valid zip file.")
            return None
        except Exception as e:
            print(f"Error extracting '{zip_path}': {e}")
            return None

        text_files = [
            f for f in Path(tmpdirname).rglob("*")
            if f.is_file() and f.suffix.lower() in ['.txt', '.md', '.text']
        ]
        if len(text_files) != 1:
            print(f"Skipping '{zip_path}': Expected 1 text file, found {len(text_files)}.")
            return None

        text_file = text_files[0]
        encoding = detect_encoding(text_file)
        try:
            with open(text_file, 'r', encoding=encoding, errors='replace') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading '{text_file}': {e}")
            return None


def _split_segments(content, max_seg_length, parts=4):
    """Cut *content* into at most *parts* equal slices, each capped at *max_seg_length* chars."""
    step = max(1, len(content) // parts)  # range() rejects a step of 0
    return [content[i:i + step][:max_seg_length] for i in range(0, len(content), step)][:parts]


def process_file(fn, base_path, results_path, selection_prompt):
    """Select scenes from one zipped book and store them in the results file.

    The book's text is cut into up to four segments; each segment is sent to
    Gemini with the selection prompt and the scenes parsed from the replies
    are collected.

    Args:
        fn: Name of the zip archive inside *base_path*; also the key under
            which the scenes are stored.
        base_path: Directory containing the zip archives.
        results_path: JSON file mapping archive names to lists of scenes.
        selection_prompt: Prompt template containing the ``<TEXT>``
            placeholder.

    Returns:
        None. Books that are already recorded, missing, unreadable, or too
        short are skipped with a printed message. Otherwise the collected
        scenes (possibly an empty list if every segment failed) are added to
        the results file under the module-level ``lock``.
    """
    # Check if file already processed
    with lock:
        if fn in _load_results(results_path):
            print(f"Skipping '{fn}': already processed.")
            return

    # Check if file exists
    zip_path = os.path.join(base_path, fn)
    if not os.path.exists(zip_path):
        print(f"Skipping '{fn}': file does not exist.")
        return

    # The temp directory only lives for the duration of this call, so it is
    # not held open during the slow API requests below.
    content = _read_book_text(zip_path)
    if content is None:
        return

    if len(content) < 10000:
        print(f"Skipping '{fn}': content too small!")
        return

    # Drop the Project Gutenberg footer. str.find() returns -1 when the marker
    # is absent, so compare explicitly rather than relying on truthiness.
    footer_at = content.find('End of the Project Gutenberg')
    if footer_at != -1:
        content = content[:footer_at].strip()
    else:
        print('!! not found')

    if not content:
        print(f"Skipping '{fn}': no text before the Project Gutenberg footer.")
        return

    max_seg_length = 400000
    segments = _split_segments(content, max_seg_length)

    # Collect scenes from all segments
    all_scenes = []
    for i, segment in enumerate(segments):
        prompt = selection_prompt
        if i == 0:
            prompt += "\nThe first scene you select should be the very start of the book. The other two can be selected from anywhere."

        prompt = prompt.replace('<TEXT>', segment)

        result = run_gemini_query(prompt, [], 16000, 0, "gemini-1.5-pro-002")

        if result:
            this_scenes = parse_scenes_from_text(result)
            print(len(this_scenes))
            all_scenes.extend(this_scenes)
        else:
            print(f"Failed to get scenes for '{fn}' segment {i+1}.")

    # Re-read the results under the lock before writing: other workers may
    # have added entries while this one was waiting on the API.
    with lock:
        scenes = _load_results(results_path)
        if fn not in scenes:
            scenes[fn] = []
        scenes[fn].extend(all_scenes)
        with open(results_path, 'w') as f:
            json.dump(scenes, f, indent=2)


In [5]:
import json

# %%
# Main function to orchestrate multithreading
def main(
    filenames_path='data/romance_book_selections.json',
    base_path="/mnt/i/gutenberg_processed/",
    results_path="scenes_romance.json",
    num_threads=8,
    exclude_paths=(
        'data/fiction_book_selections.json',
        'data/scifi_book_selections.json',
    ),
):
    """Run ``process_file`` over a list of books using a thread pool.

    Calling ``main()`` with no arguments uses the romance selection list and
    the same paths, thread count and exclusion lists that were previously
    hard-coded.

    Args:
        filenames_path: JSON file listing the archive names to process.
        base_path: Directory that contains the archives.
        results_path: JSON file the extracted scenes are written to.
        num_threads: Number of worker threads.
        exclude_paths: JSON selection files whose entries are removed from
            the work list. An entry equal to *filenames_path* is ignored so
            a list never excludes itself.
    """
    # Load filenames
    with open(filenames_path, 'r') as f:
        filenames = json.load(f)

    # Remove books that already appear in another selection list.
    own_list = os.path.normpath(filenames_path)
    for exclude_path in exclude_paths:
        if os.path.normpath(exclude_path) == own_list:
            continue
        with open(exclude_path, 'r') as f:
            already_selected = set(json.load(f))  # set gives O(1) membership tests
        filenames = [fn for fn in filenames if fn not in already_selected]

    # The context manager closes the progress bar even if something below raises.
    with tqdm(total=len(filenames), desc="Processing files") as progress:

        # Done-callback: advance the bar and surface any exception the worker raised.
        def update_progress(future):
            progress.update(1)
            exception = future.exception()
            if exception:
                print(f"Error occurred: {exception}")

        # Leaving the executor's `with` block waits for every submitted task,
        # so no separate wait() call is needed.
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            for fn in filenames:
                future = executor.submit(process_file, fn, base_path, results_path, selection_prompt)
                future.add_done_callback(update_progress)

    print("Processing completed.")


In [6]:
# Start the multithreaded processing run defined in main().
main()

Processing files:   0%|          | 0/57 [00:00<?, ?it/s]

3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3


Processing files:   2%|▏         | 1/57 [07:17<6:48:38, 437.83s/it]

3
3
3
3
3
3
3


Processing files:   4%|▎         | 2/57 [09:28<3:55:33, 256.97s/it]

3
3
3


Processing files:   5%|▌         | 3/57 [10:17<2:25:51, 162.06s/it]

3
3


Processing files:   7%|▋         | 4/57 [10:48<1:37:25, 110.29s/it]

3
3
3


Processing files:   9%|▉         | 5/57 [11:52<1:21:08, 93.62s/it] 

3


Processing files:  11%|█         | 6/57 [12:20<1:00:46, 71.50s/it]

3


Processing files:  12%|█▏        | 7/57 [12:40<45:34, 54.70s/it]  

3
3
3
3


Processing files:  14%|█▍        | 8/57 [14:31<59:10, 72.46s/it]

3
3
3
3
3
3
3


Processing files:  16%|█▌        | 9/57 [16:42<1:12:43, 90.90s/it]

3


Processing files:  18%|█▊        | 10/57 [17:12<56:17, 71.87s/it] 

3
3
3
3
3
3
3
3
3
3
3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.


Processing files:  19%|█▉        | 11/57 [20:53<1:30:05, 117.52s/it]

Failed to get scenes for 'Mary_Johnston -- To_Have_and_To_Hold.zip' segment 4.
3


Processing files:  21%|██        | 12/57 [21:49<1:14:05, 98.79s/it] 

3


Processing files:  23%|██▎       | 13/57 [22:02<53:23, 72.81s/it]  

3


Processing files:  25%|██▍       | 14/57 [22:16<39:33, 55.19s/it]

3
3
3
3
3
3


Processing files:  26%|██▋       | 15/57 [25:12<1:04:05, 91.56s/it]

3


Processing files:  28%|██▊       | 16/57 [25:17<44:44, 65.47s/it]  

3
3
3


Processing files:  30%|██▉       | 17/57 [25:38<34:41, 52.04s/it]

3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Ellen_Glasgow -- Life_and_Gabriella.zip' segment 1.
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Ellen_Glasgow -- Virginia.zip' segment 1.
3
3
3


Processing files:  32%|███▏      | 18/57 [28:44<1:00:06, 92.47s/it]

3
3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Ellen_Glasgow -- Life_and_Gabriella.zip' segment 2.
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Ellen_Glasgow -- The_Romance_of_a_Plain_Man.zip' segment 1.
3
3


Processing files:  33%|███▎      | 19/57 [31:16<1:09:47, 110.19s/it]

3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Ellen_Glasgow -- Virginia.zip' segment 3.
3


Processing files:  35%|███▌      | 20/57 [32:50<1:04:55, 105.29s/it]

3


Processing files:  37%|███▋      | 21/57 [33:40<53:21, 88.92s/it]   

3


Processing files:  39%|███▊      | 22/57 [33:48<37:38, 64.53s/it]

3
3


Processing files:  40%|████      | 23/57 [34:05<28:24, 50.14s/it]

3


Processing files:  42%|████▏     | 24/57 [34:52<27:03, 49.18s/it]

3
3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Ellen_Glasgow -- The_Voice_of_the_People.zip' segment 1.
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
3
Failed to get scenes for 'Ellen_Glasgow -- The_Romance_of_a_Plain_Man.zip' segment 3.
3
3


Processing files:  44%|████▍     | 25/57 [38:14<50:42, 95.07s/it]

3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Mrs_E_D_E_N_Southworth -- For_Woman's_Love.zip' segment 2.
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
3


Processing files:  46%|████▌     | 26/57 [38:39<38:21, 74.25s/it]

Failed to get scenes for 'Ellen_Glasgow -- The_Romance_of_a_Plain_Man.zip' segment 4.
3
3
3
3
3
3
3
3


Processing files:  47%|████▋     | 27/57 [41:43<53:36, 107.21s/it]

3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.


Processing files:  49%|████▉     | 28/57 [42:28<42:45, 88.46s/it] 

Failed to get scenes for 'Ellen_Glasgow -- The_Voice_of_the_People.zip' segment 4.


Processing files:  51%|█████     | 29/57 [42:38<30:12, 64.74s/it]

3


Processing files:  53%|█████▎    | 30/57 [42:59<23:15, 51.67s/it]

3
3


Processing files:  54%|█████▍    | 31/57 [43:23<18:51, 43.51s/it]

3
3
3
3
3
3
3
3
3


Processing files:  56%|█████▌    | 32/57 [46:13<33:56, 81.44s/it]

3
3
3
3


Processing files:  58%|█████▊    | 33/57 [48:08<36:35, 91.46s/it]

3
3
3
3
3
3
3
3
3


Processing files:  60%|█████▉    | 34/57 [50:16<39:16, 102.46s/it]

3
3
3
3


Processing files:  61%|██████▏   | 35/57 [51:42<35:46, 97.58s/it] 

3


Processing files:  63%|██████▎   | 36/57 [51:45<24:13, 69.20s/it]

3
3
3


Processing files:  65%|██████▍   | 37/57 [52:47<22:18, 66.93s/it]

3


Processing files:  67%|██████▋   | 38/57 [53:21<18:02, 56.95s/it]

3
3
3


Processing files:  68%|██████▊   | 39/57 [53:46<14:15, 47.54s/it]

3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Gertrude_Atherton -- Senator_North.zip' segment 1.
3
3
3
3
3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.


Processing files:  70%|███████   | 40/57 [57:06<26:26, 93.34s/it]

3
Failed to get scenes for 'Gertrude_Atherton -- Senator_North.zip' segment 2.


Processing files:  72%|███████▏  | 41/57 [57:39<20:04, 75.27s/it]

3
3
3


Processing files:  74%|███████▎  | 42/57 [58:38<17:32, 70.14s/it]

3
3
3
3
3
Request failed.
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 7.
Failed to get scenes for 'Gertrude_Atherton -- Senator_North.zip' segment 3.


Processing files:  75%|███████▌  | 43/57 [59:45<16:08, 69.21s/it]

3
3
3


Processing files:  77%|███████▋  | 44/57 [1:01:10<16:04, 74.18s/it]

3
3
3


Processing files:  79%|███████▉  | 45/57 [1:01:28<11:25, 57.13s/it]

3
3
3
3


Processing files:  81%|████████  | 46/57 [1:03:32<14:10, 77.31s/it]

3
3


Processing files:  82%|████████▏ | 47/57 [1:03:38<09:19, 55.97s/it]

3
3
3
3
3
3
3
3


Processing files:  84%|████████▍ | 48/57 [1:06:21<13:12, 88.04s/it]

3
3
3


Processing files:  86%|████████▌ | 49/57 [1:06:37<08:50, 66.28s/it]

3
3
3
3
3


Processing files:  88%|████████▊ | 50/57 [1:09:13<10:52, 93.28s/it]

3
3
3
Request failed.
Invalid operation: The `response.parts` quick accessor requires a single candidate, but but `response.candidates` is empty.
This appears to be caused by a blocked prompt, see `response.prompt_feedback`: block_reason: PROHIBITED_CONTENT

Failed to get scenes for 'Sarah_Orne_Jewett -- Betty_Leicester.zip' segment 2.
3
3


Processing files:  89%|████████▉ | 51/57 [1:11:19<10:19, 103.17s/it]

3


Processing files:  91%|█████████ | 52/57 [1:11:20<06:01, 72.37s/it] 

3
3
3


Processing files:  93%|█████████▎| 53/57 [1:12:30<04:47, 71.75s/it]

3
3
3


Processing files:  95%|█████████▍| 54/57 [1:14:27<04:15, 85.16s/it]

3
3


Processing files:  96%|█████████▋| 55/57 [1:15:37<02:41, 80.65s/it]

3


Processing files:  98%|█████████▊| 56/57 [1:15:41<00:57, 57.70s/it]

3


Processing files: 100%|██████████| 57/57 [1:16:55<00:00, 80.97s/it]

3
Processing completed.



