In [1]:
import pandas as pd
import requests
import os 
import ast
import time
from time import sleep
import logging
from pydantic import RootModel, conint
from typing import Tuple

In [3]:
# System message sent with every API request (used by send_request as the
# "system" role content). It defines what counts as a "female major character"
# and tells the model to answer with exactly two elements:
# [fmc_present, fmc_name].
# NOTE(review): the examples inside the prompt use Python-style single quotes,
# while send_request also asks the API for JSON-schema output; the saved
# answers shown later in this notebook use double quotes.
# NOTE(review): this text is runtime input to the model, so any edit (even
# whitespace) may change the answers - keep it identical across subsets.
system_prompt = """
    You are a helpful literary assistant, helping me with my master thesis on gender in literature. 
    Your job is to search the internet and assess whether a book features a **female major character** (fmc).
    
   You MUST ALWAYS return your response as a Python list with exactly two elements: [fmc_present, fmc_name]
    - `fmc_present` = 1 if yes, 0 if no
    - `fmc_name` = the most important female major character's name, or an empty string.

    Never return anything else. No explanations, no full sentences. Just the list.

    Examples:
    - For "To Kill A Mockingbird" by Harper Lee (has a fmc): [1, 'Scout Finch']
    - For "The Hobbit and The Lord of the Rings" by J.R.R. Tolkien (doesn't have a fmc): [0, '']

    The female major character does not need to be the protagonist, but she must be a central presence in the narrative, 
    with **more significance than a secondary or background character**.

    A female character is considered a **major character** if she plays a substantial role in the story, 
    regularly appears throughout the book, and if her role significantly contributes to relationships or events that 
    shape the plot or other characters.

    Characters who serve only as symbolic figures, plot devices, or one-time motivators without meaningful presence 
    or development should not be considered major characters.

    Examples of female major characters:
    - **Katniss Everdeen** in **The Hunger Games** – protagonist, central to the plot, her choices drive the rebellion and story progression.
    – **Hermione Granger** in *Harry Potter* – not the protagonist, but central to plot progression and character development.
    
    Examples of female NON-major characters:
    - **Prim Everdeen** in **The Hunger Games** – important to Katniss's emotional motivation, but doesn’t influence the plot or have significant character development beyond being a catalyst.
    - **Professor McGonagall** in **Harry Potter** – important supporting character, but doesn’t influence the main plot or have significant character development beyond being a mentor and guide to Harry and his friends.
    """

# Per-book user message; send_request fills {title} and {author} via str.format.
user_prompt_template = "Is there a female major character in the book {title} by {author}?"

In [4]:
# Specify the answer format used for structured output.
# send_request passes AnswerFormat.model_json_schema() to the API as the
# "response_format" JSON schema.

# Two-element tuple type: an integer constrained to 0..1 (the fmc_present
# flag), followed by a string (the character name; the prompt asks for an
# empty string when there is none).
MyFormat = Tuple[conint(ge=0, le=1), str]

# RootModel wraps the bare tuple so that the model's root value *is* the
# tuple rather than an object with named fields.
# NOTE(review): documented with comments on purpose - pydantic may emit a
# class docstring as the schema "description", which would change the payload
# sent to the API. Confirm before adding a docstring here.
class AnswerFormat(RootModel[MyFormat]):
    pass

In [5]:
# Load all books and clean them in one chain:
#   1. drop the unused `genres` column,
#   2. keep the first row of every (title, author) pair,
#   3. drop rows that lack a title or an author,
#   4. renumber the index.
# The index is reset only AFTER all row filtering. Resetting it before
# `dropna` (as was done previously) leaves gaps in the labels whenever a row
# is dropped, so labels stop matching row positions. The set and order of
# rows is unchanged, so positional slices (`.iloc[...]`) select the same books.
df_all = (
    pd.read_csv("books.csv")
    .drop(columns=["genres"])
    .drop_duplicates(subset=["title", "author"])
    .dropna(subset=["title", "author"])
    .reset_index(drop=True)
)

# Strip the literal "*" marker and surrounding whitespace from author names.
# NOTE(review): this runs after de-duplication, so two rows that differ only
# by a "*" in the author are both kept. Left as is to keep row positions
# stable for subsets that were already processed.
df_all["author"] = df_all["author"].str.replace("*", "", regex=False).str.strip()
df_all

Unnamed: 0,title,author
0,Jackie Me Baseball Card Adventure 2,"Gutman, Dan"
1,Honus Me A Baseball Card Adventure 1,"Gutman, Dan"
2,Wildflower Hill,"Freeman, Kimberley"
3,Oracle Night,"Auster, Paul"
4,As a Driven Leaf,"Steinberg, Milton"
...,...,...
17999,Truly Madly Deeply,"Kazi, Faraaz"
18000,Sloop of War Richard Bolitho 6,"Kent, Alexander"
18001,Purification Autumn 3,"Moody, David"
18002,The American Heiress,"Goodwin, Daisy"


In [6]:
# The full book list is processed in several positional slices to limit the
# damage if a run fails part-way.
# Throughput observed so far: around 1,525 books per 60 minutes.
#
# Slice 1 - first try, about 2 hours, so around 3,050 books:
# df_subset_1 = df_all.iloc[:3050].copy()
# df_subset_1
#
# Slice 2:
# df_subset_2 = df_all.iloc[3050:14000].copy()
# df_subset_2

# Slice 3 - everything from position 14000 to the end.
# `.copy()` makes the subset independent of df_all.
THIRD_SLICE = slice(14000, None)
df_subset_3 = df_all.iloc[THIRD_SLICE].copy()
df_subset_3

Unnamed: 0,title,author
14000,Scarlet King Raven 2,"Lawhead, Stephen R."
14001,Elsies Endless Wait A Life of Faith Elsie Dins...,"Finley, Martha"
14002,Elsies New Life A Life of Faith Elsie Dinsmore 3,"Finley, Martha"
14003,Tis the Season for Revenge Seasons of Revenge 1,"Elizabeth, Morgan"
14004,A Soul to Keep Duskwalker Brides 1,"Reyne, Opal"
...,...,...
17999,Truly Madly Deeply,"Kazi, Faraaz"
18000,Sloop of War Richard Bolitho 6,"Kent, Alexander"
18001,Purification Autumn 3,"Moody, David"
18002,The American Heiress,"Goodwin, Daisy"


In [7]:
# Set up logging for error tracking.
# `force=True` removes any handlers already attached to the root logger before
# configuring it. Without it, basicConfig does nothing when the root logger is
# already configured, so changing the file name below and re-running this cell
# in the same kernel would keep writing to the old log file.
# The format adds a timestamp so long runs can be reconstructed from the log.
logging.basicConfig(
    filename='subset3_log.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    force=True,
)

# API key is read from the environment; a missing variable raises KeyError
# here rather than failing later on the first request.
YOUR_API_KEY = os.environ["PERPLEXITY_API_KEY"]

# Headers for the API request
headers = {
    "Authorization": f"Bearer {YOUR_API_KEY}",
    "Content-Type": "application/json"
}

# Endpoint for the API
url = "https://api.perplexity.ai/chat/completions"

# In-memory buffers for results that have not been written to disk yet.
# The main loop appends to them and save_progress() flushes and clears them,
# so all four lists must always have the same length.
responses = []
citations = []
titles = []
authors = []

In [8]:
# Function to send a request to the API
def send_request(title, author, retries=3, retry_delay=60, timeout=120):
    """Ask the API whether a book has a female major character.

    Builds the chat payload from the module-level ``system_prompt`` and
    ``user_prompt_template``, posts it to ``url`` with ``headers``, and
    returns the model's raw answer string plus the citations list.

    Any failure (non-200 status, network error, timeout, undecodable body) is
    logged and retried after ``retry_delay`` seconds. This is deliberately
    best-effort: the function never raises for a failed request, so one bad
    book cannot abort a long batch run.

    Parameters
    ----------
    title, author : str
        Values substituted into ``user_prompt_template``.
    retries : int, default 3
        Number of additional attempts after the first one fails.
    retry_delay : float, default 60
        Seconds to wait before each retry (to stay clear of rate limits).
    timeout : float, default 120
        Seconds passed to ``requests.post`` as ``timeout``. Without a timeout
        a stalled connection would block the whole run indefinitely.

    Returns
    -------
    tuple
        ``(answer, citations)`` on success. ``answer`` is the message content
        as returned by the API, or ``'No answer found'`` when the response
        carries no choices or no content. ``citations`` is the response's
        ``"citations"`` value, or ``[]`` when absent.
        After the last failed attempt: ``("Error: <message>", [])``.
    """
    user_prompt = user_prompt_template.format(title=title, author=author)
    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "max_tokens": 200,
        "temperature": 0,
        "response_format": {
            "type": "json_schema",
            "json_schema": {"schema": AnswerFormat.model_json_schema()},
        },
    }

    # Iterative retry instead of recursion: same attempt count, flat stack.
    attempts_left = retries
    while True:
        try:
            # Make the request to the API
            response = requests.post(url, json=payload, headers=headers, timeout=timeout)

            # Treat every non-200 status as a failure handled by the retry path
            if response.status_code != 200:
                raise Exception(f"API call failed with status code {response.status_code}")

            response_json = response.json()
            # `or [{}]` also covers an empty "choices" list, which would
            # otherwise raise IndexError and be retried for no benefit.
            choices = response_json.get('choices') or [{}]
            answer = choices[0].get('message', {}).get('content', 'No answer found')
            citation = response_json.get("citations", [])
            return answer, citation

        except Exception as e:
            # Log the error and retry if retries left
            logging.error(f"Error with request for {title} by {author}: {str(e)}")
            if attempts_left <= 0:
                logging.error(f"Failed after retries: {title} by {author}")
                return f"Error: {str(e)}", []

            logging.info(f"Retrying {attempts_left} more times...")
            sleep(retry_delay)  # Sleep to avoid hitting rate limits
            attempts_left -= 1

In [9]:
def save_progress():
    """Flush the buffered results to ``progress_fmc.csv`` and empty the buffers.

    Reads the module-level lists ``titles``, ``authors``, ``responses`` and
    ``citations``. If ``titles`` is empty, nothing happens. Otherwise the
    buffered rows are appended to the CSV (without a header) when the file
    already exists, or written as a new file with a header when it does not.
    All four lists are then cleared in place so the next call only writes
    rows collected since this one.
    """
    # Nothing buffered -> nothing to write
    if not titles:
        return

    batch = pd.DataFrame({
        'title': titles,
        'author': authors,
        'answer': responses,
        'citation': citations
    })

    if os.path.exists('progress_fmc.csv'):
        # Existing file: add rows only, the header is already there
        batch.to_csv('progress_fmc.csv', mode='a', header=False, index=False)
        logging.info(f"Progress appended: {len(titles)} books.")
    else:
        # First write: create the file including the header row
        batch.to_csv('progress_fmc.csv', index=False)
        logging.info(f"Progress saved: {len(titles)} books.")

    # Empty the buffers in place (other cells hold references to these lists)
    for buffer in (titles, authors, responses, citations):
        buffer.clear()

In [10]:
start_time = time.time()

# Write buffered results to disk after this many processed books.
SAVE_EVERY = 300

# Number of books actually processed in this run (reported in the final log).
books_processed = 0

# Run for df_subset_3
try:
    # Count processed books with enumerate instead of relying on the DataFrame
    # index labels: the subset's labels start at an arbitrary offset and need
    # not be contiguous, so `(label + 1) % N` gives irregular checkpoints.
    book_pairs = zip(df_subset_3["title"], df_subset_3["author"])
    for position, (title, author) in enumerate(book_pairs, start=1):
        # Send request and handle response
        answer, citation = send_request(title, author)

        # Store responses (the four buffers must stay the same length)
        responses.append(answer)
        citations.append(citation)
        titles.append(title)
        authors.append(author)
        books_processed = position

        # Save progress every SAVE_EVERY books
        if position % SAVE_EVERY == 0:
            save_progress()
finally:
    # Always flush what is still buffered - also when the run is interrupted
    # or fails - so results that were already paid for are not lost.
    save_progress()

    # Log the total execution time with the real number of processed books
    end_time = time.time()
    execution_time = end_time - start_time
    logging.info(f"Completed {books_processed} book requests in {execution_time:.2f} seconds.")

# First (small) round
$61.97 to $45.21
-> $16.76 spent

3,050 books -> $16.76
18,004 books -> $98.93  (18,004 / 3,050 ≈ 5.903; 5.903 × $16.76 ≈ $98.93)

# Second (big) round
$93.72 to $33.54
-> $60.18 spent

In [11]:
# Reload the checkpoint file written by save_progress().
# save_progress() appends to this file whenever it already exists, so it holds
# every row written since the file was created, not only the latest run.
subset_3 = pd.read_csv("progress_fmc.csv")

In [12]:
# Display the reloaded results as the cell output.
subset_3

Unnamed: 0,title,author,answer,citation
0,Scarlet King Raven 2,"Lawhead, Stephen R.","[0, """"]",['https://www.goodreads.com/book/show/611787.S...
1,Elsies Endless Wait A Life of Faith Elsie Dins...,"Finley, Martha","[1, ""Elsie Dinsmore""]",['https://mooresville.evergreenindiana.org/Rec...
2,Elsies New Life A Life of Faith Elsie Dinsmore 3,"Finley, Martha","[1, ""Elsie Dinsmore""]",['https://en.wikipedia.org/wiki/Elsie_Dinsmore...
3,Tis the Season for Revenge Seasons of Revenge 1,"Elizabeth, Morgan","[1, ""Abbie Keller""]",['https://lifeaccordingtojamie.com/2023/07/09/...
4,A Soul to Keep Duskwalker Brides 1,"Reyne, Opal","[1, ""Reia""]",['https://www.goodreads.com/book/show/61352716...
...,...,...,...,...
3999,Truly Madly Deeply,"Kazi, Faraaz","[1, ""Seema""]",['https://www.goodreads.com/book/show/9996645'...
4000,Sloop of War Richard Bolitho 6,"Kent, Alexander","[0, """"]",['https://www.goodreads.com/book/show/999733.S...
4001,Purification Autumn 3,"Moody, David","[0, """"]",['http://everythingalyce.blogspot.com/2016/04/...
4002,The American Heiress,"Goodwin, Daisy","[1, ""Cora Cash""]",['https://www.bookishwayfarer.com/blog/review-...
