In [None]:
import pandas as pd
import requests
import os 
import ast
import time
from time import sleep
import logging
from pydantic import RootModel, conint
from typing import Tuple

In [10]:
# System message sent with every API request. It defines what counts as a
# "female major character" (fmc) and instructs the model to answer with a
# two-element list: [fmc_present, fmc_name].
# NOTE(review): the examples inside the prompt use single-quoted Python-list
# syntax, while send_request also asks the API for JSON-schema structured
# output; whatever parses the answers should be checked against the format
# that actually comes back.
system_prompt = """
    You are a helpful literary assistant, helping me with my master thesis on gender in literature. 
    Your job is to search the internet and assess whether a book features a **female major character** (fmc).
    
   You MUST ALWAYS return your response as a Python list with exactly two elements: [fmc_present, fmc_name]
    - `fmc_present` = 1 if yes, 0 if no
    - `fmc_name` = the most important female major character's name, or an empty string.

    Never return anything else. No explanations, no full sentences. Just the list.

    Examples:
    - For "To Kill A Mockingbird" by Harper Lee (has a fmc): [1, 'Scout Finch']
    - For "The Hobbit and The Lord of the Rings" by J.R.R. Tolkien (doesn't have a fmc): [0, '']

    The female major character does not need to be the protagonist, but she must be a central presence in the narrative, 
    with **more significance than a secondary or background character**.

    A female character is considered a **major character** if she plays a substantial role in the story, 
    regularly appears throughout the book, and if her role significantly contributes to relationships or events that 
    shape the plot or other characters.

    Characters who serve only as symbolic figures, plot devices, or one-time motivators without meaningful presence 
    or development should not be considered major characters.

    Examples of female major characters:
    - **Katniss Everdeen** in **The Hunger Games** – protagonist, central to the plot, her choices drive the rebellion and story progression.
    – **Hermione Granger** in *Harry Potter* – not the protagonist, but central to plot progression and character development.
    
    Examples of female NON-major characters:
    - **Prim Everdeen** in **The Hunger Games** – important to Katniss's emotional motivation, but doesn’t influence the plot or have significant character development beyond being a catalyst.
    - **Professor McGonagall** in **Harry Potter** – important supporting character, but doesn’t influence the main plot or have significant character development beyond being a mentor and guide to Harry and his friends.
    """

# Per-book user message; {title} and {author} are filled in with str.format
# by send_request.
user_prompt_template = "Is there a female major character in the book {title} by {author}?"

In [4]:
# Specify answer format
# The JSON schema generated from AnswerFormat is sent to the API as the
# "response_format" of every request (see send_request).

# Define the tuple type: an integer restricted to 0 or 1, followed by a string
# (i.e. fmc_present, fmc_name)
MyFormat = Tuple[conint(ge=0, le=1), str]

# Use RootModel so that the tuple itself is the whole model (no wrapping field).
# NOTE(review): documented with comments rather than a class docstring, because
# pydantic may copy a docstring into the generated schema as a description,
# which would change the payload sent to the API -- confirm before adding one.
class AnswerFormat(RootModel[MyFormat]):
    pass

In [5]:
# Create data frame of 103 random books
# random_state is fixed so the same books are drawn on every run. Despite its
# name, df100 holds 103 rows. The sample keeps the row labels of all_books, so
# its index is not a consecutive 0..102 range.
all_books = pd.read_csv("books.csv")
df100 = all_books.sample(n=103, random_state=42)

In [6]:
# Send log records (INFO and above) to a file so failed requests can be reviewed later
logging.basicConfig(level=logging.INFO, filename='perplexity_api_log.log')

# API key comes from the environment; a missing variable raises KeyError right here
YOUR_API_KEY = os.environ["PERPLEXITY_API_KEY"]

# Chat-completions endpoint that every request is posted to
url = "https://api.perplexity.ai/chat/completions"

# HTTP headers shared by all requests: bearer authentication + JSON body
headers = {"Authorization": f"Bearer {YOUR_API_KEY}"}
headers["Content-Type"] = "application/json"

# In-memory buffers for results that have not yet been written to disk
# (four independent lists, kept in step with each other by the main loop)
responses, citations, titles, authors = [], [], [], []

In [7]:
# Function to send a request to the API
def send_request(title, author, retries=3, timeout=60, retry_delay=60):
    """Ask the API whether a book has a female major character.

    Parameters
    ----------
    title, author : str
        Inserted into ``user_prompt_template`` to build the user message.
    retries : int, default 3
        Number of additional attempts made after a failed one, so at most
        ``retries + 1`` requests are sent.
    timeout : float or tuple, default 60
        Passed to ``requests.post``. Without a timeout a stalled connection
        would block the whole run indefinitely; a timed-out request is
        handled like any other failure (logged, then retried).
    retry_delay : float, default 60
        Seconds to sleep before the next attempt, to avoid hitting rate limits.

    Returns
    -------
    tuple
        ``(answer, citation)``. ``answer`` is the message content of the first
        choice (``'No answer found'`` if that key is missing) and ``citation``
        is the response's ``"citations"`` field (``[]`` if absent). When every
        attempt fails, ``("Error: <last error message>", [])`` is returned
        instead of raising, so the calling loop keeps going.
    """
    user_prompt = user_prompt_template.format(title=title, author=author)
    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "max_tokens": 200,
        "temperature": 0,
        "response_format": {
            "type": "json_schema",
            "json_schema": {"schema": AnswerFormat.model_json_schema()},
        },
    }

    attempts_left = retries
    while True:
        try:
            # Make the request to the API (bounded by `timeout`)
            response = requests.post(url, json=payload, headers=headers, timeout=timeout)

            # Anything other than HTTP 200 counts as a failed attempt
            if response.status_code != 200:
                raise RuntimeError(f"API call failed with status code {response.status_code}")

            response_json = response.json()
            answer = response_json.get('choices', [{}])[0].get('message', {}).get('content', 'No answer found')
            citation = response_json.get("citations", [])
            return answer, citation

        except Exception as e:
            # Deliberately broad: network errors, bad status codes and malformed
            # response bodies are all logged and retried rather than aborting the run.
            logging.error(f"Error with request for {title} by {author}: {str(e)}")
            if attempts_left <= 0:
                logging.error(f"Failed after retries: {title} by {author}")
                return f"Error: {str(e)}", []
            logging.info(f"Retrying {attempts_left} more times...")
            sleep(retry_delay)  # Sleep to avoid hitting rate limits
            attempts_left -= 1

In [8]:
def save_progress():
    """Write the buffered results to ``progress.csv`` and empty the buffers.

    Reads the module-level lists ``titles``, ``authors``, ``responses`` and
    ``citations``. If ``titles`` is empty nothing happens. Otherwise the rows
    are appended (without a header) when ``progress.csv`` already exists, or
    written to a new file with a header when it does not. Afterwards all four
    lists are cleared so the next call only writes new rows.
    """
    if not titles:
        return  # nothing new since the last save

    batch = pd.DataFrame({
        'title': titles,
        'author': authors,
        'answer': responses,
        'citation': citations
    })

    appending = os.path.exists('progress.csv')
    if not appending:
        # First write: create the file including the header row
        batch.to_csv('progress.csv', index=False)
        logging.info(f"Progress saved: {len(titles)} books.")
    else:
        # Later writes: add rows only, the header is already there
        batch.to_csv('progress.csv', mode='a', header=False, index=False)
        logging.info(f"Progress appended: {len(titles)} books.")

    # Empty the buffers in place so the module-level names stay bound to the same lists
    for buffer in (titles, authors, responses, citations):
        buffer.clear()

In [9]:
start_time = time.time()

# Write buffered results to disk after this many books
SAVE_EVERY = 10

# Query the API for every sampled book -------------------
try:
    # Count iterations with enumerate instead of using the row label: df100 was
    # drawn with .sample(), so its labels are those of the source frame and
    # cannot serve as a 1..N counter for the periodic save.
    for position, (_, row) in enumerate(df100.iterrows(), start=1):
        title = row["title"]
        author = row["author"]

        # Send request and handle response
        answer, citation = send_request(title, author)

        # Store responses
        responses.append(answer)
        citations.append(citation)
        titles.append(title)
        authors.append(author)

        # Save progress every SAVE_EVERY books
        if position % SAVE_EVERY == 0:
            save_progress()
finally:
    # Save whatever is still buffered -- also when the run is interrupted or an
    # unexpected error escapes the loop, so finished requests are not lost.
    save_progress()

# Log the total execution time
end_time = time.time()
execution_time = end_time - start_time
logging.info(f"Completed {len(df100)} book requests in {execution_time:.2f} seconds.")

In [None]:
# # Save only those books with a 1 on female_character_present

# # Unnest response column to two columns
# df100["responses"] = df100["responses"].apply(ast.literal_eval) # responses column is a string; convert it to a list
# df100[["fmc_present", "fmc_name"]] = pd.DataFrame(df100["responses"].tolist(), index=df100.index)

# # Create new data frame with fmcs only
# df_fmc = df100[df100['fmc_present'] == 1].copy()

In [None]:
# # Save data frame
# df_fmc.to_csv('df_fmc.csv', index=False)