In [19]:
import pandas as pd
import requests
import os 

In [20]:
df10 = pd.read_csv('books10.csv')

In [21]:
system_prompt = """
    You are a helpful literary assistant, helping me with my master thesis on gender in literature. 
    Your job is to search the internet and assess whether a book features a **female major character** (fmc).
    
   You MUST ALWAYS return your response as a Python list: [fmc_present, fmc_name]
    - `fmc_present` = 1 if yes, 0 if no
    - `fmc_name` = the most important female major character's name, or an empty string.

    Never return anything else. No explanations, no full sentences. Just the list.

    Examples:
    - For "To Kill A Mockingbird" by Harper Lee (has a fmc): [1, 'Scout Finch']
    - For "The Hobbit and The Lord of the Rings" by J.R.R. Tolkien (doesn't have a fmc): [0, '']

    The female major character does not need to be the protagonist, but she must be a central presence in the narrative, 
    with more significance than a secondary or background character.

    A female character is considered a **major character** if she plays a substantial role in the story, 
    regularly appears throughout the book, and if her role significantly contributes to relationships or events that 
    shape the plot or other characters.

    Characters who serve only as symbolic figures, plot devices, or one-time motivators without meaningful presence 
    or development should not be considered major characters.

    Examples of female major characters:
    - **Katniss Everdeen** in **The Hunger Games** – protagonist, central to the plot, her choices drive the rebellion and story progression.
    – **Hermione Granger** in *Harry Potter* – not the protagonist, but central to plot progression and character development.
    
    Examples of female NON-major characters:
    - **Prim Everdeen** in **The Hunger Games** – important to Katniss's emotional motivation, but doesn’t influence the plot or have significant character development beyond being a catalyst.
    - **Professor McGonagall** in **Harry Potter** – important supporting character, but doesn’t influence the main plot or have significant character development beyond being a mentor and guide to Harry and his friends.
    """

user_prompt_template = "Is there a female main character in the book {title} by {author}?"

In [22]:
# Find 20 novels, check by hand

# Original data with an extra column
books = [
    {"Title": "Pride and Prejudice", "Author": "Jane Austen", "Female_Major_Character": 1},  # Elizabeth Bennet
    {"Title": "To Kill a Mockingbird", "Author": "Harper Lee", "Female_Major_Character": 1},  # Scout Finch
    {"Title": "The Book Thief", "Author": "Markus Zusak", "Female_Major_Character": 1},  # Liesel Meminger
    {"Title": "Twilight", "Author": "Stephenie Meyer", "Female_Major_Character": 1},  # Bella Swan
    {"Title": "The Fault in Our Stars", "Author": "John Green", "Female_Major_Character": 1},  # Hazel Grace Lancaster
    {"Title": "The Perks of Being a Wallflower", "Author": "Stephen Chbosky", "Female_Major_Character": 1},  # Sam
    {"Title": "Brave New World", "Author": "Aldous Huxley", "Female_Major_Character": 1}, #Lenina Crowne
    {"Title": "A Game of Thrones (A Song of Ice and Fire, #1)", "Author": "George R.R. Martin", "Female_Major_Character": 1},  # Daenerys Targaryen, Arya Stark, Cersei Lannister, ...
    {"Title": "The Lightning Thief", "Author": "Rick Riordan", "Female_Major_Character": 1},  # Annabeth Chase
    {"Title": "Lolita", "Author": "Vladimir Nabokov", "Female_Major_Character": 1},  # Dolores "Lolita" Haze

    {"Title": "The Hobbit and The Lord of the Rings", "Author": "J.R.R. Tolkien", "Female_Major_Character": 0},
    {"Title": "The Little Prince", "Author": "Antoine de Saint-Exupéry", "Female_Major_Character": 0},
    {"Title": "Crime and Punishment", "Author": "Fyodor Dostoevsky", "Female_Major_Character": 0},  
    {"Title": "Lord of the Flies", "Author": "William Golding", "Female_Major_Character": 0}, 
    {"Title": "The Adventures of Huckleberry Finn", "Author": "Mark Twain", "Female_Major_Character": 0}, 
    {"Title": "The Old Man and the Sea", "Author": "Ernest Hemingway", "Female_Major_Character": 0}, 
    {"Title": "Alex Rider", "Author": "Anthony Horowitz", "Female_Major_Character": 0},  
    {"Title": "Der Trafikant", "Author": "Robert Seethaler", "Female_Major_Character": 0}, 
]

# Create the DataFrame
df_books = pd.DataFrame(books)

# Display the updated DataFrame
print(df_books)

                                             Title                    Author  \
0                              Pride and Prejudice               Jane Austen   
1                            To Kill a Mockingbird                Harper Lee   
2                                   The Book Thief              Markus Zusak   
3                                         Twilight           Stephenie Meyer   
4                           The Fault in Our Stars                John Green   
5                  The Perks of Being a Wallflower           Stephen Chbosky   
6                                  Brave New World             Aldous Huxley   
7   A Game of Thrones (A Song of Ice and Fire, #1)        George R.R. Martin   
8                              The Lightning Thief              Rick Riordan   
9                                           Lolita          Vladimir Nabokov   
10            The Hobbit and The Lord of the Rings            J.R.R. Tolkien   
11                               The Lit

In [23]:
from pydantic import RootModel, conint
from typing import Tuple

# Define the tuple type: (0 or 1, followed by a string)
MyFormat = Tuple[conint(ge=0, le=1), str]

# Use RootModel to define the model
class AnswerFormat(RootModel[MyFormat]):
    pass

In [10]:
# Setting my API key
YOUR_API_KEY = os.environ["PERPLEXITY_API_KEY"]

# Saying hi it's me, and this is what I'll be sending
headers = {
    "Authorization": f"Bearer {YOUR_API_KEY}",
    "Content-Type": "application/json"
}

# This is where I want to send it to
url = "https://api.perplexity.ai/chat/completions"

# I'll store the responses I get in this (as of now, empty) list
responses = []

# Setting up what I want to send to Perplexity
for index, row in df_books.iterrows():
    # Extract title and author from each row 
    title = row["Title"]
    author = row["Author"]

    # Format user prompt with current title and author
    user_prompt = user_prompt_template.format(title=title, author=author)

    # Payload
    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "max_tokens": 200,
        "temperature": 0,  # Controls randomness in the response
        # "top_p": 0.9,
        # "search_domain_filter": ["<any>"],
        # "return_images": False,
        # "return_related_questions": False,
        # "search_recency_filter": "<string>",
        # "top_k": 0,
        # "stream": False,
        # "presence_penalty": 0,
        # "frequency_penalty": 1,
        "response_format": {
          "type": "json_schema",
          "json_schema": {"schema": AnswerFormat.model_json_schema()},
        },
        # "web_search_options": {"search_context_size": "high"}
    }

    # Send the request to the Perplexity API
    response = requests.post(url, json=payload, headers=headers)
    
    # Check the response
    if response.status_code == 200:
        response_json = response.json()
        # Extract the answer from the response (this might vary depending on the response format)
        answer = response_json.get('choices', [{}])[0].get('message', {}).get('content', 'No answer found')
        responses.append(answer)  # Store the answer
    else:
        # Handle errors by appending an error message
        responses.append(f"Error: {response.status_code}")

# Add the responses to your original dataframe
df_books['test'] = responses

# Print the updated dataframe to check the results
print(df_books)

                                             Title                    Author  \
0                              Pride and Prejudice               Jane Austen   
1                            To Kill a Mockingbird                Harper Lee   
2                                   The Book Thief              Markus Zusak   
3                                         Twilight           Stephenie Meyer   
4                           The Fault in Our Stars                John Green   
5                  The Perks of Being a Wallflower           Stephen Chbosky   
6                                  Brave New World             Aldous Huxley   
7   A Game of Thrones (A Song of Ice and Fire, #1)        George R.R. Martin   
8                              The Lightning Thief              Rick Riordan   
9                                           Lolita          Vladimir Nabokov   
10            The Hobbit and The Lord of the Rings            J.R.R. Tolkien   
11                               The Lit

There is one case that I marked to not have a female character, where Perplexity indicated the presence of a major female character. I would say that for this case (Crime and Punishment by Fyodor Dostoyevski) it's a grey area. I don't think we risk anything by including this case and cases similar to this one. Overall, the female major character annotation seems to work well. 

Next, I want to try whether instead of looping through one case at a time (which means a single request/payload for every book), I can also pass a batch of around 10 books within one request. This would be cheaper, since we have to pay for the API calls per request.

In [24]:
# Batch

# Make small df
all_books = pd.read_csv("books.csv")
df100 = all_books.sample(n=100, random_state=42)
print(df100.head)

<bound method NDFrame.head of                                                    title  \
3214    The Mysterious Disappearance of Leon I Mean Noel   
5913           The Summer I Turned Pretty The Collection   
4397                         Complete Me Stark Trilogy 3   
4638           One More Chance Rosemary Beach 8 Chance 2   
16166                  Not a Penny More Not a Penny Less   
...                                                  ...   
15985                     Fear and Loathing in Las Vegas   
7624      The Black Cauldron The Chronicles of Prydain 2   
16328  Doms of Dark Haven Truckee Wolves 2 Hawkeye 25...   
13581                   The Will of the Many Hierarchy 1   
6260           Divide and Conquer Tom Clancys OpCenter 7   

                     author                                             genres  
3214          Raskin, Ellen  ['Mystery', 'Fiction', 'Young Adult', 'Childre...  
5913           Han, Jenny *  ['Romance', 'Young Adult', 'Contemporary', 'Fi...  
4397  

In [27]:
import pandas as pd
import requests
from tqdm import tqdm
import json

# Create batches
batch_size = 5
batches = [df100.iloc[i:i + batch_size] for i in range(0, len(df100), batch_size)]

# Setting my API key
YOUR_API_KEY = os.environ["PERPLEXITY_API_KEY"]

# Saying hi it's me, and this is what I'll be sending
headers = {
    "Authorization": f"Bearer {YOUR_API_KEY}",
    "Content-Type": "application/json"
}

# This is where I want to send it to
url = "https://api.perplexity.ai/chat/completions"

results = []

for batch in tqdm(batches, desc="Processing batches 🪄"):
    for _, row in batch.iterrows():
        title = str(row['title'])
        author = str(row['author'])

        if not title or not author or "unknown" in title.lower():
            continue  # Skip empty/messy values

        user_prompt = f"Is there a female main character in the book '{title}' by '{author}'?"

        payload = {
            "model": "sonar",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        }

        try:
            response = requests.post(url, json=payload, headers=headers)
            response.raise_for_status()
            content = response.json()['choices'][0]['message']['content']

            try:
                parsed = eval(content)  # risky, but works for controlled outputs
                results.append({
                    "title": title,
                    "author": author,
                    "fmc_present": parsed[0],
                    "fmc_name": parsed[1]
                })
            except Exception as e:
                print(f"⚠️ Could not parse response for '{title}': {content} | Error: {e}")

        except Exception as e:
            print(f"🚨 API call failed for '{title}': {e}")

# Optional: save to CSV or DataFrame
df_results = pd.DataFrame(results)
#df_results.to_csv("fmc_results.csv", index=False)


Processing batches 🪄:   0%|          | 0/20 [00:00<?, ?it/s]

Processing batches 🪄: 100%|██████████| 20/20 [04:20<00:00, 13.02s/it]


In [28]:
print(df_results)

                                                title                author  \
0    The Mysterious Disappearance of Leon I Mean Noel         Raskin, Ellen   
1           The Summer I Turned Pretty The Collection          Han, Jenny *   
2                         Complete Me Stark Trilogy 3          Kenner, J. *   
3           One More Chance Rosemary Beach 8 Chance 2        Glines, Abbi *   
4                   Not a Penny More Not a Penny Less     Archer, Jeffrey *   
..                                                ...                   ...   
95                     Fear and Loathing in Las Vegas   Thompson, Hunter S.   
96     The Black Cauldron The Chronicles of Prydain 2      Alexander, Lloyd   
97  Doms of Dark Haven Truckee Wolves 2 Hawkeye 25...  Cartwright, Sierra *   
98                   The Will of the Many Hierarchy 1    Islington, James *   
99          Divide and Conquer Tom Clancys OpCenter 7         Rovin, Jeff *   

    fmc_present               fmc_name  
0         

In [None]:
# Save only those books with a 1 on female_character_present