In [1]:
import pandas as pd
import requests
import os 
import ast

In [None]:
# Load the book list and keep only the three columns the prompts need
df_fmc = pd.read_csv("df_fmc.csv").loc[:, ['title', 'author', 'fmc_name']]
print(df_fmc.head())

                                              title              author  \
0  The Mysterious Disappearance of Leon I Mean Noel       Raskin, Ellen   
1         The Summer I Turned Pretty The Collection        Han, Jenny *   
2                       Complete Me Stark Trilogy 3        Kenner, J. *   
3         One More Chance Rosemary Beach 8 Chance 2      Glines, Abbi *   
4                 Sign Off Caught Dead in Wyoming 1  McLinn, Patricia *   

                     fmc_name  
0               Mrs. Carillon  
1               Belly Conklin  
2                       Nikki  
3              Harlow Manning  
4  Elizabeth 'E. M.' Danniher  


In [28]:
# System message sent unchanged with every request in the API loop below.
# It asks 27 numbered questions and expects one 27-element list per book, in
# question order: author gender (0/1), publication year, a list of up to five
# genres, then 24 binary (0/1) answers about the named female character.
# The prompt tells the model to answer 0 when it finds no information, so a 0
# in the character-related items can mean either "no" or "not found".
# NOTE(review): question 8 reads "problem is solved" (grammar slip) and the
# example output uses Python-style single quotes while the request asks for
# JSON-schema output. The text is runtime data, so it is left untouched here;
# confirm before editing, since any change can shift the model's answers.
system_prompt = """
    You are a helpful literary assistant. Your job is to search the internet and assess several aspects for books that are provided to you.

    You return a Python list for each book, where each element is the answer to a question. The list always needs to have exactly 27 elements.
    Most of the questions are about a female major character of the book whose name will be provided alongside the book title. 
    For all questions about the female character, consider any occurrence of the trait or behavior at any point in the book.

    1. What's the gender of the author? (0 = male, 1 = female)  
    2. In what year was the book published? (numeric)  
    3. (As a list) What is/are the genre(s) of the book? (list of a maximum of 5 genres in the order of their relevance, less are okay too)  
    4. Is the female character at any point of the story saved by a male character? (0 = no, 1 = yes)  
    5. Does the female character at any point of the story save a male character? (0 = no, 1 = yes)  
    6. Is the female character protected by a male character? (0 = no, 1 = yes)  
    7. Does the female character protect a male character? (0 = no, 1 = yes)  
    8. Is the female character’s problem is solved through help or luck? (0 = no, 1 = yes)  
    9. Does the female character solve her own problem through skill? (0 = no, 1 = yes)  
    10. Is the female character victimized/harmed by a male character? (0 = no, 1 = yes)  
    11. Is the female character a perpetrator or does she harm a male character? (0 = no, 1 = yes)  
    12. Does the female character follow orders? (0 = no, 1 = yes)  
    13. Does the female character give orders? (0 = no, 1 = yes)  
    14. Is the female character admired for her beauty? (0 = no, 1 = yes)  
    15. Is the female character admired for her intelligence? (0 = no, 1 = yes)  
    16. Is the female character a homemaker? (0 = no, 1 = yes)  
    17. Is the female character a breadwinner? (0 = no, 1 = yes)  
    18. Is the female character skilled in domestic tasks? (0 = no, 1 = yes)  
    19. Is the female character not skilled or uninterested in domestic tasks? (0 = no, 1 = yes)  
    20. Does the female character have a lower rank occupation (e.g., nurse, assistant, maid, …)? (0 = no, 1 = yes)  
    21. Does the female character have a higher rank occupation (doctor, manager, ruler, …)? (0 = no, 1 = yes)  
    22. Is the female character physically weak/incapable? (0 = no, 1 = yes)  
    23. Is the female character physically strong/capable? (0 = no, 1 = yes)  
    24. Does the female character have low self-esteem/is emotionally fragile? (0 = no, 1 = yes)  
    25. Does the female character have high self-esteem/is emotionally strong? (0 = no, 1 = yes)  
    26. Is the female character scared of taking risks/challenges? (0 = no, 1 = yes)  
    27. Does the female character like taking risks/challenges? (0 = no, 1 = yes)  

    If you cannot find information for a female-character-related question, answer the question with a 0.

    Example output: 
    [1, 2015, ['fantasy', 'young adult'], 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1]
"""

# Per-book user message. The {title}, {author} and {fmc_name} placeholders are
# filled with str.format() from each df_fmc row inside the API loop.
user_prompt_template = """
    Please answer the questions defined in your system prompt for the book {title} by {author}.
    The female character in question is {fmc_name}.
"""

In [44]:
# Specify answer format

from pydantic import RootModel
from typing import List, Literal, Tuple

# A single yes/no answer, encoded as the integer 0 or 1.
_BINARY = Literal[0, 1]


class AnswerFormat(RootModel):
    # Positional layout of one answer list (27 items in total):
    #   item 1      -> author gender (0 or 1)
    #   item 2      -> publication year
    #   item 3      -> list of genres or tags
    #   items 4-27  -> 24 yes/no flags
    # Deliberately documented with comments rather than a docstring: pydantic
    # would copy a class docstring into the generated JSON schema.
    root: Tuple[(_BINARY, int, List[str]) + (_BINARY,) * 24]


In [45]:
# Setting my API key (read from the environment so it never lands in the notebook)
YOUR_API_KEY = os.environ["PERPLEXITY_API_KEY"]

# Saying hi it's me, and this is what I'll be sending
headers = {
    "Authorization": f"Bearer {YOUR_API_KEY}",
    "Content-Type": "application/json"
}

# This is where I want to send it to
url = "https://api.perplexity.ai/chat/completions"

# Upper bound (in seconds) for a single request. Without a timeout, requests
# waits indefinitely and one stalled connection would hang the whole run.
REQUEST_TIMEOUT_SECONDS = 120

# I'll store the responses I get in this (as of now, empty) list
responses = []

# I'll store the citations for each answer in this list
citations = []

# One request per book. Every iteration appends exactly one entry to BOTH
# lists, so they always have the same length as df_fmc, even when a request fails.
for index, row in df_fmc.iterrows():
    # Extract title, author and character name from each row
    title = row["title"]
    author = row["author"]
    fmc_name = row["fmc_name"]

    # Format user prompt with the current book
    user_prompt = user_prompt_template.format(title=title, author=author, fmc_name=fmc_name)

    # Payload
    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "max_tokens": 2000,
        "temperature": 0,  # Controls randomness in the response
        # "top_p": 0.9,
        # "search_domain_filter": ["<any>"],
        # "return_images": False,
        # "return_related_questions": False,
        # "search_recency_filter": "<string>",
        # "top_k": 0,
        # "stream": False,
        # "presence_penalty": 0,
        # "frequency_penalty": 1,
        "response_format": {
          "type": "json_schema",
          "json_schema": {"schema": AnswerFormat.model_json_schema()},
        },
        # "web_search_options": {"search_context_size": "high"}
    }

    # Failed requests keep an empty source list so both columns stay aligned
    citation = []

    try:
        # Send the request to the Perplexity API
        response = requests.post(url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

        if response.status_code == 200:
            response_json = response.json()
            # Extract the answer; tolerate a missing or empty "choices" list
            choices = response_json.get('choices') or [{}]
            answer = choices[0].get('message', {}).get('content', 'No answer found')
            # Extract sources
            citation = response_json.get("citations", [])
        else:
            # Record HTTP errors as a marker string instead of an answer
            answer = f"Error: {response.status_code}"
    except (requests.RequestException, ValueError) as exc:
        # Network failure, timeout or non-JSON body: record it and move on so
        # the answers already collected (and paid for) are not lost.
        answer = f"Error: {exc}"

    # Store answer and sources
    responses.append(answer)
    citations.append(citation)

df_fmc['responses'] = responses
df_fmc['sources'] = citations

print(df_fmc)

                                               title  \
0   The Mysterious Disappearance of Leon I Mean Noel   
1          The Summer I Turned Pretty The Collection   
2                        Complete Me Stark Trilogy 3   
3          One More Chance Rosemary Beach 8 Chance 2   
4                  Sign Off Caught Dead in Wyoming 1   
..                                               ...   
72     A Song of Ice and Fire series 5Book Boxed Set   
73                               The Age of Miracles   
74                     London Match Bernard Samson 3   
75                       The Eternal Flame Merlin 11   
76    The Black Cauldron The Chronicles of Prydain 2   

                      author                    fmc_name  \
0              Raskin, Ellen               Mrs. Carillon   
1               Han, Jenny *               Belly Conklin   
2               Kenner, J. *                       Nikki   
3             Glines, Abbi *              Harlow Manning   
4         McLinn, Patricia 

In [None]:
# Responses column holds strings; convert each one to a Python list
def _parse_response(raw):
    """Turn one raw response string into a Python object.

    Values that are not strings (for example lists produced by an earlier run
    of this cell) are returned unchanged, so the cell can be re-run safely.
    Strings that are not valid Python literals, such as the "Error: ..."
    markers written by the request loop, are also returned unchanged so the
    validation step can flag them instead of this cell raising.
    """
    if not isinstance(raw, str):
        return raw
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return raw


df_fmc["responses"] = df_fmc["responses"].apply(_parse_response)

In [59]:
def validate_response_format(row):
    """Check that row['responses'] is a well-formed 27-item answer list.

    Expected layout:
      * item 0   -> 0 or 1
      * item 1   -> an integer (the year)
      * item 2   -> a list of strings (genres or tags)
      * items 3+ -> 0 or 1 each

    bool is a subclass of int in Python, so True/False are rejected explicitly;
    floats such as 1.0 are rejected as well. Anything that is not a list of
    length 27 (for example an unparsed string) is invalid.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) with a 'responses' entry.

    Returns
    -------
    bool
        True when the response matches the layout above, otherwise False.
    """
    def _is_int(value):
        # Exclude bool explicitly: isinstance(True, int) is True.
        return isinstance(value, int) and not isinstance(value, bool)

    def _is_binary(value):
        return _is_int(value) and value in (0, 1)

    r = row['responses']

    # Must be a list with exactly 27 entries
    if not isinstance(r, list) or len(r) != 27:
        return False

    first, year, genres, *flags = r

    if not _is_binary(first):
        return False

    if not _is_int(year):
        return False

    if not isinstance(genres, list):
        return False
    if not all(isinstance(tag, str) for tag in genres):
        return False

    # The remaining 24 entries are all yes/no flags
    return all(_is_binary(x) for x in flags)
# Mark each row with whether its response passes the format check
df_fmc['is_valid'] = df_fmc.apply(validate_response_format, axis=1)

# Collect the failing rows and report how many there are
invalid_rows = df_fmc.loc[~df_fmc['is_valid']]
print(f"Number of invalid rows: {invalid_rows.shape[0]}")
print(invalid_rows.loc[:, ['title', 'responses']])


Number of invalid rows: 0
Empty DataFrame
Columns: [title, responses]
Index: []


### Cost Overview  
**Round 1**
* started with $5.03
* ran for 77 books
* left with $4.83
* so $0.20 for 77 books
* so ~$0.26 for 100 books
* We'll have around 80% of the original 18,027 books, so ~14,421 books
* so around $37.45 for all books 
  
**Round 2**
* started with $4.12
* ran for 77 books
* left with $3.63
* so $0.49 for 77 books 

In [60]:
# Write the enriched frame to disk, leaving the index out of the file
df_fmc.to_csv(path_or_buf='df_fmc_features.csv', index=False)