In [14]:
import pandas as pd
import requests
import os 
import ast
import logging
import time
from time import sleep
import json

In [15]:
# Load the book list and keep only the columns used to build the prompts
prompt_columns = ['title', 'author', 'fmc_name']
df_fmc = pd.read_csv("df_fmc.csv").loc[:, prompt_columns]

# Draw a reproducible random sample of 100 books (fixed seed)
df_sample = df_fmc.sample(random_state=26, n=100)
df_sample

Unnamed: 0,title,author,fmc_name
1325,Dirty Little Secrets JJ Graves Mystery 1,"Hart, Liliana",J.J. Graves
1649,The Tycoons Vacation Baby for the Billionaire 2,"Anne, Melody",Trinity Mathews
1372,White Girl Problems,"Walker, Babe",Babe Walker
549,The Empty House,"Pilcher, Rosamunde",Virginia Keile
659,The Unicorn,"Murdoch, Iris",Hannah Crean-Smith
...,...,...,...
2380,Charmed to Death Ophelia Abby 2,"Damsgaard, Shirley",Ophelia Jensen
1935,Dealova,"Nuranindya, Dyan",Karra
1730,The Promise Fallen Star 4,"Sorensen, Jessica",Gemma Lucas
1471,A Boy of Good Breeding,"Toews, Miriam",Knute


In [16]:
# Restrict the run to the first two sampled books
df_sample = df_sample.head(2)
df_sample

Unnamed: 0,title,author,fmc_name
1325,Dirty Little Secrets JJ Graves Mystery 1,"Hart, Liliana",J.J. Graves
1649,The Tycoons Vacation Baby for the Billionaire 2,"Anne, Melody",Trinity Mathews


In [None]:
# Prompts sent with every request.
#
# The request enforces a JSON-object response (see `json_schema`), so the
# prompts describe that same object instead of a Python list. They also spell
# out how the 29 numbered questions map onto the schema keys: "gender", "year"
# and "genres" answer questions 1-3, and "feature_1".."feature_26" answer
# questions 4-29 in order. The wording of the questions themselves is unchanged.
#
# `system_prompt` is sent verbatim (it is never passed through `.format`), so
# the literal braces in the example are safe. `user_prompt_template` IS
# formatted with {title}, {author} and {fmc_name}, so it must not contain any
# other braces.
system_prompt = """
    You are a helpful literary assistant. Your job is to search the internet and assess several aspects for books that are provided to you.

    You return a JSON object for each book, where each field is the answer to a question.
    The question are independent of each other, so if one question is true, the opposite can also be true. 
    The JSON object always needs to have EXACTLY 29 fields: "gender" (question 1), "year" (question 2), "genres" (question 3) and "feature_1" to "feature_26" (questions 4 to 29, in order: "feature_1" answers question 4, "feature_2" answers question 5, ..., "feature_26" answers question 29).
    Most of the questions are about a female major character of the book whose name will be provided alongside the book title. 
    For all questions about the female character, consider any occurrence of the trait or behavior at any point in the book. 

    1. What's the gender of the author? (0 = male, 1 = female, 99 = unclear or non-binary) 
    2. In what year was the book published? (numeric) 
    3. (As a list) What is/are the genre(s) of the book? (list of a maximum of 5 genres in the order of their relevance, try to use common genre names, lower case only) 
    4. Is the female character saved by a male character? (0 = no, 1 = yes) 
    5. Does the female character save a male character? (0 = no, 1 = yes) 
    6. Is the female character protected by a male character? (0 = no, 1 = yes) 
    7. Does the female character protect a male character? (0 = no, 1 = yes) 
    8. Is the female character’s problem solved through help or luck? (0 = no, 1 = yes)  
    9. Does the female character solve her own problem through skill? (0 = no, 1 = yes) 
    10. Is the female character victimized/harmed by a male character? (0 = no, 1 = yes) 
    11. Is the female character a perpetrator or does she harm a male character? (0 = no, 1 = yes) 
    12. Does the female character follow orders? (0 = no, 1 = yes) 
    13. Does the female character give orders? (0 = no, 1 = yes) 
    14. Is the female character physically weak/incapable? (0 = no, 1 = yes) 
    15. Is the female character physically strong/capable? (0 = no, 1 = yes) 
    16. Is the female character fearful/scared of taking risks/challenges? (0 = no, 1 = yes) 
    17. Is the female character brave/likes taking risks/challenges? (0 = no, 1 = yes)  
    18: Does the female character care for others (e.g., nurses someone, comforts someone, takes care of children or sick)? (0 = no, 1 = yes)  
    19: Does the female character not engage in caregiving behaviors (e.g., focuses on tasks or goals without providing emotional or physical support to others)? (0 = no, 1 = yes) 
    20: Is the female character emotionally aware and attuned to others' needs, moods, or the dynamics between people, and tries to maintain harmony? (0 = no, 1 = yes)  
    21: Does the female character focus more on practical matters or impersonal concerns than others’ emotions or social dynamics? (0 = no, 1 = yes)  
    22: Does the female character sacrifice her own goals, desires, or needs to prioritize someone else’s wellbeing or to avoid conflict? (0 = no, 1 = yes)  
    23: Does the female character maintain personal goals and boundaries, even when others express emotional needs or expectations? (0 = no, 1 = yes)  
    24. Is the female character a homemaker? (0 = no, 1 = yes)  
    25. Is the female character a breadwinner? (0 = no, 1 = yes)  
    26. Is the female character admired for her beauty? (0 = no, 1 = yes)  
    27. Is the female character admired for her intelligence? (0 = no, 1 = yes)  
    28. Does the female character have a lower rank occupation (e.g., nurse, assistant, maid, …)? (0 = no, 1 = yes)  
    29. Does the female character have a higher rank occupation (doctor, manager, ruler, …)? (0 = no, 1 = yes)  

    If you cannot find information for a female-character-related question, answer the question with a 0.

    Example output: 
    {"gender": 1, "year": 2015, "genres": ["fantasy", "young adult"], "feature_1": 1, "feature_2": 1, "feature_3": 0, "feature_4": 1, "feature_5": 0, "feature_6": 1, "feature_7": 1, "feature_8": 0, "feature_9": 0, "feature_10": 0, "feature_11": 0, "feature_12": 1, "feature_13": 0, "feature_14": 1, "feature_15": 1, "feature_16": 1, "feature_17": 1, "feature_18": 0, "feature_19": 1, "feature_20": 0, "feature_21": 0, "feature_22": 1, "feature_23": 0, "feature_24": 1, "feature_25": 1, "feature_26": 0}
"""

user_prompt_template = """
    Please answer the 29 questions defined in your system prompt for the book {title} by {author}.
    The female character in question is {fmc_name}. IMPORTANT: Only return a JSON object with exactly the 29 fields described in your system prompt — no explanations, no comments, no extra fields.
"""

In [5]:
# Schema for the structured answer: author gender, publication year, genres,
# and 26 binary (0/1) features named feature_1 .. feature_26.
_feature_keys = [f"feature_{number}" for number in range(1, 27)]

_schema_properties = {
    "gender": {"type": "integer", "enum": [0, 1, 99]},
    "year": {"type": "integer"},
    "genres": {
        "type": "array",
        "items": {"type": "string"},
        "minItems": 1,
        "maxItems": 5,
    },
}
# Build a fresh dict per feature so no two properties share one object.
_schema_properties.update(
    {key: {"type": "integer", "enum": [0, 1]} for key in _feature_keys}
)

json_schema = {
    "type": "object",
    "properties": _schema_properties,
    # Every property is mandatory, in the same order as declared above.
    "required": ["gender", "year", "genres", *_feature_keys],
}

In [6]:
# Error tracking: records at INFO level and above go to a log file
logging.basicConfig(level=logging.INFO, filename='features_log.log')

# Chat-completions endpoint that every request is posted to
url = "https://api.perplexity.ai/chat/completions"

# The API key comes from the environment; a missing variable raises KeyError
YOUR_API_KEY = os.environ["PERPLEXITY_API_KEY"]

# Headers attached to every request
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {YOUR_API_KEY}",
}

# Buffers for answers, citations and the book identifiers they belong to;
# save_progress() writes them to disk and empties them again
responses, citations = [], []
titles, authors, fmc_names = [], [], []

In [7]:
# Function to send a request to the API
def send_request(title, author, fmc_name, retries=3, timeout=120, backoff_seconds=2.0):
    """Ask the API the 29 questions for one book and return its raw answer.

    Parameters
    ----------
    title, author, fmc_name : str
        Inserted into ``user_prompt_template`` to build the user message.
    retries : int, default 3
        Number of additional attempts after the first one fails.
    timeout : float, default 120
        Seconds to wait for the HTTP response. Without a timeout a stalled
        connection would block the whole run indefinitely.
    backoff_seconds : float, default 2.0
        Base delay between attempts; it doubles after every failure
        (2 s, 4 s, 8 s, ... with the default). Pass 0 to retry immediately.

    Returns
    -------
    tuple
        ``(answer, citation)``. ``answer`` is the message content string
        (``'No answer found'`` if the response carries none) and ``citation``
        is the response's ``"citations"`` list (``[]`` if absent). When every
        attempt fails, ``answer`` is ``"Error: <message>"`` and ``citation``
        is ``[]``; no exception is raised.
    """
    user_prompt = user_prompt_template.format(title=title, author=author, fmc_name=fmc_name)

    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "max_tokens": 2000,
        "temperature": 0,
        "response_format": {
            "type": "json_schema",
            "json_schema": {"schema": json_schema}
        }
    }

    last_error = None
    # One initial attempt plus `retries` retries, as a loop instead of recursion.
    for attempt in range(retries + 1):
        try:
            # Make the request to the API
            response = requests.post(url, json=payload, headers=headers, timeout=timeout)

            # Anything other than 200 is treated as a failed attempt
            if response.status_code != 200:
                raise Exception(f"API call failed with status code {response.status_code}")

            response_json = response.json()
            # `or [{}]` also covers an empty "choices" list, which would
            # otherwise raise IndexError on the [0] lookup.
            choices = response_json.get('choices') or [{}]
            answer = choices[0].get('message', {}).get('content', 'No answer found')
            citation = response_json.get("citations", [])
            return answer, citation

        except Exception as e:
            # Log the error and retry if retries left
            last_error = e
            logging.error(f"Error with request for {title} by {author}: {str(e)}")
            remaining = retries - attempt
            if remaining > 0:
                logging.info(f"Retrying {remaining} more times...")
                # Back off before the next attempt to avoid hitting rate limits
                sleep(backoff_seconds * (2 ** attempt))

    logging.error(f"Failed after retries: {title} by {author}")
    return f"Error: {str(last_error)}", []

In [8]:
def save_progress():
    """Write the buffered results to 'progress_features.csv' and empty the buffers.

    Does nothing when no titles are buffered. The first write creates the file
    with a header row; later writes append rows without a header. Afterwards
    all five module-level buffers are cleared so the next call only writes
    new rows.
    """
    if not titles:
        return  # nothing new to save

    out_path = 'progress_features.csv'
    batch = pd.DataFrame({
        'title': titles,
        'author': authors,
        'fmc_name': fmc_names,
        'answer': responses,
        'citation': citations
    })

    appending = os.path.exists(out_path)
    batch.to_csv(
        out_path,
        mode='a' if appending else 'w',
        header=not appending,
        index=False,
    )
    logging.info(
        f"Progress appended: {len(titles)} books."
        if appending
        else f"Progress saved: {len(titles)} books."
    )

    # Empty the buffers in place so the module-level names stay bound to the same lists
    for buffer in (titles, authors, responses, citations, fmc_names):
        buffer.clear()

In [9]:
# Number of books to process between intermediate saves
SAVE_EVERY = 10

start_time = time.time()
n_books = len(df_sample)

# Run for every book in df_sample
try:
    # Count positions with enumerate: the DataFrame index holds the original
    # row labels of the sample, so it cannot be used for "every N books".
    for position, (_, row) in enumerate(df_sample.iterrows(), start=1):
        title = row["title"]
        author = row["author"]
        fmc_name = row["fmc_name"]

        # Send request and handle response
        answer, citation = send_request(title, author, fmc_name)

        # Store responses
        responses.append(answer)
        citations.append(citation)
        titles.append(title)
        authors.append(author)
        fmc_names.append(fmc_name)

        # Flush to disk periodically so a crash loses at most SAVE_EVERY answers
        if position % SAVE_EVERY == 0:
            save_progress()
finally:
    # Save whatever is still buffered, even if the loop was interrupted
    save_progress()

# Log the total execution time
end_time = time.time()
execution_time = end_time - start_time
logging.info(f"Completed {n_books} book requests in {execution_time:.2f} seconds.")

# Round 1
$44.4

After 100 books: $43.78 (cost: $44.40 - $43.78 = $0.62)
$0.62 * 140 = $86.80 for all books

# Round 2
$11.54
-> $10.9

In [10]:
# Read back the rows that save_progress() wrote to disk
results_file = "progress_features.csv"
df_result = pd.read_csv(results_file)
df_result

Unnamed: 0,title,author,fmc_name,answer,citation
0,Dirty Little Secrets JJ Graves Mystery 1,"Hart, Liliana",J.J. Graves,"{ ""gender"": 1, ""year"": 2011, ""genres"": [""myste...",['https://leaningtoweroftomes.wordpress.com/20...
1,The Tycoons Vacation Baby for the Billionaire 2,"Anne, Melody",Trinity Mathews,"{ ""gender"": 1, ""year"": 2011, ""genres"": [""roman...",['https://www.goodreads.com/book/show/19561986...
2,White Girl Problems,"Walker, Babe",Babe Walker,"{""gender"":1,""year"":2012,""genres"":[""humor"",""sa...",['https://www.goodreads.com/book/show/12413949...
3,The Empty House,"Pilcher, Rosamunde",Virginia Keile,"{""gender"": 1, ""year"": 0, ""genres"": [""romance""...",['https://www.goodreads.com/book/show/60463.Th...
4,The Unicorn,"Murdoch, Iris",Hannah Crean-Smith,"{""gender"":1,""year"":1963,""genres"":[""literary f...",['https://irismurdochsociety.org.uk/2024/04/17...
...,...,...,...,...,...
95,Charmed to Death Ophelia Abby 2,"Damsgaard, Shirley",Ophelia Jensen,"{ ""gender"": 1, ""year"": 2006, ""genres"": [""myste...",['https://www.goodreads.com/book/show/141036.C...
96,Dealova,"Nuranindya, Dyan",Karra,"{ ""gender"": 1, ""year"": 0, ""genres"": [""teen dra...",['https://en.brilio.net/entertainment/dealova-...
97,The Promise Fallen Star 4,"Sorensen, Jessica",Gemma Lucas,"{ ""gender"": 1, ""year"": 2012, ""genres"": [""fanta...",['https://goodreads.com/book/show/13615578.The...
98,A Boy of Good Breeding,"Toews, Miriam",Knute,"{ ""gender"": 1, ""year"": 2006, ""genres"": [""ficti...",['https://www.goodreads.com/book/show/125896.A...


In [11]:
def _parse_answer(raw_answer):
    """Decode one stored answer into a dict; return {} when that is not possible.

    send_request() can store plain strings such as "Error: ..." or
    "No answer found", and an empty CSV cell is read back as NaN. None of
    these is valid JSON, so they are logged and mapped to an empty dict. The
    affected rows then show up as missing values instead of aborting the
    whole parse.
    """
    try:
        parsed = json.loads(raw_answer)
    except (TypeError, ValueError) as e:  # json.JSONDecodeError subclasses ValueError
        logging.warning(f"Could not parse answer as JSON: {raw_answer!r} ({e})")
        return {}
    if not isinstance(parsed, dict):
        logging.warning(f"Parsed answer is not a JSON object: {raw_answer!r}")
        return {}
    return parsed


# One dict per row, then one column per answer field
df_result['answer_parsed'] = df_result['answer'].apply(_parse_answer)
df_answers = pd.json_normalize(df_result['answer_parsed'])

In [12]:
# Put the book identifiers and citations side by side with the parsed answers
book_columns = ['title', 'author', 'fmc_name', 'citation']
df_final = pd.concat(
    [df_result[book_columns], df_answers],
    axis=1,
)
df_final

Unnamed: 0,title,author,fmc_name,citation,gender,year,genres,feature_1,feature_2,feature_3,...,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26
0,Dirty Little Secrets JJ Graves Mystery 1,"Hart, Liliana",J.J. Graves,['https://leaningtoweroftomes.wordpress.com/20...,1,2011,"[mystery, romance, thriller]",0,0,0,...,1,0,1,0,1,0,1,0,1,0
1,The Tycoons Vacation Baby for the Billionaire 2,"Anne, Melody",Trinity Mathews,['https://www.goodreads.com/book/show/19561986...,1,2011,"[romance, contemporary]",1,0,1,...,1,1,0,1,0,1,1,0,0,1
2,White Girl Problems,"Walker, Babe",Babe Walker,['https://www.goodreads.com/book/show/12413949...,1,2012,"[humor, satire, memoir, fiction, chick lit]",0,0,0,...,0,0,1,0,1,0,1,0,0,1
3,The Empty House,"Pilcher, Rosamunde",Virginia Keile,['https://www.goodreads.com/book/show/60463.Th...,1,0,"[romance, contemporary fiction, family drama]",0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Unicorn,"Murdoch, Iris",Hannah Crean-Smith,['https://irismurdochsociety.org.uk/2024/04/17...,1,1963,"[literary fiction, gothic, psychological fiction]",0,0,1,...,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Charmed to Death Ophelia Abby 2,"Damsgaard, Shirley",Ophelia Jensen,['https://www.goodreads.com/book/show/141036.C...,1,2006,"[mystery, paranormal, fantasy]",0,0,0,...,1,1,0,1,0,0,1,0,0,0
96,Dealova,"Nuranindya, Dyan",Karra,['https://en.brilio.net/entertainment/dealova-...,1,0,"[teen drama, romance]",0,0,0,...,1,0,1,0,1,0,1,0,0,0
97,The Promise Fallen Star 4,"Sorensen, Jessica",Gemma Lucas,['https://goodreads.com/book/show/13615578.The...,1,2012,"[fantasy, young adult, paranormal]",0,1,1,...,1,1,1,1,0,1,1,0,0,0
98,A Boy of Good Breeding,"Toews, Miriam",Knute,['https://www.goodreads.com/book/show/125896.A...,1,2006,"[fiction, humor, literary fiction]",0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [13]:
# Count the missing values in each column of the final table
df_final.isna().sum()


title         0
author        0
fmc_name      1
citation      0
gender        0
year          0
genres        0
feature_1     0
feature_2     0
feature_3     0
feature_4     0
feature_5     0
feature_6     0
feature_7     0
feature_8     0
feature_9     0
feature_10    0
feature_11    0
feature_12    0
feature_13    0
feature_14    0
feature_15    0
feature_16    0
feature_17    0
feature_18    0
feature_19    0
feature_20    0
feature_21    0
feature_22    0
feature_23    0
feature_24    0
feature_25    0
feature_26    0
dtype: int64

In [1]:
# NOTE(review): df_final only exists after the cells above have been run in the
# current kernel; running this cell first on a fresh kernel raises NameError.
df_final

NameError: name 'df_final' is not defined

## Round 1
Start: $45.21
After 10: $45.18
So about $43.20 for 14,400 books ($0.03 per 10 books)

## Round 2
Start: $45.18
After 10: $45.12
So about $86.40 for 14,400 books ($0.06 per 10 books)

## Round 3
Start: $45.12
After 100: $44.74

----------------------------------------------------