In [1]:
import pandas as pd
import requests
import os 
import ast
import logging
import time
from time import sleep
import json

In [2]:
# Load the book list and keep only the columns needed to build the prompts
df_fmc = pd.read_csv("data/df_fmc_final.csv").loc[:, ['title', 'author', 'fmc_name']]
print(df_fmc.shape)

# Optional small sample for checking answer accuracy by hand:
# df_sample = df_fmc.sample(n=100, random_state=26)
# df_sample = df_sample[:2]
# df_sample

(14645, 3)


In [3]:
# System prompt sent with every request. It defines the 29 coding questions
# (Q1-Q3: book metadata; Q4-Q29: traits/behaviors of the named female character)
# and the numeric answer codes (0 = no, 1 = yes, 99 = no information / unclear).
# NOTE(review): the "For example" section below shows a list-of-lists answer format,
# while send_request enforces the object-shaped `json_schema`
# (author_gender / year / genres / Q4..Q29) via `response_format` — confirm the
# example is still intended before changing either side.
system_prompt = """
    You are a helpful literary assistant. Your job is to search the internet and assess several aspects for books that are provided to you.

    Most of the questions are about a female major character of the book whose name will be provided alongside the book title. 
    For all questions about the female character, consider any occurrence of the trait or behavior at any point in the book, 
    and give a short explanation for your answer. 

    1. What's the gender of the author? (0 = male, 1 = female, 99 = unclear or non-binary) 
    2. In what year was the book published? (numeric) 
    3. (As a list) What is/are the genre(s) of the book? (list of a maximum of 5 genres in the order of their relevance, try to use common genre names, lower case only) 
    4. Is the female character saved by a male character? (0 = no, 1 = yes, 99 = no information) 
    5. Does the female character save a male character? (0 = no, 1 = yes, 99 = no information) 
    6. Is the female character protected by a male character? (0 = no, 1 = yes, 99 = no information) 
    7. Does the female character protect a male character? (0 = no, 1 = yes, 99 = no information) 
    8. Is the female character’s problem solved through help or luck? (0 = no, 1 = yes, 99 = no information) 
    9. Does the female character solve her own problem through skill? (0 = no, 1 = yes, 99 = no information) 
    10. Is the female character victimized/harmed by a male character? (0 = no, 1 = yes, 99 = no information) 
    11. Is the female character a perpetrator or does she harm a male character? (0 = no, 1 = yes, 99 = no information) 
    12. Does the female character follow orders? (0 = no, 1 = yes, 99 = no information) 
    13. Does the female character give orders? (0 = no, 1 = yes, 99 = no information) 
    14. Is the female character physically weak/incapable? (0 = no, 1 = yes, 99 = no information) 
    15. Is the female character physically strong/capable? (0 = no, 1 = yes, 99 = no information) 
    16. Is the female character fearful/scared of taking risks/challenges? (0 = no, 1 = yes, 99 = no information) 
    17. Is the female character brave/likes taking risks/challenges? (0 = no, 1 = yes, 99 = no information) 
    18: Does the female character care for others (e.g., nurses someone, comforts someone, takes care of children or sick)? (0 = no, 1 = yes, 99 = no information)  
    19: Does the female character not engage in caregiving behaviors (e.g., focuses on tasks or goals without providing emotional or physical support to others)? (0 = no, 1 = yes, 99 = no information) 
    20: Is the female character emotionally aware and attuned to others' needs, moods, or the dynamics between people, and tries to maintain harmony? (0 = no, 1 = yes, 99 = no information) 
    21: Does the female character focus more on practical matters or impersonal concerns than others’ emotions or social dynamics? (0 = no, 1 = yes, 99 = no information) 
    22: Does the female character sacrifice her own goals, desires, or needs to prioritize someone else’s wellbeing or to avoid conflict? (0 = no, 1 = yes, 99 = no information)  
    23: Does the female character maintain personal goals and boundaries, even when others express emotional needs or expectations? (0 = no, 1 = yes, 99 = no information) 
    24. Is the female character a homemaker? (0 = no, 1 = yes, 99 = no information)   
    25. Is the female character a breadwinner? (0 = no, 1 = yes, 99 = no information) 
    26. Is the female character admired for her beauty? (0 = no, 1 = yes, 99 = no information) 
    27. Is the female character admired for her intelligence? (0 = no, 1 = yes, 99 = no information) 
    28. Does the female character have a lower rank occupation (e.g., nurse, assistant, maid, …)? (0 = no, 1 = yes, 99 = no information) 
    29. Does the female character have a higher rank occupation (doctor, manager, ruler, …)? (0 = no, 1 = yes, 99 = no information) 

    For example:
    [
    [1, 1], # first question, female author
    [2, 2011], # second question, publication year is 2011
    [3, ["mystery", "romance", "suspense"]], # third question, genres are mystery, romance, suspense
    [4, 1, "Paul saves Anne from drowning."], # fourth question, answer is yes (so 1), short explanation why 
    [5, 0, "Anne doesn't save any male character."], # fifth question, answer is no (so 0), short explanation why 
    [6, 1, "Anne is protected by her male friend."], # sixth question, answer is yes (so 1), short explanation why
    ...
    ]

"""

# Per-book user message. The {title}, {author} and {fmc_name} placeholders are
# filled with str.format() in send_request for each row of the book table.
user_prompt_template = """
    Please answer the questions defined in your system prompt for the book {title} by {author}.
    The female character in question is {fmc_name}.
"""

In [4]:
# JSON schema passed to the API as `response_format` so every answer comes back
# as one object: three metadata fields (Q1-Q3) plus one {answer, explanation}
# object per character question (Q4-Q29).
json_schema = {
    "type": "object",
    "properties": {
        "author_gender": { "type": "integer", "enum": [0, 1, 99] },  # Q1: 0 = male, 1 = female, 99 = unclear or non-binary
        "year": { "type": "integer" },                               # Q2: publication year
        "genres": {                                                  # Q3: up to 5 genres, most relevant first
            "type": "array",
            "items": { "type": "string" },
            "minItems": 1,
            "maxItems": 5
        },
        # Q4 to Q29 — each with an answer and explanation
        **{
            f"Q{i}": {
                "type": "object",
                "properties": {
                    "answer": { "type": "integer", "enum": [0, 1, 99] },  # 0 = no, 1 = yes, 99 = no information
                    "explanation": { "type": "string" }
                },
                "required": ["answer", "explanation"]
            } for i in range(4, 30)
        }
    },
    # Every name listed here must be a key of "properties" above; the first entry
    # previously read "gender", which is not a defined property.
    "required": ["author_gender", "year", "genres"] + [f"Q{i}" for i in range(4, 30)]
}


In [12]:
# Set up logging for error tracking (INFO and above go to features_log_3.log)
logging.basicConfig(filename='features_log_3.log', level=logging.INFO)

# Setting API key and headers
# The key is read from the environment; a KeyError is raised here if
# PERPLEXITY_API_KEY is not set.
YOUR_API_KEY = os.environ["PERPLEXITY_API_KEY"]

headers = {
    "Authorization": f"Bearer {YOUR_API_KEY}",
    "Content-Type": "application/json"
}

# Endpoint for the API
url = "https://api.perplexity.ai/chat/completions"

# Store responses, citations, and titles + authors for progress
# These module-level lists are in-memory buffers: the request loop appends to
# them and save_progress() writes them to disk and clears them.
responses = []
citations = []
titles = []
authors = []
fmc_names = []

In [6]:
# Function to send a request to the API
def send_request(title, author, fmc_name, retries=3, timeout=120, retry_delay=60):
    """Ask the chat-completions endpoint the feature questions for one book.

    Parameters
    ----------
    title, author, fmc_name : str
        Values substituted into ``user_prompt_template``.
    retries : int, default 3
        Number of additional attempts made after the first attempt fails.
    timeout : float, default 120
        Seconds to wait for the HTTP response; without a timeout a stalled
        connection would block the whole run indefinitely.
    retry_delay : float, default 60
        Seconds to sleep between attempts (to avoid hitting rate limits).

    Returns
    -------
    tuple
        ``(answer, citation)`` where ``answer`` is the message content of the
        first choice (or ``'No answer found'`` if the response carries none)
        and ``citation`` is the response's ``citations`` list (``[]`` if absent).
        If every attempt fails, ``("Error: <message>", [])`` is returned instead
        of raising, so the calling loop can continue with the next book.
    """
    user_prompt = user_prompt_template.format(title=title, author=author, fmc_name=fmc_name)

    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "max_tokens": 2000,
        "temperature": 0,
        "response_format": {
            "type": "json_schema",
            "json_schema": {"schema": json_schema}},
        "web_search_options": {"search_context_size": "medium"}
    }

    attempts_left = retries
    # Iterative retry loop (instead of recursion): one initial attempt plus
    # up to `retries` further attempts.
    while True:
        try:
            # Make the request to the API
            response = requests.post(url, json=payload, headers=headers, timeout=timeout)

            # Anything other than 200 is treated as a failed attempt
            if response.status_code != 200:
                raise Exception(f"API call failed with status code {response.status_code}")

            response_json = response.json()
            # `or [{}]` also covers a present-but-empty "choices" list, which
            # would otherwise raise IndexError and trigger pointless retries.
            choices = response_json.get('choices') or [{}]
            answer = choices[0].get('message', {}).get('content', 'No answer found')
            citation = response_json.get("citations", [])
            return answer, citation

        except Exception as e:
            # Log the error and retry if retries left
            logging.error(f"Error with request for {title} by {author}: {str(e)}")
            if attempts_left <= 0:
                logging.error(f"Failed after retries: {title} by {author}")
                return f"Error: {str(e)}", []
            logging.info(f"Retrying {attempts_left} more times...")
            sleep(retry_delay)  # Sleep to avoid hitting rate limits
            attempts_left -= 1

In [7]:
def save_progress():
    """Write the buffered results to ``progress_features.csv`` and empty the buffers.

    Nothing happens when the ``titles`` buffer is empty. If the CSV already
    exists the new rows are appended without a header line; otherwise the file
    is created with a header. All five buffer lists are cleared afterwards so
    the next call only writes rows collected since this one.
    """
    if not titles:
        return  # nothing new to save

    batch = pd.DataFrame({
        'title': titles,
        'author': authors,
        'fmc_name': fmc_names,
        'answer': responses,
        'citation': citations,
    })

    if os.path.exists('progress_features.csv'):
        batch.to_csv('progress_features.csv', mode='a', header=False, index=False)
        logging.info(f"Progress appended: {len(titles)} books.")
    else:
        batch.to_csv('progress_features.csv', index=False)
        logging.info(f"Progress saved: {len(titles)} books.")

    # Reset every buffer in place (the lists are shared module-level state)
    for buffer in (titles, authors, responses, citations, fmc_names):
        buffer.clear()

In [11]:
# Cost log (API credit balance)
# $57.99 -> after 1800 books at ................ (didn't work) (200 books = 2.58)
# $55.41 -> 53.38

# Batches of the book table processed so far (earlier ones kept for reference)
# subset_1 = df_fmc[:1800]
# subset_2 = df_fmc[200:400]

# Current batch: rows at positions 400 up to (not including) 1600
subset_3 = df_fmc.iloc[400:1600]
subset_3

Unnamed: 0,title,author,fmc_name
400,Midwinterblood,"Sedgwick, Marcus",Merle
401,The Rook The Checquy Files 1,"O'Malley, Daniel",Myfanwy Thomas
402,The Last Nude,"Avery, Ellis",Tamara de Lempicka
403,The Time in Between,"Dueñas, María",Sira Quiroga
404,Dark Gold Dark 3,"Feehan, Christine",Alexandria Houton
...,...,...,...
1595,The Ladies of Covington Send Their Love Ladies...,"Medlicott, Joan",Amelia Declose
1596,From the Heart of Covington Ladies of Covington 3,"Medlicott, Joan",Hannah
1597,At Home in Covington Ladies of Covington 5,"Medlicott, Joan",Amelia Declose
1598,Two Days After the Wedding Ladies of Covington 6,"Medlicott, Joan",Hannah


In [13]:
start_time = time.time()

SAVE_EVERY = 20  # flush buffered results to disk after this many books
processed = 0    # books fully handled in this run

try:
    # Run for subset_3
    for _, row in subset_3.iterrows():
        title = row["title"]
        author = row["author"]
        fmc_name = row["fmc_name"]

        # Send request and handle response
        answer, citation = send_request(title, author, fmc_name)

        # Store responses
        responses.append(answer)
        citations.append(citation)
        titles.append(title)
        authors.append(author)
        fmc_names.append(fmc_name)
        processed += 1

        # Save progress every SAVE_EVERY books. Counting processed rows (rather
        # than testing the DataFrame index label) keeps the cadence correct for
        # any subset, whatever its index looks like.
        if processed % SAVE_EVERY == 0:
            save_progress()
finally:
    # Runs on normal completion AND when the loop is interrupted (e.g. a manual
    # KeyboardInterrupt), so answers that were already paid for but not yet
    # flushed are still written to disk.
    save_progress()

    # Log the total execution time with the real number of processed books
    end_time = time.time()
    execution_time = end_time - start_time
    logging.info(f"Completed {processed} book requests in {execution_time:.2f} seconds.")

KeyboardInterrupt: 

$10.04 (*140) -> $9.23 = 0.81 (*140) -> $113
20 min (*140) -> 2800 min = 46h

In [22]:
# Reload the raw API results from the CSV that save_progress() writes
# (one row per book: title, author, fmc_name, answer, citation).
df_result = pd.read_csv("progress_features.csv")
df_result

Unnamed: 0,title,author,fmc_name,answer,citation
0,Dirty Little Secrets JJ Graves Mystery 1,"Hart, Liliana",J.J. Graves,"{\n""author_gender"": 1,\n""year"": 2011,\n""genre...",['https://www.goodreads.com/book/show/49587058...
1,The Tycoons Vacation Baby for the Billionaire 2,"Anne, Melody",Trinity Mathews,"{\n""author_gender"": 1,\n""year"": 2011,\n""genre...",['https://www.goodreads.com/book/show/19561986...
2,White Girl Problems,"Walker, Babe",Babe Walker,"{\n""author_gender"": 99, ""year"": 2012, ""genres...",['https://www.goodreads.com/book/show/12413949...
3,The Empty House,"Pilcher, Rosamunde",Virginia Keile,"{\n""author_gender"": 1,\n""year"": 1973,\n""genre...",['https://www.goodreads.com/book/show/1054377'...
4,The Unicorn,"Murdoch, Iris",Hannah Crean-Smith,"{\n""author_gender"": 1,\n""year"": 1963,\n""genre...",['https://en.wikipedia.org/wiki/The_Unicorn_(n...
...,...,...,...,...,...
95,Charmed to Death Ophelia Abby 2,"Damsgaard, Shirley",Ophelia Jensen,"{\n""author_gender"": 1,\n""year"": 2006,\n""genre...",['https://www.goodreads.com/book/show/141036.C...
96,Dealova,"Nuranindya, Dyan",Karra,"{\n""author_gender"": 1,\n""year"": 2005,\n""genre...",['https://en.brilio.net/entertainment/dealova-...
97,The Promise Fallen Star 4,"Sorensen, Jessica",Gemma Lucas,"{ ""author_gender"": 1, ""year"": 2012, ""genres"": ...",['https://goodreads.com/book/show/13615578.The...
98,A Boy of Good Breeding,"Toews, Miriam",Knute,"{ ""author_gender"": 1, ""year"": 1998, ""genres"": ...",['https://quillandquire.com/review/a-boy-of-go...


In [18]:
# Preview only the first few raw answers: printing every row floods the
# notebook output. A distinct loop variable is used so the global `answer`
# from the request loop is not overwritten.
for raw_answer in df_result['answer'].head(3):
    print(raw_answer)

 {
"author_gender": 1,
"year": 2011,
"genres": ["mystery", "romance", "suspense", "thriller"],
"Q4": {"answer": 99, "explanation": "No explicit rescue by male characters mentioned in available sources"},
"Q5": {"answer": 99, "explanation": "No evidence of J.J. saving male characters in described plot points"},
"Q6": {"answer": 1, "explanation": "Works closely with Detective Jack Lawson, implying professional protection partnership"},
"Q7": {"answer": 99, "explanation": "No specific examples of protecting male characters in available summaries"},
"Q8": {"answer": 99, "explanation": "Resolution details unclear from available sources"},
"Q9": {"answer": 1, "explanation": "As coroner/detective, uses professional skills to investigate murders"},
"Q10": {"answer": 1, "explanation": "Becomes target in deadly game due to her own secrets"},
"Q11": {"answer": 0, "explanation": "No indication of harming male characters in described plot"},
"Q12": {"answer": 0, "explanation": "Takes initiative in 

In [23]:
# Spot-check the Python type of one stored answer (row position 25) after the
# CSV round-trip.
type(df_result["answer"].iloc[25])

str

In [None]:
def _parse_answer(raw):
    """Decode one raw API answer string into a dict.

    Returns an empty dict when `raw` is not a JSON object: for example the
    "Error: ..." placeholder that send_request stores after exhausting its
    retries, a missing value (NaN), or malformed/truncated JSON. One bad row
    therefore no longer aborts parsing of the whole table.
    """
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return {}
    return parsed if isinstance(parsed, dict) else {}


df_result['answer_parsed'] = df_result['answer'].apply(_parse_answer)

# Report how many rows could not be parsed so they can be re-requested
n_unparsed = int(df_result['answer_parsed'].map(len).eq(0).sum())
if n_unparsed:
    print(f"{n_unparsed} of {len(df_result)} answers could not be parsed as JSON objects.")

# Flatten nested {"Qn": {"answer", "explanation"}} objects into Qn.answer /
# Qn.explanation columns; unparsed rows become all-NaN rows.
df_answers = pd.json_normalize(df_result['answer_parsed'].tolist())
# Keep the same row labels as df_result so an index-aligned concat lines up
df_answers.index = df_result.index
df_answers

In [26]:
# Put the book identifiers and citations side by side with the flattened answer
# columns. concat(axis=1) aligns rows on the index, so df_result and df_answers
# must share the same index for the rows to match up.
df_final = pd.concat([df_result[['title', 'author', 'fmc_name', 'citation']], df_answers], axis=1)
df_final

Unnamed: 0,title,author,fmc_name,citation,author_gender,year,genres,Q4.answer,Q4.explanation,Q5.answer,...,Q25.answer,Q25.explanation,Q26.answer,Q26.explanation,Q27.answer,Q27.explanation,Q28.answer,Q28.explanation,Q29.answer,Q29.explanation
0,Dirty Little Secrets JJ Graves Mystery 1,"Hart, Liliana",J.J. Graves,['https://www.goodreads.com/book/show/49587058...,1,2011,"[mystery, romance, suspense, thriller]",99,No explicit rescue by male characters mentione...,99,...,1,Runs mortuary business and holds coroner position,99,Physical appearance admiration not mentioned,1,Admired for medical expertise and investigativ...,0,Holds high-status positions (coroner/doctor),1,Coroner/doctor roles constitute higher-rank oc...
1,The Tycoons Vacation Baby for the Billionaire 2,"Anne, Melody",Trinity Mathews,['https://www.goodreads.com/book/show/19561986...,1,2011,"[romance, contemporary, billionaire romance, d...",1,Drew provides financial support and medical ca...,0,...,99,Occupation details unspecified beyond financia...,1,Repeatedly described as beautiful by Drew[1][6].,99,Intelligence not specifically highlighted.,99,No occupation details provided.,99,No occupation details provided.
2,White Girl Problems,"Walker, Babe",Babe Walker,['https://www.goodreads.com/book/show/12413949...,99,2012,"[satire, humor, memoir, fiction, chick lit]",0,No male character saves Babe; her rehab journe...,0,...,99,Wealth source unclear; likely inherited rather...,1,"Admired for beauty and style (e.g., designer o...",0,Intelligence is not a noted trait; her decisio...,0,No traditional occupation; her 'work' involves...,0,Holds no high-rank position; fails at ventures...
3,The Empty House,"Pilcher, Rosamunde",Virginia Keile,['https://www.goodreads.com/book/show/1054377'...,1,1973,"[romance, family drama, women's fiction]",0,No evidence of physical rescue by a male chara...,0,...,0,No indication of employment or income generation.,99,Physical appearance not emphasized.,99,Intelligence not specifically highlighted.,0,No occupation mentioned.,0,No high-rank occupation indicated.
4,The Unicorn,"Murdoch, Iris",Hannah Crean-Smith,['https://en.wikipedia.org/wiki/The_Unicorn_(n...,1,1963,"[gothic fiction, philosophical fiction, psycho...",0,"Hannah's attempted escape fails, and she ultim...",0,...,0,No occupational role; maintained as prisoner[6...,1,Consistently described as beautiful and ethere...,0,"Admired for suffering and beauty, not intellec...",0,Holds nominal authority as castle mistress des...,0,No occupational authority due to captive statu...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Charmed to Death Ophelia Abby 2,"Damsgaard, Shirley",Ophelia Jensen,['https://www.goodreads.com/book/show/141036.C...,1,2006,"[mystery, paranormal, cozy mystery, fiction, s...",0,Ophelia takes matters into her own hands again...,99,...,99,No specific information about being the primar...,99,Physical appearance isn't emphasized in availa...,1,Her psychic abilities and investigative skills...,0,"Librarian is a professional role, not inherent...",0,Librarian isn't classified as a high-rank occu...
96,Dealova,"Nuranindya, Dyan",Karra,['https://en.brilio.net/entertainment/dealova-...,1,2005,"[young adult, romance, drama, coming-of-age, f...",0,No evidence of Karra being saved by a male cha...,0,...,0,No occupational role mentioned,1,"Described as beautiful with long hair, attract...",99,Intelligence not specifically highlighted,0,No occupation mentioned,0,No occupation mentioned
97,The Promise Fallen Star 4,"Sorensen, Jessica",Gemma Lucas,['https://goodreads.com/book/show/13615578.The...,1,2012,"[urban fantasy, romance, mystery, apocalyptic ...",0,There is no indication that Gemma is saved by ...,0,...,0,Gemma is not described as a breadwinner.,99,There is no mention of Gemma being admired for...,99,There is no mention of Gemma being admired for...,0,Gemma does not have a lower rank occupation.,0,Gemma does not have a higher rank occupation.
98,A Boy of Good Breeding,"Toews, Miriam",Knute,['https://quillandquire.com/review/a-boy-of-go...,1,1998,"[humor, fiction, small-town life]",0,There is no indication that Knute is saved by ...,0,...,1,"As a single mother, Knute likely acts as a bre...",0,There is no mention of Knute being admired for...,0,There is no mention of Knute being admired for...,0,There is no indication that Knute has a lower ...,0,There is no indication that Knute has a higher...


In [None]:
### Unnest columns 

# Check length of all columns 

# Make all list elements into columns