# 📘 Notebook 01: Data Collection
# Emotional Geography of Books – Shraddha

This notebook collects and processes data related to books and authors.
It fetches author metadata, extracts country and gender information,
and enriches the author data with this additional information.

In [110]:
import asyncio
import json
import os
import re
from pathlib import Path
from time import sleep, time
from typing import Tuple
from urllib.parse import quote

import aiohttp
import nest_asyncio
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from dotenv import load_dotenv


# Browser-like User-Agent sent with every scraping request made through
# `session.get(..., headers=HEADERS)` in the author helpers below.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}
# Size of the semaphore that limits how many books are processed at once.
MAX_CONCURRENCY = 30
# Timeout applied to the shared aiohttp session (total=15).
TIMEOUT = aiohttp.ClientTimeout(total=15)
# NOTE(review): presumably needed so `await`/nested event-loop use works inside
# the notebook's already-running loop — confirm it is still required.
nest_asyncio.apply()

# NamSor API configuration (the key is read from the environment / .env file)
load_dotenv()  # load variables from a local .env file, if present
NAMSOR_API_KEY = os.getenv("NAMSOR_API_KEY")  # None when the variable is not set
NAMSOR_URL = "https://v2.namsor.com/NamSorAPIv2/api2/json/genderFull/"

In [41]:
# Read the per-year Goodreads exports from the raw data directory.
# The paths are sorted because glob order depends on the filesystem; sorting
# keeps the concatenated row order identical from run to run.
files = sorted(Path("../data/raw").glob("goodreads_books_*.csv"))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No goodreads_books_*.csv files found in ../data/raw")

all_dfs = []
for file in files:
    # The year is the last underscore-separated token of the file name stem.
    year = int(file.stem.split("_")[-1])
    df = pd.read_csv(file)
    df["published_year"] = year
    all_dfs.append(df)

df_all = pd.concat(all_dfs, ignore_index=True)
print(f"📚 Total books: {len(df_all)}")

📚 Total books: 1015


In [42]:
# Remove " ratings" text and convert to int
def clean_ratings_count(value):
    """Convert a scraped ratings-count value to an ``int``.

    Accepts strings such as ``"85.6k ratings"``, ``"3m ratings"``,
    ``"1,234 ratings"`` or ``"1 rating"``, as well as values that are already
    numeric (so re-running the cleaning cell on a cleaned column is safe).

    Returns 0 for missing values and for anything that cannot be parsed.
    """
    if pd.isna(value):
        return 0

    # str() lets already-numeric values (int / float / numpy scalars) through.
    text = str(value).lower()
    # Strip the plural first so the singular replace cannot leave a stray "s".
    text = text.replace("ratings", "").replace("rating", "").strip()
    text = text.replace(",", "")

    multipliers = {"k": 1_000, "m": 1_000_000}
    multiplier = 1
    if text[-1:] in multipliers:
        multiplier = multipliers[text[-1]]
        text = text[:-1]

    try:
        # round() rather than int() truncation: the float product can land
        # just below the intended whole number.
        return int(round(float(text) * multiplier))
    except (ValueError, OverflowError):
        # Unparseable text, or "nan"/"inf" style strings.
        return 0

# Ratings count: free text -> int
df_all["ratings_count"] = df_all["ratings_count"].map(clean_ratings_count)

# Average rating: anything non-numeric becomes NaN
df_all["rating"] = pd.to_numeric(df_all["rating"], errors="coerce")

# Author: missing -> "", surrounding whitespace removed
df_all["author"] = df_all["author"].fillna("").str.strip()
# Lower-cased first whitespace-separated token of the author name ("" if no name)
df_all["author_first"] = df_all["author"].map(
    lambda name: name.split(maxsplit=1)[0].lower() if name else ""
)

In [None]:
# Record the data source on every row (constant value for this dataset).
df_all["source"] = "Goodreads"


✅ Saved cleaned data to data/processed/clean_books_2020_2024.csv


### Basic EDA

In [64]:
# Preview 10 random rows; no random_state is passed, so the rows differ between runs.
df_all.sample(10)

Unnamed: 0,title,author,link,rating,ratings_count,description,published_year,author_first,source
736,Lapvona,Ottessa Moshfegh,https://www.goodreads.com/book/show/59693959-l...,3.53,85600,In a village in a medieval fiefdom buffeted by...,2022,ottessa,Goodreads
404,"Powerless (Powerless, #1)",Lauren Roberts,https://www.goodreads.com/book/show/232389739-...,4.17,667000,Ella es lo que él ha pasado toda su vida cazan...,2023,lauren,Goodreads
387,Still Life,Sarah Winman,https://www.goodreads.com/book/show/57001545-s...,4.16,83300,"Tuscany, 1944: As Allied troops advance and bo...",2021,sarah,Goodreads
298,How to Kill Your Family,Bella Mackie,https://www.goodreads.com/book/show/50224049-h...,3.53,164000,"I have killed several people (some brutally, o...",2021,bella,Goodreads
890,The Third Gilmore Girl,Kelly Bishop,https://www.goodreads.com/book/show/207298106-...,4.35,85100,Brought to you by Penguin. Beloved award-winn...,2024,kelly,Goodreads
211,Better Than the Movies (Better Than the Movies...,Lynn Painter,https://www.goodreads.com/book/show/55710822-b...,4.27,613000,"In this rom-com about rom-coms, in the spirit ...",2021,lynn,Goodreads
565,"Crossed (Never After, #5)",Emily McIntire,https://www.goodreads.com/book/show/80340956-c...,3.7,60700,From bestselling author Emily McIntire comes a...,2023,emily,Goodreads
475,The House of Eve,Sadeqa Johnson,https://www.goodreads.com/book/show/61273858-t...,4.25,122000,"From the award-winning author of Yellow Wife, ...",2023,sadeqa,Goodreads
753,Do You Remember?,Freida McFadden,https://www.goodreads.com/book/show/59817247-d...,3.98,105000,Tess Strebel can’t recognize her own face.She ...,2022,freida,Goodreads
103,"The Burning God (The Poppy War, #3)",R.F. Kuang,https://www.goodreads.com/book/show/45857086-t...,4.3,132000,"The exciting end to The Poppy War trilogy, R. ...",2020,r.f.,Goodreads


In [65]:
# Column dtypes and non-null counts, to confirm the cleaning step left no missing values.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1015 entries, 0 to 1014
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           1015 non-null   object 
 1   author          1015 non-null   object 
 2   link            1015 non-null   object 
 3   rating          1015 non-null   float64
 4   ratings_count   1015 non-null   int64  
 5   description     1015 non-null   object 
 6   published_year  1015 non-null   int64  
 7   author_first    1015 non-null   object 
 8   source          1015 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 71.5+ KB


In [45]:
# Summary statistics for numeric and non-numeric columns alike.
df_all.describe(include="all")

Unnamed: 0,title,author,link,rating,ratings_count,description,published_year,author_first,source
count,1015,1015,1015,1015.0,1015.0,1015,1015.0,1015,1015
unique,998,600,1000,,,1000,,429,1
top,"Unfortunately Yours (A Vine Mess, #2)",Freida McFadden,https://www.goodreads.com/book/show/124936017-...,,,The war has begun. Both sides demand Diem’s al...,,jennifer,Goodreads
freq,2,18,2,,,2,,22,1015
mean,,,,3.981429,199946.8,,2022.014778,,
std,,,,0.275918,260503.0,,1.409596,,
min,,,,2.79,11200.0,,2020.0,,
25%,,,,3.79,76600.0,,2021.0,,
50%,,,,4.0,115000.0,,2022.0,,
75%,,,,4.17,200500.0,,2023.0,,


In [66]:
# Count rows per published_year (value_counts sorts by count, descending)
df_all["published_year"].value_counts()

published_year
2023    215
2020    200
2021    200
2022    200
2024    200
Name: count, dtype: int64

In [67]:
# Number of rows that exactly repeat an earlier row across all columns.
df_all.duplicated().sum()

15

In [68]:
# Remove fully duplicated rows in place, keeping the first occurrence.
# The index is not reset here, so the remaining labels keep their gaps.
df_all.drop_duplicates(inplace=True)

In [69]:
# Same summary as before, transposed (one row per column), after de-duplication.
df_all.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
title,1000.0,998.0,One by One,2.0,,,,,,,
author,1000.0,600.0,Freida McFadden,18.0,,,,,,,
link,1000.0,1000.0,https://www.goodreads.com/book/show/52578297-t...,1.0,,,,,,,
rating,1000.0,,,,3.98327,0.275731,2.79,3.79,4.0,4.17,4.76
ratings_count,1000.0,,,,201622.796,262069.491392,11200.0,76600.0,116000.0,202250.0,3000000.0
description,1000.0,1000.0,Between life and death there is a library.When...,1.0,,,,,,,
published_year,1000.0,,,,2022.0,1.414921,2020.0,2021.0,2022.0,2023.0,2024.0
author_first,1000.0,429.0,jennifer,22.0,,,,,,,
source,1000.0,1.0,Goodreads,1000.0,,,,,,,


Fun things to explore
* Most common word in titles
* Title word length trends

In [70]:
# Independent copy of the de-duplicated frame; `df` is what gets passed to the
# author enrichment step below.
df = df_all.copy()

#### Feature Engineering: Get Author's Country and Gender

In [71]:
# Infer an author's gender from the pronouns that appear in their bio
def guess_gender(text: str) -> str:
    """Return "female", "male" or "unknown" from pronoun frequency.

    Counts case-insensitive, whole-word occurrences of "she" and "he" in
    ``text``. The more frequent pronoun decides the result; a tie (including
    a text with neither pronoun) gives "unknown".
    """
    lowered = text.lower()
    she_count, he_count = (
        len(re.findall(pattern, lowered)) for pattern in (r'\bshe\b', r'\bhe\b')
    )
    if she_count == he_count:
        return "unknown"
    return "female" if she_count > he_count else "male"

#  Async author‐URL extractor
async def extract_author_url(book_url: str, session: aiohttp.ClientSession) -> str:
    """Fetch a book page and return the href of its author link.

    Two page layouts are tried in order: the older one (``a.authorName``) and
    the newer one (``.FeaturedPerson__infoPrimary a.ContributorLink``).

    Raises:
        RuntimeError: if neither selector yields a link with an href.
        Whatever ``resp.raise_for_status()`` raises for a non-success response.
    """
    async with session.get(book_url, headers=HEADERS) as resp:
        resp.raise_for_status()
        html = await resp.text()

    page = BeautifulSoup(html, "html.parser")
    author_link_selectors = (
        "a.authorName",                                      # old layout
        ".FeaturedPerson__infoPrimary a.ContributorLink",    # new React layout
    )
    for selector in author_link_selectors:
        link = page.select_one(selector)
        if link and link.get("href"):
            return link["href"]

    raise RuntimeError("no author link")


# Async author‐meta fetcher
async def fetch_author_meta(author_url: str, session: aiohttp.ClientSession) -> tuple[str, str]:
    """Fetch an author page and return ``(country, gender)``.

    ``country`` is the first non-empty piece of text that follows the "Born"
    label on the page ("" when there is no such label or no text after it).
    NOTE(review): the text is taken verbatim, so it may contain more than a
    country name — confirm against real pages.

    ``gender`` is ``guess_gender`` applied to the author's bio text.
    """
    async with session.get(author_url, headers=HEADERS) as resp:
        resp.raise_for_status()
        html = await resp.text()
    soup = BeautifulSoup(html, "html.parser")

    # --- country: walk the siblings after the "Born" label -------------------
    country = ""
    born_label = soup.find("div", class_="dataTitle", string=re.compile(r"^\s*Born\s*$"))
    if born_label:
        for node in born_label.next_siblings:
            if isinstance(node, NavigableString):
                candidate = node.strip()
            elif isinstance(node, Tag):
                candidate = node.get_text(strip=True)
                # Skip elements whose only text is "clear".
                if candidate.lower() == "clear":
                    continue
            else:
                continue
            if candidate:
                country = candidate
                break

    # --- gender: pronoun heuristic over the bio text ------------------------
    bio_node = soup.select_one("div.aboutAuthorInfo")
    if not bio_node:
        bio_node = soup.find(id=re.compile(r"freeTextContainerauthor"))
    bio_text = bio_node.get_text(" ", strip=True) if bio_node else ""

    return country, guess_gender(bio_text)


# Orchestrator: for each book URL find its author and meta, one fetch per author
async def enrich_books_with_authors_async(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``author_country`` and ``author_gender`` added.

    For every URL in ``df["link"]`` the book page is fetched to find the author
    URL, then the author page is fetched for the metadata. At most
    ``MAX_CONCURRENCY`` books are processed at the same time.

    Author pages are shared through a dict of in-flight tasks, so concurrent
    books by the same author await one request instead of each issuing their
    own. A plain result cache cannot do this: every task would pass the
    "not in cache" check before the first fetch finished.

    Lookups stay best-effort: a book whose lookup fails gets ``("", "unknown")``.
    Failures are counted and summarised in one printed line instead of being
    dropped silently.
    """
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    fallback = ("", "unknown")
    author_tasks: dict[str, asyncio.Task] = {}
    failures: list[tuple[str, str]] = []

    async with aiohttp.ClientSession(timeout=TIMEOUT) as session:

        async def author_meta_once(author_url: str):
            """Await the single shared fetch for this author URL."""
            task = author_tasks.get(author_url)
            if task is None:
                task = asyncio.create_task(fetch_author_meta(author_url, session))
                author_tasks[author_url] = task
            try:
                return await task
            except Exception:
                # Do not keep failed fetches: a later book may retry the author.
                if author_tasks.get(author_url) is task:
                    del author_tasks[author_url]
                raise

        async def handle_book(book_url: str):
            async with sem:
                try:
                    author_url = await extract_author_url(book_url, session)
                    return await author_meta_once(author_url)
                except Exception as e:
                    failures.append((book_url, repr(e)))
                    return fallback

        # launch one task per book; gather preserves the input order
        tasks = [asyncio.create_task(handle_book(url)) for url in df["link"]]
        results = await asyncio.gather(*tasks)

    if failures:
        print(
            f"⚠️ Author lookup failed for {len(failures)} of {len(results)} books; "
            f"first error: {failures[0][1]}"
        )

    # unpack into new columns (list comprehensions also work for an empty frame,
    # where zip(*results) would fail to unpack)
    out = df.copy()
    out["author_country"] = [country for country, _ in results]
    out["author_gender"] = [gender for _, gender in results]
    return out


# Usage in notebook
# `df` must have a "link" column of book URLs. This cell performs the network
# scraping (two page fetches per book at most), so it is the slow step.
enriched = await enrich_books_with_authors_async(df)


In [73]:
# Drop fully duplicated rows from the enriched frame, in place.
enriched.drop_duplicates(inplace=True)

✅ Saved cleaned data to data/processed/clean_books_2020_2024.csv


In [76]:
# Work on a copy so the gender post-processing below leaves `enriched` untouched.
df_enriched = enriched.copy()

In [78]:
# Rows per inferred gender, ordered alphabetically by label.
df_enriched["author_gender"].value_counts().sort_index()

author_gender
female     597
male       102
unknown    301
Name: count, dtype: int64

In [117]:
# Scraping Goodreads left 301 rows with author_gender == "unknown", so an API is
# used next to guess the gender; if that fails too the row is labelled
# unknown/non-binary.
# 1) Prepare gender_source col: rows already resolved from the bio are marked
#    "goodreads", everything else starts as "unknown".
df_enriched["gender_source"] = (
    df_enriched["author_gender"]
    .isin(["male", "female"])
    .map({True: "goodreads", False: "unknown"})
)

# 2) Helper to call NamSor
def query_namsor(name: str = "") -> dict:
    """Query NamSor's genderFull endpoint with a full name.

    Args:
        name: The author's full name. It is percent-encoded before being
            appended to the URL path, so spaces, slashes, '?' or '#' in a name
            cannot change which URL is requested.

    Returns:
        The decoded JSON response. The caller reads the keys 'likelyGender'
        and 'probabilityCalibrated' from it.

    Raises:
        RuntimeError: if NAMSOR_API_KEY is not configured.
        requests.RequestException: on timeout, connection failure or a
            non-success HTTP status.
    """
    if not NAMSOR_API_KEY:
        raise RuntimeError(
            "NAMSOR_API_KEY is not set; add it to the environment or a .env file"
        )
    headers = {
        "X-API-KEY": NAMSOR_API_KEY,
        "Accept": "application/json"
    }
    url = NAMSOR_URL + quote(name, safe="")
    # Without a timeout a stalled connection would block the notebook forever.
    resp = requests.get(url, headers=headers, timeout=15)
    resp.raise_for_status()
    return resp.json()

# 3) Identify authors needing a NamSor lookup (one row per distinct author)
mask_unknown = df_enriched["author_gender"] == "unknown"
unknown_authors = (
    df_enriched.loc[mask_unknown, ["author", "link"]]
    .drop_duplicates("author")
    .reset_index(drop=True)
)

# 4) Call NamSor for each unique unknown author and remember the answer by name
namsor_cache = {}
for raw_name in unknown_authors["author"]:
    name = raw_name.strip()
    try:
        result = query_namsor(name=name)
        gender = result.get("likelyGender", "unknown")              # 'male'/'female'/'unknown'
        confidence = result.get("probabilityCalibrated", 0.0)       # between 0 and 1
    except Exception as e:
        # Best effort: report the failure and treat the author as unresolved.
        print(f"⚠️ NamSor failed for {name!r}: {e}")
        gender, confidence = "unknown", 0.0

    # Low-confidence answers are not trusted.
    if confidence < 0.85:
        gender = "unknown/non-binary"

    namsor_cache[name] = {"gender": gender, "confidence": confidence}
    sleep(0.5)  # respect rate limits

# 5) Apply the NamSor results back to your main df
def fill_from_namsor(row):
    """Resolve one row whose ``author_gender`` is still "unknown".

    Rows with any other ``author_gender`` are returned untouched. Otherwise the
    author is looked up in ``namsor_cache``: a cached "male"/"female" is
    adopted, anything else (including an author missing from the cache)
    becomes "unknown/non-binary". ``gender_source`` is set to "namsor" either way.
    """
    if row["author_gender"] != "unknown":
        return row

    not_found = {"gender":"unknown/non-binary","confidence":0}
    guessed = namsor_cache.get(row["author"], not_found)["gender"]

    row["author_gender"] = guessed if guessed in ("male", "female") else "unknown/non-binary"
    row["gender_source"] = "namsor"
    return row

# Row-wise pass: only rows whose author_gender is still "unknown" are changed.
df_final = df_enriched.apply(fill_from_namsor, axis=1)

# 6) Inspect how many remain unknown, and where each label came from
print(df_final["author_gender"].value_counts())
print(df_final["gender_source"].value_counts())


author_gender
female                615
unknown/non-binary    230
male                  155
Name: count, dtype: int64
gender_source
goodreads    699
namsor       301
Name: count, dtype: int64


In [120]:
# Summary statistics for the rows still labelled "unknown/non-binary" after NamSor
df_final[df_final["author_gender"] == "unknown/non-binary"].describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
title,230.0,230.0,The Guest List,1.0,,,,,,,
author,230.0,124.0,Ali Hazelwood,10.0,,,,,,,
link,230.0,230.0,https://www.goodreads.com/book/show/52656911-t...,1.0,,,,,,,
rating,230.0,,,,3.957391,0.272232,3.15,3.75,3.97,4.1575,4.76
ratings_count,230.0,,,,205145.217391,252450.306319,28900.0,76825.0,120000.0,215000.0,2000000.0
description,230.0,230.0,A wedding celebration turns dark and deadly in...,1.0,,,,,,,
published_year,230.0,,,,2022.130435,1.373624,2020.0,2021.0,2022.0,2023.0,2024.0
author_first,230.0,115.0,ali,10.0,,,,,,,
source,230.0,1.0,Goodreads,230.0,,,,,,,
author_country,230.0,36.0,,118.0,,,,,,,


In [122]:
# df_final columns used in this cell:
#   'author'          — the author name
#   'author_gender'   — 'male' / 'female' / 'unknown/non-binary' at this point
#   'gender_source'   — 'goodreads' / 'namsor' at this point

# 1) find the unique authors still “unknown”
unknown_authors = sorted(df_final.loc[
    df_final['author_gender']=='unknown/non-binary', 'author'
].unique())

print(f"{len(unknown_authors)} unknown authors to classify.\n")

# 2) walk through each one and ask for input.
# Only recognised answers are accepted: a typo or an accidental Enter asks
# again instead of silently recording the author as non-binary.
GENDER_ANSWERS = {
    'm': 'male', 'male': 'male',
    'f': 'female', 'female': 'female',
    'n': 'non-binary', 'non-binary': 'non-binary', 'nonbinary': 'non-binary',
}
manual_map = {}
for name in unknown_authors:
    while True:
        ans = input(f"Enter gender for '{name}' ([m]ale / [f]emale / [n]on-binary): ").strip().lower()
        if ans in GENDER_ANSWERS:
            break
        print(f"Unrecognised answer {ans!r}; please enter m, f or n.")
    manual_map[name] = (GENDER_ANSWERS[ans], 'manual')

# 3) apply your manual_map back into df_final
def apply_manual(row):
    """Return the (author_gender, gender_source) pair for one row as a Series.

    Authors present in ``manual_map`` get their manually entered pair; every
    other row keeps the values it already has.
    """
    columns = ['author_gender','gender_source']
    current = (row['author_gender'], row['gender_source'])
    return pd.Series(manual_map.get(row['author'], current), index=columns)

# Copy first so the two-column assignment below writes to a fresh frame.
df_final = df_final.copy()
# apply_manual returns a two-element Series per row, so the row-wise apply
# yields a two-column frame that overwrites both columns at once.
df_final[['author_gender','gender_source']] = df_final.apply(apply_manual, axis=1)

# 4) quick check: label counts, and how many rows each source resolved
print(df_final['author_gender'].value_counts())
print(df_final['gender_source'].value_counts())

124 unknown authors to classify.



author_gender
female        831
male          165
non-binary      4
Name: count, dtype: int64
gender_source
goodreads    699
manual       230
namsor        71
Name: count, dtype: int64


In [123]:
# Persist the final enriched table. The output directory is created first so
# the write does not fail on a checkout that has no data/processed folder yet.
output_path = Path("../data/processed/clean_books_2020_2024.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(output_path, index=False)
print("✅ Saved cleaned data to data/processed/clean_books_2020_2024.csv")

✅ Saved cleaned data to data/processed/clean_books_2020_2024.csv
