# 📘 Notebook 01: Data Collection
# Emotional Geography of Books – Shraddha

Notebook 01: Data Collection
=============================
This notebook collects and processes data related to books and authors.
It fetches author metadata, extracts country and gender information,
and enriches the author data with this additional information.

In [39]:
import requests
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
from time import sleep
from bs4 import BeautifulSoup, NavigableString, Tag
from typing import Tuple

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}



In [41]:
#Read data from raw data
files = list(Path("../data/raw").glob("goodreads_books_*.csv"))
all_dfs = []

for file in files:
    year = int(file.stem.split("_")[-1])
    df = pd.read_csv(file)
    df["published_year"] = year
    all_dfs.append(df)

df_all = pd.concat(all_dfs, ignore_index=True)
print(f"📚 Total books: {len(df_all)}")

📚 Total books: 1015


In [42]:
# Remove " ratings" text and convert to int
def clean_ratings_count(value):
    if pd.isna(value):
        return 0
    value = value.lower().replace("ratings", "").strip()
    multipliers = {"k": 1_000, "m": 1_000_000}
    for suffix, multiplier in multipliers.items():
        if value.endswith(suffix):
            return int(float(value[:-1]) * multiplier)
    try:
        return int(value.replace(",", ""))
    except:
        return 0

df_all["ratings_count"] = df_all["ratings_count"].apply(clean_ratings_count)

# Ratings to float
df_all["rating"] = pd.to_numeric(df_all["rating"], errors="coerce")

# Author name cleanup + first name
df_all["author"] = df_all["author"].fillna("").str.strip()
df_all["author_first"] = df_all["author"].apply(lambda x: x.split()[0].lower() if x else "")

In [43]:
df_all["source"] = "Goodreads"
df_all.to_csv("../data/processed/clean_books_2020_2024.csv", index=False)
print("✅ Saved cleaned data to data/processed/clean_books_2020_2024.csv")

✅ Saved cleaned data to data/processed/clean_books_2020_2024.csv


### Basic EDA

In [7]:
#Read data from /data/processed/clean_books_2020_2024.csv
df_all = pd.read_csv("../data/processed/clean_books_2020_2024.csv")

In [8]:
df_all.sample(10)

Unnamed: 0,title,author,link,rating,ratings_count,description,published_year,author_first,source
513,The Rachel Incident,Caroline O'Donoghue,https://www.goodreads.com/book/show/63094957-t...,4.08,112000,"A brilliantly funny novel about friends, lover...",2023,caroline,Goodreads
551,"Heat of the Everflame (Kindred's Curse, #3)",Penn Cole,https://www.goodreads.com/book/show/124936017-...,4.27,111000,The war has begun. Both sides demand Diem’s al...,2023,penn,Goodreads
211,Better Than the Movies (Better Than the Movies...,Lynn Painter,https://www.goodreads.com/book/show/55710822-b...,4.27,613000,"In this rom-com about rom-coms, in the spirit ...",2021,lynn,Goodreads
745,Family of Liars,E. Lockhart,https://www.goodreads.com/book/show/59586261-f...,3.77,113000,The prequel to We Were Liars takes readers bac...,2022,e.,Goodreads
692,"What Moves the Dead (Sworn Soldier, #1)",T. Kingfisher,https://www.goodreads.com/book/show/58724626-w...,3.86,111000,"When Alex Easton, a retired soldier, receives ...",2022,t.,Goodreads
1014,You Are Here,David Nicholls,https://www.goodreads.com/book/show/201465867-...,4.01,53500,Sometimes you need to get lost to find your wa...,2024,david,Goodreads
851,We Used to Live Here,Marcus Kliewer,https://www.goodreads.com/book/show/199798006-...,3.68,143000,From an author “destined to become a titan of ...,2024,marcus,Goodreads
614,A Fever in the Heartland: The Ku Klux Klan's P...,Timothy Egan,https://www.goodreads.com/book/show/61423989-a...,4.36,40500,A historical thriller by the Pulitzer and Nati...,2023,timothy,Goodreads
452,You Shouldn't Have Come Here,Jeneva Rose,https://www.goodreads.com/book/show/61458888-y...,3.35,230000,"Grace Evans, an overworked New Yorker looking ...",2023,jeneva,Goodreads
864,How to Solve Your Own Murder (Castle Knoll Fil...,Kristen Perrin,https://www.goodreads.com/book/show/181350367-...,3.79,80400,For fans of Knives Out and The Thursday Murder...,2024,kristen,Goodreads


In [9]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1015 entries, 0 to 1014
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           1015 non-null   object 
 1   author          1015 non-null   object 
 2   link            1015 non-null   object 
 3   rating          1015 non-null   float64
 4   ratings_count   1015 non-null   int64  
 5   description     1015 non-null   object 
 6   published_year  1015 non-null   int64  
 7   author_first    1015 non-null   object 
 8   source          1015 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 71.5+ KB


In [10]:
df_all.describe(include="all")

Unnamed: 0,title,author,link,rating,ratings_count,description,published_year,author_first,source
count,1015,1015,1015,1015.0,1015.0,1015,1015.0,1015,1015
unique,998,600,1000,,,1000,,429,1
top,"Unfortunately Yours (A Vine Mess, #2)",Freida McFadden,https://www.goodreads.com/book/show/124936017-...,,,The war has begun. Both sides demand Diem’s al...,,jennifer,Goodreads
freq,2,18,2,,,2,,22,1015
mean,,,,3.981429,199946.8,,2022.014778,,
std,,,,0.275918,260503.0,,1.409596,,
min,,,,2.79,11200.0,,2020.0,,
25%,,,,3.79,76600.0,,2021.0,,
50%,,,,4.0,115000.0,,2022.0,,
75%,,,,4.17,200500.0,,2023.0,,


In [11]:
#Count number of rows per published_year
df_all["published_year"].value_counts()

published_year
2023    215
2020    200
2021    200
2022    200
2024    200
Name: count, dtype: int64

In [12]:
df_all.duplicated().sum()

15

In [13]:
#Remove duplicates
df_all.drop_duplicates(inplace=True)

In [14]:
df_all.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
title,1000.0,998.0,One by One,2.0,,,,,,,
author,1000.0,600.0,Freida McFadden,18.0,,,,,,,
link,1000.0,1000.0,https://www.goodreads.com/book/show/52578297-t...,1.0,,,,,,,
rating,1000.0,,,,3.98327,0.275731,2.79,3.79,4.0,4.17,4.76
ratings_count,1000.0,,,,201622.796,262069.491392,11200.0,76600.0,116000.0,202250.0,3000000.0
description,1000.0,1000.0,Between life and death there is a library.When...,1.0,,,,,,,
published_year,1000.0,,,,2022.0,1.414921,2020.0,2021.0,2022.0,2023.0,2024.0
author_first,1000.0,429.0,jennifer,22.0,,,,,,,
source,1000.0,1.0,Goodreads,1000.0,,,,,,,


Fun things to explore
* Most common word in titles
* Title word length trends

In [18]:
df = df_all.copy()

#### Feature Engineering: Get Author's Country

In [19]:
df_try = df_all.head(5)

In [None]:
# Function to guess gender based on the pronouns used in the author's bio
def guess_gender(text: str) -> str:
    text = text.lower()
    she = len(re.findall(r'\bshe\b', text))
    he  = len(re.findall(r'\bhe\b', text))
    if she > he:   return "female"
    if he  > she:  return "male"
    return "unknown"

# Function to extract author URL from book page
def extract_author_url_from_book(book_url: str, session: requests.Session) -> str:
    """
    Load a book page and find the author-show link. Supports both the old
    <a class="authorName" …> and the new React <a class="ContributorLink" …>.
    """
    r = session.get(book_url, headers=HEADERS, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Try old Goodreads layout first
    a = soup.select_one("a.authorName")
    if a and a.get("href"):
        author_url = a["href"]
    else:
        # Fallback to new React layout
        a = soup.select_one(".FeaturedPerson__infoPrimary a.ContributorLink")
        if not (a and a.get("href")):
            raise RuntimeError("Could not find author link on book page")
        author_url = a["href"]

    return author_url

# Funtion to extract author country and bio from the author page
def fetch_author_meta(author_url: str, session: requests.Session) -> tuple[str, str]:
    """Fetch author page, return (country, gender)."""
    r = session.get(author_url, headers=HEADERS, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # -- country --
    country = ""
    born_div = soup.find("div", class_="dataTitle", text=re.compile(r"^\s*Born\s*$"))
    if born_div:
        for sib in born_div.next_siblings:
            if isinstance(sib, NavigableString):
                txt = sib.strip()
                if txt:
                    country = txt
                    break
            if isinstance(sib, Tag):
                txt = sib.get_text(strip=True)
                if txt and txt.lower() != "clear":
                    country = txt
                    break

    # -- bio --
    bio_container = (
        soup.select_one("div.aboutAuthorInfo")
        or soup.find(id=re.compile(r"freeTextContainerauthor"))
    )
    bio_text = bio_container.get_text(" ", strip=True) if bio_container else ""
    # -- gender guess --
    gender = guess_gender(bio_text)

    return country, gender

# Function to enrich df with author country and gender
def enrich_books_with_authors(df: pd.DataFrame) -> pd.DataFrame:
    """
    Given a df with a column 'link' pointing to the *book* page,
    scrapes each book to find its author page, then fetches country+gender.
    Caches by author_url so we only call each author once.
    """
    session = requests.Session()
    cache: dict[str, tuple[str,str]] = {}

    author_countries, author_genders = [], []

    for book_url in df["link"]:
        try:
            author_url = extract_author_url_from_book(book_url, session)
            if author_url not in cache:
                cache[author_url] = fetch_author_meta(author_url, session)
            country, gender = cache[author_url]
        except Exception as e:
            print(f"⚠️ failed for book {book_url}: {e}")
            country, gender = "", "unknown"

        author_countries.append(country)
        author_genders.append(gender)

    out = df.copy()
    out["author_country"] = author_countries
    out["author_gender"]  = author_genders
    return out

# Usage:
df_all = enrich_books_with_authors(df_all)

  born_div = soup.find("div", class_="dataTitle", text=re.compile(r"^\s*Born\s*$"))
