# 📘 Notebook 01: Data Collection
# Emotional Geography of Books – Shraddha

In [1]:
import requests
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
from time import sleep

In [8]:
# Load every per-year Goodreads export from the raw-data folder and stack them
# into a single frame.
# sorted() keeps the row order of df_all reproducible: Path.glob gives no
# ordering guarantee, so the concat order could otherwise differ between runs.
files = sorted(Path("../data/raw").glob("goodreads_books_*.csv"))
if not files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        "No files matching goodreads_books_*.csv were found in ../data/raw"
    )

all_dfs = []

for file in files:
    # The text after the last "_" in the file name is used as the year,
    # e.g. goodreads_books_2021.csv -> 2021.
    year = int(file.stem.split("_")[-1])
    df = pd.read_csv(file)
    df["published_year"] = year
    all_dfs.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df_all = pd.concat(all_dfs, ignore_index=True)
print(f"📚 Total books: {len(df_all)}")

📚 Total books: 1015


In [11]:
# Remove " ratings" text and convert to int
def clean_ratings_count(value):
    """Convert a ratings-count cell into an integer.

    Handles text such as "1,234 ratings", "1 rating", "12.5k ratings" or "3M",
    as well as cells that are already numeric (so re-running the cleaning cell
    on an already-cleaned column is safe).

    Args:
        value: Raw cell value; a string, a number, or a missing value.

    Returns:
        The count as an int. Missing or unparseable values yield 0.
    """
    if pd.isna(value):
        return 0

    # str() lets numeric cells through; the original .lower() call raised
    # AttributeError on anything that was not a string.
    text = (
        str(value)
        .lower()
        .replace("ratings", "")
        .replace("rating", "")  # singular form, e.g. "1 rating"
        .replace(",", "")
        .strip()
    )

    # Peel off a trailing magnitude suffix ("k" / "m") if there is one.
    multiplier = 1
    for suffix, factor in {"k": 1_000, "m": 1_000_000}.items():
        if text.endswith(suffix):
            text, multiplier = text[:-1].strip(), factor
            break

    # Exact integer path first, so plain counts never go through float.
    try:
        return int(text) * multiplier
    except ValueError:
        pass

    # Decimal path ("12.5" + k, or a float cell such as 1234.0).
    try:
        return int(float(text) * multiplier)
    except (ValueError, OverflowError):
        # Not a number at all (or "nan"/"inf"): keep the best-effort fallback.
        return 0

# Turn the raw ratings-count text into integers, one cell at a time.
df_all["ratings_count"] = df_all["ratings_count"].map(clean_ratings_count)

# Average rating as a number; values that cannot be parsed become NaN.
df_all["rating"] = pd.to_numeric(df_all["rating"], errors="coerce")


def _first_name_token(full_name):
    """Return the lower-cased first whitespace-separated token, or "" if empty."""
    if not full_name:
        return ""
    return full_name.split()[0].lower()


# Missing authors become "", surrounding whitespace is trimmed, and the first
# token of each name is kept in its own lower-cased column.
df_all["author"] = df_all["author"].fillna("").str.strip()
df_all["author_first"] = df_all["author"].map(_first_name_token)

In [12]:
def guess_gender(name):
    """Guess a gender label from a first name using a small hard-coded lookup.

    Matching ignores letter case and surrounding whitespace, so callers do not
    have to normalise the name beforehand.

    Args:
        name: First name to look up.

    Returns:
        "male" or "female" when the name is in the corresponding list,
        otherwise "unknown" (including for empty or non-string input).
    """
    male = {"john", "david", "matt", "kevin", "daniel"}
    female = {"sarah", "emma", "hannah", "shraddha", "emily"}

    # Missing values (None / NaN) and other non-strings cannot be matched.
    if not isinstance(name, str):
        return "unknown"

    key = name.strip().lower()
    if key in male:
        return "male"
    if key in female:
        return "female"
    return "unknown"

# Label each row by looking up the author's first name with guess_gender.
df_all["author_gender"] = df_all["author_first"].map(guess_gender)

In [14]:
# Tag every row with the site the data came from.
df_all["source"] = "Goodreads"

# Create the output folder first: to_csv does not create missing parent
# directories, so on a fresh checkout the save would otherwise fail.
output_path = Path("../data/processed/clean_books_2020_2024.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the saved file.
df_all.to_csv(output_path, index=False)
print("✅ Saved cleaned data to data/processed/clean_books_2020_2024.csv")

✅ Saved cleaned data to data/processed/clean_books_2020_2024.csv
