In [None]:
import requests
from bs4 import BeautifulSoup
import re
from transformers import pipeline
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------------
# Step 1: Scrape chapters from Fandom
# -----------------------------
# Fandom article whose "Plot" section contains one sub-heading per chapter.
url = "https://harrypotter.fandom.com/wiki/Harry_Potter_and_the_Deathly_Hallows"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
content_div = soup.find("div", class_="mw-parser-output")
if content_div is None:
    # Without this check the loop below fails with an opaque AttributeError.
    raise RuntimeError(
        f"Could not find <div class='mw-parser-output'> in {url}; "
        "the page layout may have changed."
    )

# Each entry is one chapter: the heading on the first line, followed by the
# chapter's paragraphs separated by blank lines.
chapter_splits = []
current_chapter = None      # text of the chapter currently being accumulated
in_plot_section = False     # True only between the "Plot" h2 and the next h2

for elem in content_div.children:
    if elem.name == "h2":  # top-level heading
        heading_text = elem.get_text().strip()
        if "Plot" in heading_text:
            in_plot_section = True
            continue
        elif in_plot_section:
            # reached a new h2 after plot => stop
            break
    if not in_plot_section:
        continue

    if elem.name in ["h3", "h4"]:  # chapter headings
        text = elem.get_text().strip()
        if "Chapter" in text or "Epilogue" in text:
            # A new chapter heading closes the previous chapter, if any.
            if current_chapter:
                chapter_splits.append(current_chapter)
            current_chapter = text + "\n"
    elif elem.name == "p" and current_chapter:
        # Paragraphs seen before the first chapter heading are ignored.
        current_chapter += elem.get_text().strip() + "\n\n"

# Append last chapter
if current_chapter:
    chapter_splits.append(current_chapter)

if not chapter_splits:
    # Fail here with a clear message instead of an IndexError on the preview
    # below (or a confusing failure in a later cell).
    raise RuntimeError(
        "No chapters were found in the 'Plot' section; "
        "the page structure may have changed."
    )

print(f"Found {len(chapter_splits)} chapters. Example:")
print(chapter_splits[0][:500])

In [None]:
# -----------------------------
# Step 2: Setup Hugging Face classifier
# -----------------------------
# Use the first CUDA GPU when one is available and fall back to CPU (-1)
# otherwise, so the notebook also runs on machines without a GPU instead of
# failing on a hard-coded device index.
try:
    import torch

    inference_device = 0 if torch.cuda.is_available() else -1
except ImportError:
    # torch is not installed; let the pipeline run on CPU.
    inference_device = -1

classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=inference_device,
)
# Labels the classifier chooses between; the darkness score is built from both.
candidate_labels = ["dark", "hope"]

In [None]:
# -----------------------------
# Step 3: Helper: chunk text
# -----------------------------
def chunk_text(text, max_len=500):
    """Split ``text`` into consecutive pieces of at most ``max_len`` words.

    Words are obtained with ``str.split()`` and re-joined with single spaces,
    so the original whitespace (including newlines) is not preserved.

    Args:
        text: The string to split.
        max_len: Maximum number of whitespace-separated words per piece.

    Returns:
        A list of strings in original order; empty when ``text`` has no words.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_len)
    return [" ".join(tokens[begin:begin + max_len]) for begin in starts]

In [None]:
# -----------------------------
# Step 4: Analyze darkness per chapter
# -----------------------------
def _chunk_darkness(chunk):
    """Return the classifier's "dark" score minus its "hope" score for one chunk."""
    result = classifier(chunk, candidate_labels)
    # Look scores up by label name rather than relying on the result's ordering.
    score_by_label = dict(zip(result["labels"], result["scores"]))
    return score_by_label["dark"] - score_by_label["hope"]


# One {"Chapter": title, "Darkness": mean chunk score} record per chapter,
# in the same order as chapter_splits.
chapter_scores = []

for chap_idx, chap in enumerate(chapter_splits, start=1):
    # The first line of each entry is the heading. Remove bracketed footnote
    # markers such as "[1]" or "[]", then strip the whitespace they leave
    # behind so it does not end up in plot labels.
    title = re.sub(r'\[\d*\]', '', chap.split("\n")[0]).strip()

    print(f"Processing {title} ({chap_idx}/{len(chapter_splits)})...")

    # The whole entry (heading line included) is scored, in 500-word chunks.
    chunks = chunk_text(chap, max_len=500)
    if not chunks:
        # Nothing to classify; skip rather than divide by zero below.
        print(f"  Skipping {title!r}: no text to score.")
        continue

    chunk_scores = [_chunk_darkness(chunk) for chunk in chunks]

    # Unweighted mean: every chunk counts equally regardless of its length.
    avg_score = sum(chunk_scores) / len(chunk_scores)
    chapter_scores.append({"Chapter": title, "Darkness": avg_score})

In [None]:
# -----------------------------
# Step 5: Create DataFrame
# -----------------------------
# Build the frame in one expression; Chapter_Num is the 1-based position of
# each record, which preserves the original chapter order.
df = pd.DataFrame(chapter_scores).assign(
    Chapter_Num=lambda frame: frame.index + 1
)

In [None]:
# -----------------------------
# Step 6: Plot darkness over chapters
# -----------------------------
# Map each chapter's score onto [0, 1] to pick a colour from the colormap.
# When all scores are equal (e.g. only one chapter) the range is zero, and
# dividing by it would produce NaN colour positions; use the colormap midpoint.
darkness = df["Darkness"]
darkness_range = darkness.max() - darkness.min()
if darkness_range > 0:
    color_positions = (darkness - darkness.min()) / darkness_range
else:
    color_positions = pd.Series(0.5, index=darkness.index)

fig, ax = plt.subplots(figsize=(16, 6))
bars = ax.bar(
    df["Chapter_Num"],
    darkness,
    color=plt.cm.viridis(color_positions.to_numpy()),
)
# One tick per chapter, labelled with the chapter title (rotated to fit).
ax.set_xticks(df["Chapter_Num"])
ax.set_xticklabels(df["Chapter"], rotation=90)
ax.set_xlabel("Chapter")
ax.set_ylabel("Darkness Score (dark vs hope)")
ax.set_title("Darkness Score by Chapter - Deathly Hallows")
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

# -----------------------------
# Optional: Sort by darkness
# -----------------------------
# Darkest chapters first; ignore_index renumbers the rows 0..n-1 after sorting.
df_sorted = df.sort_values(by="Darkness", ascending=False, ignore_index=True)
print(df_sorted.head(10))