In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# URL of the SEP Table of Contents
URL = "https://plato.stanford.edu/contents.html"

def clean_text(text):
    """Removes parentheses and their contents from text, preserving names."""
    return re.sub(r'\s*\([^)]*\)', '', text).strip()

def scrape_structure():
    """Scrapes the SEP ToC while recording every entry, preserving headers and subheaders correctly, including hyperlinked entries."""
    response = requests.get(URL)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    structured_data = []
    content_div = soup.find("div", id="content")

    if content_div:
        current_header = ""
        for li in content_div.find_all("li", recursive=True):  # Capture all list items
            link = li.find("a")
            href = link["href"] if link else ""
            full_link = f"https://plato.stanford.edu{href}" if href else ""
            
            # Extract raw text without subitems, including hyperlinks
            text_parts = [clean_text(x) for x in li.find_all(string=True, recursive=False) if clean_text(x)]
            hyperlink_text = clean_text(link.get_text(strip=True)) if link else ""
            
            text = " ".join(text_parts).strip()
            if hyperlink_text and hyperlink_text not in text:
                text = f"{text} {hyperlink_text}".strip()
            
            # Determine hierarchy level
            level = len(li.find_parents("ul"))
            
            # Track headers properly
            if level == 1:
                current_header = text  # Set new header
                structured_data.append({
                    "title": text,
                    "link": full_link,
                    "level": level
                })
            else:
                structured_data.append({
                    "title": f"{current_header}: {text}",
                    "link": full_link,
                    "level": level
                })
    
    return structured_data


In [51]:

data = scrape_structure()
df = pd.DataFrame(data)
df.drop_duplicates(subset=["title"], keep="first", inplace=True)
df.to_csv("sep_contents.csv", index=False)

print(f"Scraped {len(df)} entries with hierarchy preserved.")
df.head()


Scraped 2636 entries with hierarchy preserved.


Unnamed: 0,title,link,level
0,abduction,https://plato.stanford.eduentries/abduction/,1
1,"Abelard [Abailard], Peter",https://plato.stanford.eduentries/abelard/,1
2,Abhidharma,https://plato.stanford.eduentries/abhidharma/,1
3,abilities,https://plato.stanford.eduentries/abilities/,1
4,Abner of Burgos,https://plato.stanford.eduentries/abner-burgos/,1


In [52]:
import pandas as pd
import csv

# Load the CSV as strings
df = pd.read_csv("sep_contents.csv", dtype=str)

# Manually apply quotes to the first two columns while leaving 'level' unquoted
df["title"] = df["title"].apply(lambda x: f'"{x}"' if not x.startswith('"') else x)
df["link"] = df["link"].apply(lambda x: f'"{x}"' if not x.startswith('"') else x)

# Save the CSV with minimal quoting (ensuring only title and link are quoted)
df.to_csv("sep_contents_fixed.csv", index=False, quoting=csv.QUOTE_NONE, escapechar='\\')

print("Fixed CSV saved as sep_contents_fixed.csv")


Fixed CSV saved as sep_contents_fixed.csv
