Sample testing for one page

In [None]:
import requests
from bs4 import BeautifulSoup
import json

# URL of the article
url = 'https://plato.stanford.edu/entries/aristotle/'

# Send a GET request to fetch the HTML content
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Initialize a dictionary to hold the sections
article_content = {}

# Find the main content division
main_content = soup.find('div', {'id': 'main-text'})

# Initialize variables to track the current section
current_section = None
current_text = []

# Iterate through the elements in the main content
for element in main_content.find_all(['h2', 'h3', 'p', 'ul', 'ol'], recursive=False):
    if element.name in ['h2', 'h3']:
        # If there's an existing section being processed, save its content
        if current_section:
            article_content[current_section] = ' '.join(current_text).strip()
            current_text = []
        # Start a new section
        current_section = element.get_text().strip()
    elif element.name in ['p', 'ul', 'ol'] and current_section:
        # Append paragraph or list items to the current section's text
        current_text.append(element.get_text().strip())

# Add the last section to the dictionary
if current_section and current_text:
    article_content[current_section] = ' '.join(current_text).strip()

# Convert the dictionary to a JSON object
json_output = json.dumps(article_content, indent=4, ensure_ascii=False)

# Save the JSON object to a file
with open('aristotle_article.json', 'w', encoding='utf-8') as file:
    file.write(json_output)

print("Scraping complete. The article content has been saved to 'abduction_article.json'.")


Scraping complete. The article content has been saved to 'abduction_article.json'.


Fixing CSV error

In [6]:
import csv

# File paths
csv_file = "/Users/rohansharma/Desktop/Code/philRAG/SEP_collection_cleaning/sep_contents_fixed.csv"

# Read and update the CSV file
fixed_rows = []
with open(csv_file, "r", encoding="utf-8") as file:
    reader = csv.DictReader(file)
    for row in reader:
        row["link"] = row["link"].replace("eduentries", "edu/entries")  # Fix link
        fixed_rows.append(row)

# Write the corrected CSV back to the original file
with open(csv_file, "w", encoding="utf-8", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["title", "link", "level"])
    writer.writeheader()
    writer.writerows(fixed_rows)

print(f"✅ Fixed hyperlinks in {csv_file}")


✅ Fixed hyperlinks in /Users/rohansharma/Desktop/Code/philRAG/SEP_collection_cleaning/sep_contents_fixed.csv


Getting JSONs for all entries

In [8]:
import os
import csv
import json
import requests
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm

# Paths
csv_file = "/Users/rohansharma/Desktop/Code/philRAG/SEP_collection_cleaning/sep_contents_fixed.csv"
output_dir = "/Volumes/BigDrive/phil_rag"
os.makedirs(output_dir, exist_ok=True)

# Read CSV file
entries = []
with open(csv_file, "r", encoding="utf-8") as file:
    reader = csv.DictReader(file)
    for row in reader:
        entries.append(row)

# Function to clean and format filenames
def sanitize_filename(title):
    return title.lower().replace(" ", "_").replace("[", "").replace("]", "").replace(",", "").replace("'", "").replace("/", "_")

# Function to scrape article with structured sections
def scrape_article(title, link):
    try:
        # Ensure link is properly formatted
        if not link.startswith("https://plato.stanford.edu/entries/"):
            print(f"Skipping invalid link: {link}")
            return None

        # Fetch page
        response = requests.get(link, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {title}: {response.status_code}")
            return None
        
        # Parse HTML
        soup = BeautifulSoup(response.content, "html.parser")
        main_content = soup.find("div", {"id": "main-text"})
        if not main_content:
            print(f"No main content found for {title}")
            return None

        # Extract structured sections
        article_content = {}
        current_section = None
        current_text = []

        for element in main_content.find_all(['h2', 'h3', 'p', 'ul', 'ol'], recursive=False):
            if element.name in ['h2', 'h3']:
                # Save the previous section
                if current_section:
                    article_content[current_section] = "\n\n".join(current_text).strip()
                # Start a new section
                current_section = element.get_text().strip()
                current_text = []
            elif element.name in ['p', 'ul', 'ol'] and current_section:
                current_text.append(element.get_text().strip())

        # Save the last section
        if current_section and current_text:
            article_content[current_section] = "\n\n".join(current_text).strip()

        # Structure data
        return {
            "title": title,
            "link": link,
            "level": 1,  # Adjust if levels vary
            "sections": article_content  # Structured content by section
        }
    
    except Exception as e:
        print(f"Error scraping {title}: {str(e)}")
        return None

# Process each entry and overwrite old files
for entry in tqdm(entries, desc="Scraping SEP Articles"):
    title = entry["title"]
    link = entry["link"]
    
    # Scrape content
    article_data = scrape_article(title, link)
    if article_data:
        # Save JSON file
        filename = f"{sanitize_filename(title)}.json"
        filepath = os.path.join(output_dir, filename)
        with open(filepath, "w", encoding="utf-8") as file:
            json.dump(article_data, file, indent=4, ensure_ascii=False)
        
        # Sleep to avoid rate limiting
        sleep(1)  # Adjust if necessary

print(f"✅ All articles saved in {output_dir}")


Scraping SEP Articles:  81%|████████  | 2139/2636 [51:20<11:44,  1.42s/it] 

Skipping invalid link: https://plato.stanford.edu#


Scraping SEP Articles: 100%|██████████| 2636/2636 [1:03:12<00:00,  1.44s/it]

✅ All articles saved in /Volumes/BigDrive/phil_rag



