### Web scraping module

In [1]:
!pip3 install requests bs4

Defaulting to user installation because normal site-packages is not writeable
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [None]:
import json
import os
import re
from datetime import datetime
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

def _first_meta_content(soup, *attr_filters):
    """Return the stripped ``content`` of the first usable <meta> tag, or None.

    Each entry of ``attr_filters`` is an attribute dict passed to ``soup.find``.
    Filters are tried in order; a tag whose ``content`` is missing or blank is
    skipped so that a later filter still gets a chance.
    """
    for attrs in attr_filters:
        tag = soup.find("meta", attrs)
        value = tag.get("content") if tag is not None else None
        if value and value.strip():
            return value.strip()
    return None


def _normalized_text(tag):
    """Return the tag's text with every run of whitespace collapsed to one space."""
    # get_text() without strip=True keeps the whitespace between inline nodes,
    # so "Hello <b>world</b>" stays "Hello world" instead of "Helloworld".
    return " ".join(tag.get_text().split())


def scrape_and_save_json(url, output_dir="knowledge_base"):
    """
    Fetch ``url``, extract its text and metadata, and save them as a JSON file.

    The file is written to ``<output_dir>/<site>.json``. The file name is derived
    from the URL's host only (leading ``www.`` dropped, hyphens turned into
    underscores), and the file is opened in write mode, so scraping another page
    of the same host overwrites the previous file.

    Title, author, publication date, category and content fall back to
    ``PLACEHOLDER_*`` strings when they cannot be extracted.

    Args:
        url: Absolute http(s) URL of the page to scrape.
        output_dir: Directory that receives the JSON file; created if missing.

    Returns:
        None. Success and failure are reported with ``print``; request errors
        and non-200 responses do not raise.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Only the network call can raise RequestException, so keep the try narrow.
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Error fetching {url}: {e}")
        return

    if response.status_code != 200:
        print(f"❌ Failed to fetch {url}, status {response.status_code}")
        return

    soup = BeautifulSoup(response.text, "html.parser")
    url_parts = urlsplit(url)

    # --- Metadata -----------------------------------------------------------
    # Tag.string is None for an empty <title> or one with nested markup, so
    # read the text through get_text() instead of calling .strip() on it.
    title_tag = soup.title
    title = _normalized_text(title_tag) if title_tag is not None else ""
    title = title or "PLACEHOLDER_TITLE"

    author = _first_meta_content(soup, {"name": "author"}) or "PLACEHOLDER_AUTHOR"

    date_published = _first_meta_content(
        soup,
        {"property": "article:published_time"},
        {"name": "date"},
    ) or "PLACEHOLDER_DATE"

    # Match against the path only, so a query string or fragment cannot end up
    # in the category and a trailing slash is not required.
    category_match = re.search(r"/category/([^/]+)", url_parts.path)
    category = category_match.group(1) if category_match else "PLACEHOLDER_CATEGORY"

    # --- Body text ----------------------------------------------------------
    # Drop paragraphs that contain only whitespace so they neither add blank
    # lines nor prevent the placeholder from being used.
    paragraphs = [text for text in map(_normalized_text, soup.find_all("p")) if text]
    content = "\n".join(paragraphs) if paragraphs else "PLACEHOLDER_CONTENT"

    # --- Site identity ------------------------------------------------------
    # .hostname excludes port and userinfo, which must not leak into the
    # display name or the file name.
    host = url_parts.hostname or ""
    if host.startswith("www."):
        host = host[len("www."):]
    website_name = host.replace("-", " ").title()
    base_url = f"{url_parts.scheme}://{url_parts.netloc}"

    # Build JSON structure
    kb_entry = {
        "website": {
            "name": website_name,
            "url": base_url
        },
        "documents": [
            {
                "id": "doc_001",
                "url": url,
                "content": content,
                "chunks": [
                    {
                        "chunk_id": "chunk_001",
                        "text": content,
                        # NOTE(review): looks like a placeholder vector; presumably
                        # real embeddings are filled in elsewhere - confirm.
                        "embedding": [0.0, 0.0, 0.0],
                        "metadata": {
                            "title": title,
                            "author": author,
                            "date_published": date_published,
                            "date_collected": datetime.now().strftime("%Y-%m-%d"),
                            "category": category,
                            "source": url
                        }
                    }
                ]
            }
        ]
    }

    # Save to JSON. Anything other than word characters, dots and hyphens is
    # replaced so the name is a valid file name on every platform.
    file_stem = re.sub(r"[^\w.-]+", "_", website_name.lower()) or "unknown_site"
    filename = os.path.join(output_dir, f"{file_stem}.json")
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(kb_entry, f, ensure_ascii=False, indent=4)

    print(f"✅ Saved JSON knowledge base for {website_name} → {filename}")


In [20]:
# Example run against a single EFSA page, using the default output directory.
efsa_publication_url = "https://www.efsa.europa.eu/en/efsajournal/pub/2980"
scrape_and_save_json(efsa_publication_url)

✅ Saved JSON knowledge base for Efsa.Europa.Eu → knowledge_base/efsa.europa.eu.json
