In [2]:
!pip install newspaper3k pandas requests beautifulsoup4 pdfplumber lxml
!pip install lxml_html_clean

Collecting newspaper3k
  Using cached newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting pdfplumber
  Using cached pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Using cached feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Using cached feedfinder2-0.0.4-py3-none-any.whl
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Using cached jieba3k-0.35.1-py3-none-any.whl
Collecting tinysegmenter==0.3 (from newspaper3k)
  Using cached tinysegmenter-0.3-py3-none-any.whl
Collecting pdfminer.six==20251230 (from pdfplumber)
  Using cached pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Using cached pypdfium2-5.3.0-py3-none-win_amd64.whl.metadata (67 kB)
Collecting sgmllib3k (from feedparser>=5.2.1->newspaper3k)
  Using cached sgmllib3k-1.0.0-py3-none-any.whl
Using cached newspaper3k-0.2.8-py3-none-any.whl (211 kB)
Using ca

In [3]:
import datetime
import logging
import re
import uuid
from io import BytesIO
from urllib.parse import urljoin, urlparse

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
from newspaper import Article

# Seconds to wait on each HTTP request issued through `requests`.
_REQUEST_TIMEOUT_SECONDS = 10

# Links whose path or query contains one of these words are treated as site boilerplate.
_NON_ARTICLE_LINK_RE = re.compile(r"(login|signup|about|contact|privacy|terms)", re.I)


def _collect_article_urls(soup, listing_url, same_domain_only):
    """Return the sorted, de-duplicated http(s) URLs linked from the listing page."""
    base_domain = urlparse(listing_url).netloc
    found = set()

    for anchor in soup.find_all("a", href=True):
        try:
            parsed = urlparse(urljoin(listing_url, anchor["href"]))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip it instead of aborting the run
            continue

        if parsed.scheme not in ("http", "https"):
            continue
        if same_domain_only and parsed.netloc != base_domain:
            continue
        # Test only path + query so that a domain such as "aboutnews.example.com"
        # does not cause every single link on the site to be excluded.
        if _NON_ARTICLE_LINK_RE.search(f"{parsed.path}?{parsed.query}"):
            continue

        # The fragment is never sent to the server, so "/a" and "/a#top" are the
        # same document; dropping it avoids downloading and storing it twice.
        found.add(parsed._replace(fragment="").geturl())

    # Sorted so that the row order of the CSV is reproducible between runs
    return sorted(found)


def _is_pdf_url(url):
    """Tell whether the URL path ends in ".pdf", ignoring any query string."""
    return urlparse(url).path.lower().endswith(".pdf")


def _extract_pdf_text(pdf_bytes):
    """Concatenate the extractable text of every page of a PDF given as bytes."""
    with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Joined inside the `with` block because the pages are read lazily
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)


def _classify_source(url):
    """Return "Government" when the hostname has a "gov" label, otherwise "News"."""
    hostname = urlparse(url).hostname or ""
    # Inspect hostname labels only: a plain substring test on the whole URL also
    # matches hosts like "my.government.com" and paths like "/x.gov-review".
    return "Government" if "gov" in hostname.split(".") else "News"


def _fetch_text_and_timestamp(url):
    """Download one document and return its raw text and timestamp (may be None)."""
    if _is_pdf_url(url):
        pdf_response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        pdf_response.raise_for_status()
        # No publish date is read from the PDF, so the download time is used
        return _extract_pdf_text(pdf_response.content), datetime.datetime.now()

    article = Article(url)
    article.download()
    article.parse()
    return article.text, article.publish_date


def preprocess_articles_to_csv(listing_url, csv_filename="articles.csv", same_domain_only=True, min_text_length=200):
    """Scrape the documents linked from a listing page and save their text to a CSV.

    The listing page is fetched, its ``<a href>`` links are resolved against
    ``listing_url`` and filtered, and each remaining link is downloaded either
    as a PDF (via pdfplumber) or as an article (via newspaper's ``Article``).

    The function is best-effort: a link that cannot be downloaded or parsed is
    logged as a warning and skipped rather than raising.

    Args:
        listing_url: URL of the page whose links should be harvested.
        csv_filename: Path of the CSV file to write.
        same_domain_only: When true, keep only links whose network location
            equals that of ``listing_url``.
        min_text_length: Minimum length (after stripping whitespace) a
            document's text must have to be kept.

    Returns:
        ``csv_filename`` when at least one record was written, otherwise
        ``None`` (listing page unreachable, no usable links, or no document
        with enough text).

    The CSV has the columns ``id``, ``text``, ``timestamp``, ``source_type``,
    ``domain`` and ``label_truth`` (always empty).
    """
    logger = logging.getLogger(__name__)

    try:
        response = requests.get(listing_url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except Exception as exc:
        # Callers still receive None, but the reason is no longer hidden
        logger.warning("Could not fetch listing page %s: %s", listing_url, exc)
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    urls = _collect_article_urls(soup, listing_url, same_domain_only)
    if not urls:
        logger.warning("No candidate links found on %s", listing_url)
        return None

    records = []
    for url in urls:
        try:
            text, timestamp = _fetch_text_and_timestamp(url)
            text = text.strip()
        except Exception as exc:
            # Deliberate best-effort: one bad document must not abort the batch
            logger.warning("Skipping %s: %s", url, exc)
            continue

        if not text or len(text) < min_text_length:
            continue

        records.append({
            "id": str(uuid.uuid4()),
            "text": text,
            "timestamp": timestamp or datetime.datetime.now(),
            "source_type": _classify_source(url),
            "domain": urlparse(url).netloc,
            "label_truth": None
        })

    if not records:
        return None

    pd.DataFrame(records).to_csv(csv_filename, index=False)
    return csv_filename
