In [None]:
# Load the ISOT "True.csv" file into a DataFrame and display its
# (rows, columns) shape as the cell output.
df1 = pd.read_csv("../data/raw/ISOTFakeNewsDataset/True.csv")
df1.shape

In [None]:
# Load the ISOT "Fake.csv" file into a DataFrame and display its
# (rows, columns) shape as the cell output.
df2 = pd.read_csv("../data/raw/ISOTFakeNewsDataset/Fake.csv")
df2.shape

In [None]:
import concurrent.futures
import os
import time

import pandas as pd
import requests
from newspaper import Article

# === CONFIG ===
# Source CSV; its "news_url" column supplies the URLs to scrape.
INPUT_FILE = "../data/raw/FakeNewsNet/gossipcop_fake.csv"
# Destination CSV for the successfully scraped articles.
OUTPUT_FILE = "../data/processed/gossipcop_sample.csv"
# Upper bound on how many non-null URLs are taken from the input file.
NUM_ARTICLES = 500
# Number of threads in the pool that fetches articles concurrently.
MAX_WORKERS = 16   # tune based on CPU / network
TIMEOUT = 10    # max seconds to wait per article

# === Helper to fix URLs ===
def fix_url(url):
    """Return *url* as a string that carries an explicit HTTP(S) scheme.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace. URLs that already start with ``http://`` or ``https://``
    (in any letter case) are returned unchanged; protocol-relative URLs
    (``//host/path``) get ``https:`` prepended; everything else gets
    ``https://`` prepended.

    Args:
        url: The raw URL value (any object; it is stringified first).

    Returns:
        str: The normalized URL.
    """
    url = str(url).strip()
    # Protocol-relative URLs already contain the "//" separator, so only the
    # scheme name itself is missing.
    if url.startswith("//"):
        return "https:" + url
    # URL schemes are case-insensitive, so compare on a lowered copy to avoid
    # producing "https://HTTP://...".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url


# === Load dataset ===
# Read the input CSV, then keep the first NUM_ARTICLES non-null entries of
# its "news_url" column, each normalized through fix_url.
df = pd.read_csv(INPUT_FILE)
urls = (
    df["news_url"]
    .dropna()
    .head(NUM_ARTICLES)
    .map(fix_url)
)


def fetch_article(idx, url, timeout=None):
    """Download and parse one article, returning its content or ``None``.

    Args:
        idx: Caller-supplied identifier, stored under ``"id"`` in the result.
        url: URL of the article to fetch.
        timeout: Timeout in seconds applied to both the HEAD probe and the
            article download. ``None`` (the default) means use the
            module-level ``TIMEOUT`` setting.

    Returns:
        A dict with keys ``"id"``, ``"url"``, ``"title"`` and ``"text"``, or
        ``None`` when the HEAD probe answers with a status >= 400, the parsed
        text is empty, or any exception is raised along the way.
    """
    if timeout is None:
        # Resolved at call time so the configured TIMEOUT actually governs
        # the network calls instead of a hard-coded value.
        timeout = TIMEOUT
    try:
        # quick HEAD check so we don't waste time on dead links
        response = requests.head(url, timeout=timeout)
        if response.status_code >= 400:
            return None

        # Extra keyword arguments to Article override its configuration;
        # request_timeout bounds the download request.
        article = Article(url, request_timeout=timeout)
        article.download()
        article.parse()
        text = article.text.strip()
        if not text:
            return None

        return {"id": idx, "url": url, "title": article.title, "text": text}
    except Exception:
        # Deliberate best-effort: any network or parse failure just means
        # this article is skipped.
        return None


# Fetch all URLs concurrently, collect the successful results, and write
# them to OUTPUT_FILE together with a short timing summary.
results = []
# perf_counter is monotonic, so the elapsed time is not affected by system
# clock adjustments during a long run.
start = time.perf_counter()

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map each future back to its (id, url) pair for progress reporting.
    futures = {
        executor.submit(fetch_article, idx, url): (idx, url)
        for idx, url in enumerate(urls, start=1)
    }
    for future in concurrent.futures.as_completed(futures):
        idx, url = futures[future]
        try:
            # as_completed only yields futures that have already finished,
            # so result() returns immediately; passing a timeout here would
            # never have any effect.
            result = future.result()
        except Exception as e:
            print(f"❌ [{idx}] Error: {url} ({e})")
            continue
        if result:
            results.append(result)
            print(f"✅ [{idx}] Success: {url}")
        else:
            print(f"⚠️ [{idx}] Empty/failed: {url}")

end = time.perf_counter()

# === Save results ===
# Futures complete in arbitrary order; sort by id so the output file is
# reproducible between runs.
results.sort(key=lambda item: item["id"])
# Explicit columns keep the CSV header present even when nothing was scraped.
out_df = pd.DataFrame(results, columns=["id", "url", "title", "text"])
# Create the destination directory if needed so a long scrape is not lost
# to a missing folder at the very end.
out_dir = os.path.dirname(OUTPUT_FILE)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
out_df.to_csv(OUTPUT_FILE, index=False)

# Report against the number of URLs actually attempted, which can be smaller
# than NUM_ARTICLES when the input has fewer non-null URLs.
print(f"\n✅ Done! Scraped {len(out_df)} / {len(urls)} successfully.")
print(f"Saved to {OUTPUT_FILE}")
print(f"⏱️ Took {end - start:.2f} seconds total")


With 8 workers and a 5-second timeout: 266 / 500 articles processed successfully in 169.22 seconds.
* Predicted: at this rate, about 24,000 article entries out of the 45,000 total from ISOT in around 4 hours.

With 4 workers and a 5-second timeout: 267 / 500 articles processed successfully in 241.78 seconds.

With 16 workers and a 10-second timeout: 265 / 500 articles processed successfully in 139.10 seconds.

It seems the timeout isn't making a big difference within the 5–10 second range, but adding workers does reduce the total run time.