In [1]:
!pip install selenium beautifulsoup4 pandas tenacity



In [2]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tenacity import retry, stop_after_attempt, wait_fixed


In [3]:
def init_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    return driver


In [4]:
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
def get_page_source(driver, url):
    driver.get(url)
    time.sleep(2)
    return driver.page_source

In [5]:
SECTIONS = [
    "https://www.aljazeera.com/news",
    "https://www.aljazeera.com/middle-east",
    "https://www.aljazeera.com/africa",
    "https://www.aljazeera.com/asia",
    "https://www.aljazeera.com/europe",
    "https://www.aljazeera.com/us-canada",
    "https://www.aljazeera.com/business-economy",
    "https://www.aljazeera.com/sports",
    "https://www.aljazeera.com/features",
    "https://www.aljazeera.com/opinions"
]

In [6]:
def extract_article_links(driver, section_url, max_pages=5):
    article_links = set()

    for page in range(1, max_pages + 1):
        page_url = f"{section_url}?page={page}"
        print(f"Visiting: {page_url}")

        try:
            html = get_page_source(driver, page_url)
        except Exception as e:
            print(f"Failed on {page_url}: {e}")
            continue

        soup = BeautifulSoup(html, 'html.parser')
        cards = soup.select('a.u-clickable-card__link')

        if not cards:
            break

        for tag in cards:
            href = tag.get("href")
            if href and href.startswith("/"):
                full_url = "https://www.aljazeera.com" + href
                article_links.add(full_url)

    return list(article_links)


In [7]:
def extract_article_data(driver, url):
    try:
        html = get_page_source(driver, url)
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return {
            "Title": "N/A",
            "Url": url,
            "Date": "N/A",
            "Description": "N/A"
        }

    soup = BeautifulSoup(html, "html.parser")

    # Extract title
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "N/A"

    # Extract publication date
    date_tag = soup.find("time")
    date = date_tag.get("datetime") if date_tag else "N/A"

    # Extract article content
    body_tag = soup.find("div", class_="wysiwyg")
    if body_tag:
        paragraphs = body_tag.find_all("p")
        description = " ".join(p.get_text(strip=True).replace("\n", " ") for p in paragraphs)
    else:
        description = "N/A"

    return {
        "Title": title.strip(),
        "Url": url.strip(),
        "Date": date.strip(),
        "Description": description.strip()
    }

In [8]:
def save_to_csv(data, filename="aljazeera_articles_fixed.csv"):
    if not data:
        print("No data to save.")
        return

    df = pd.DataFrame(data)

    # Ensure correct column order
    expected_cols = ["Title", "Url", "Date", "Description"]
    for col in expected_cols:
        if col not in df.columns:
            df[col] = "N/A"

    df = df[expected_cols]  # enforce order
    df.to_csv(filename, index=False, encoding="utf-8-sig")
    print(f"✅ Saved {len(df)} articles to {filename}")


In [9]:
def run_scraper():
    driver = init_driver()
    all_links = set()

    try:
        for section in SECTIONS:
            links = extract_article_links(driver, section, max_pages=5)
            print(f"✅ {len(links)} articles from {section}")
            all_links.update(links)

        print(f"🔗 Total unique articles found: {len(all_links)}")

        data = []
        for i, url in enumerate(all_links, 1):
            print(f"[{i}/{len(all_links)}] Scraping {url}")
            article = extract_article_data(driver, url)
            if article:
                data.append(article)

        save_to_csv(data)

    finally:
        driver.quit()

In [10]:
run_scraper()

Visiting: https://www.aljazeera.com/news?page=1
Visiting: https://www.aljazeera.com/news?page=2
Visiting: https://www.aljazeera.com/news?page=3
Visiting: https://www.aljazeera.com/news?page=4
Visiting: https://www.aljazeera.com/news?page=5
✅ 20 articles from https://www.aljazeera.com/news
Visiting: https://www.aljazeera.com/middle-east?page=1
Visiting: https://www.aljazeera.com/middle-east?page=2
Visiting: https://www.aljazeera.com/middle-east?page=3
Visiting: https://www.aljazeera.com/middle-east?page=4
Visiting: https://www.aljazeera.com/middle-east?page=5
✅ 18 articles from https://www.aljazeera.com/middle-east
Visiting: https://www.aljazeera.com/africa?page=1
Visiting: https://www.aljazeera.com/africa?page=2
Visiting: https://www.aljazeera.com/africa?page=3
Visiting: https://www.aljazeera.com/africa?page=4
Visiting: https://www.aljazeera.com/africa?page=5
✅ 19 articles from https://www.aljazeera.com/africa
Visiting: https://www.aljazeera.com/asia?page=1
Visiting: https://www.aljaze

[44/126] Scraping https://www.aljazeera.com/economy/2025/7/17/us-house-sends-crypto-genius-act-to-trump-in-win-for-industry-advocates
[45/126] Scraping https://www.aljazeera.com/news/2025/7/18/how-well-did-trump-and-epstein-really-know-each-other-a-timeline
[46/126] Scraping https://www.aljazeera.com/features/2025/7/15/how-will-the-pkks-disarmament-play-out-in-the-region
[47/126] Scraping https://www.aljazeera.com/news/2025/7/17/trump-diagnosed-with-chronic-venous-insufficiency-after-leg-swelling
[48/126] Scraping https://www.aljazeera.com/news/2025/7/17/dozens-killed-in-pakistan-as-heavy-monsoon-season-persists
[49/126] Scraping https://www.aljazeera.com/sports/2025/7/18/indian-state-blames-cricket-team-for-deadly-stampede
[50/126] Scraping https://www.aljazeera.com/sports/2025/7/17/jabuers-tennis-prompted-by-desire-to-rediscover-joy-of-living
[51/126] Scraping https://www.aljazeera.com/news/liveblog/2025/7/18/live-israel-kills-dozens-more-in-gaza-after-3-slain-in-attack-on-church
[52

[110/126] Scraping https://www.aljazeera.com/video/newsfeed/2025/7/18/funeral-for-gaza-church-attack-after-international-condemnation
[111/126] Scraping https://www.aljazeera.com/opinions/2025/7/18/israels-narrative-cannot-survive-the-truth-so-its-silencing-the-world
[112/126] Scraping https://www.aljazeera.com/news/2025/7/18/un-talks-with-cypriot-leaders-fail-to-reach-deal-on-new-border-crossings
[113/126] Scraping https://www.aljazeera.com/features/2025/7/17/how-selling-parrots-to-a-pakistani-journalist-led-to-a-locked-bank-account
[114/126] Scraping https://www.aljazeera.com/economy/2025/7/17/how-zohran-mamdani-reached-a-multilingual-multicultural-new-york
[115/126] Scraping https://www.aljazeera.com/video/pinch-point/2025/7/14/trumps-push-for-peace-in-the-drc-2
[116/126] Scraping https://www.aljazeera.com/news/2025/7/15/un-rapporteur-demands-global-action-to-stop-israels-genocide-in-gaza
[117/126] Scraping https://www.aljazeera.com/news/2025/7/17/slovenia-bars-far-right-israeli-cab