<a href="https://colab.research.google.com/github/softcruder/Undergraduate-Project-Work/blob/master/scrapper_agent_(v4_1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


### Mount Google Drive for persistent storage

In [None]:
# Mount Google Drive at /content/drive for persistent storage
# (the crawl-state cache used later in this notebook lives under this mount).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Prelim
Harvests fact‐checks from Dubawa, Africa Check-NG and The FactCheckHub.

Outputs
1. JSONL and
2. CSV with fields: *id, claim_text, verdict, claim_language, claim_links, claim_platforms, metadata, source_url, platform*

**© 2025 research use – respect robots.txt**

In [None]:
!pip install lingua-language-detector langdetect

Collecting lingua-language-detector
  Downloading lingua_language_detector-2.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading lingua_language_detector-2.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (96.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=a24ebf9e2d3cb552990c7920df8ef1da77b959b7bbf4584d257ecb64f881496c
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1

In [None]:
# --- Imports and Setup ---
import csv, json, re, time, uuid, itertools, os, tempfile, shutil, pathlib
import datetime # Import the whole datetime module
from pathlib import Path
from urllib.parse import unquote, urlparse, urljoin
from urllib.robotparser import RobotFileParser
import requests
from bs4 import BeautifulSoup

# Import from both language libraries
from lingua import LanguageDetectorBuilder, Language
from langdetect import detect, DetectorFactory

# Ensure consistent results from langdetect
DetectorFactory.seed = 0

Build the language `detector`, define the HTTP request headers, and set up the persistent cache path on Google Drive.

In [None]:
# --- Build the Primary, High-Accuracy Detector (lingua-py) ---
# The detector is restricted to English and Yoruba so it acts as a specialist;
# safe_detect() falls back to langdetect when it is not confident.
detector = (
    LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.YORUBA
    )
    .with_preloaded_language_models()
    .build()
)

print("Primary detector (lingua-py) for English and Yoruba is ready.")
print("Fallback detector (langdetect) for other languages is also ready.")

# Browser-like request headers sent with every page fetch in get_soup().
# NOTE(review): allowed() evaluates robots.txt as "FactCrawler" by default while
# page requests identify themselves with this browser string - confirm this is intended.
HEADERS = {
    "User-Agent": ("Mozilla/5.0 (X11; Linux x86_64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/122.0 Safari/537.36")
}

# --- Define a persistent path in Google Drive ---
# Creates (if missing) a folder named "FactCheckScraper-v4.1" in the main Drive folder.
PERSISTENT_PATH = pathlib.Path("/content/drive/MyDrive/FactCheckScraper-v4.1")
PERSISTENT_PATH.mkdir(parents=True, exist_ok=True)

# Crawl-state cache file on Google Drive: read by load_cache(), written by
# save_cache() through _atomic_write().
CACHE_PATH = PERSISTENT_PATH / "crawl_state.json"


def _atomic_write(data: dict, path: pathlib.Path = None):
    """
    Serialise ``data`` as JSON and atomically replace ``path`` with the result.

    The JSON is first written to a temporary file in the same directory as
    ``path`` (so the final rename stays on one filesystem), flushed and fsynced,
    and only then moved over the target with ``os.replace``. A failure part-way
    through therefore never leaves a truncated cache file behind.

    Args:
        data: JSON-serialisable object to persist.
        path: Destination file. Defaults to the module-level ``CACHE_PATH``,
            resolved at call time.

    Raises:
        TypeError / ValueError: if ``data`` cannot be serialised.
        OSError: if the temporary file cannot be written or renamed.
    """
    if path is None:
        path = CACHE_PATH
    path = pathlib.Path(path)

    tmp_name = None
    try:
        # Explicit UTF-8: the payload is written with ensure_ascii=False and
        # load_cache() reads the file back as UTF-8.
        with tempfile.NamedTemporaryFile(
            "w", delete=False, dir=path.parent, encoding="utf-8"
        ) as tmp:
            tmp_name = tmp.name
            json.dump(data, tmp, indent=2, ensure_ascii=False)
            tmp.flush()
            os.fsync(tmp.fileno())
        # The handle is closed here, on success and on failure alike.
        os.replace(tmp_name, path)
    finally:
        # After a successful replace the temp name no longer exists; this only
        # cleans up when something failed before the rename.
        if tmp_name is not None and os.path.exists(tmp_name):
            os.unlink(tmp_name)

Primary detector (lingua-py) for English and Yoruba is ready.
Fallback detector (langdetect) for other languages is also ready.


In [None]:
# Per-host robots.txt cache. Keys are "scheme://host" strings; each value holds
# the fetched robots.txt text ('content'), a RobotFileParser fed with that text
# ('parser', or None), the most recent per-URL verdict ('allowed'), the fetch
# time ('timestamp') and the crawl delay in seconds ('delay').
robots_cache = {}

def allowed(url: str, ua="FactCrawler") -> bool:
    """
    Decide whether ``url`` may be fetched according to its host's robots.txt.

    robots.txt is downloaded at most once per host per hour and its text is
    cached in ``robots_cache``. The rules are then evaluated for *every* URL,
    so an allowed page on a host no longer whitelists (and a blocked page no
    longer blacklists) all other paths on that host until the cache expires.

    The verdict comes from ``_manual_robots_check``; ``RobotFileParser`` is fed
    the same text and only used to log disagreements. Any failure to fetch or
    parse robots.txt fails open (returns True).

    Args:
        url: Absolute URL that is about to be requested.
        ua: User-agent token whose robots.txt group should be applied.

    Returns:
        True when fetching ``url`` is permitted, False otherwise.
    """
    # Extract "scheme://host"; crawl_delay_for() builds its cache key the same way.
    host = '/'.join(url.split('/')[:3])
    robots_url = host + "/robots.txt"

    cache_entry = robots_cache.get(host)
    current_time = time.time()

    needs_refresh = (
        cache_entry is None
        or 'content' not in cache_entry  # entry written by an older version of this cell
        or current_time - cache_entry.get('timestamp', 0) > 3600
    )

    if needs_refresh:
        print(f"[ROBOTS] Fetching robots.txt for {host}")

        # Single download; an unreachable or missing robots.txt means "no rules".
        robots_text = ''
        try:
            response = requests.get(robots_url, timeout=10, headers={'User-Agent': ua})
            if response.status_code == 200:
                robots_text = response.text
        except Exception as e:
            print(f"[ROBOTS] Manual robots check failed: {e}")

        # Feed the same text to the stdlib parser instead of fetching it again.
        parser = None
        try:
            candidate = RobotFileParser()
            candidate.set_url(robots_url)
            candidate.parse(robots_text.splitlines())
            parser = candidate
        except Exception as e:
            print(f"[ROBOTS] RobotFileParser failed: {e}")

        cache_entry = {
            'content': robots_text,
            'parser': parser,
            'allowed': True,
            'timestamp': current_time,
            'delay': _extract_crawl_delay(robots_text, ua),
        }
        robots_cache[host] = cache_entry
    else:
        print(f"[ROBOTS] Using cached robots.txt for {host}")

    # Evaluate the cached rules for this specific URL.
    try:
        manual_result = _manual_robots_check(cache_entry['content'], url, ua)
        print(f"[ROBOTS] Manual parsing result: {manual_result}")
    except Exception as e:
        print(f"[ROBOTS] Manual robots check failed: {e}")
        manual_result = True  # Fail open

    parser = cache_entry.get('parser')
    if parser is not None:
        try:
            robotparser_result = parser.can_fetch(ua, url)
            print(f"[ROBOTS] RobotFileParser result: {robotparser_result}")
            if robotparser_result is False and manual_result is True:
                print(f"[ROBOTS] RobotFileParser seems buggy, using manual result")
        except Exception as e:
            print(f"[ROBOTS] RobotFileParser failed: {e}")

    # Kept for anything that still reads the key; it is only the latest verdict.
    cache_entry['allowed'] = manual_result
    return manual_result

def _robots_rule_matches(pattern: str, path: str) -> bool:
    """Return True when a robots.txt path pattern matches ``path`` ('*' wildcard, trailing '$' end anchor)."""
    anchored = pattern.endswith('$')
    if anchored:
        pattern = pattern[:-1]
    # Everything is literal except '*', which stands for any run of characters.
    regex = '.*'.join(re.escape(piece) for piece in pattern.split('*'))
    if anchored:
        regex += r'\Z'
    return re.match(regex, path, re.DOTALL) is not None


def _manual_robots_check(robots_content: str, url: str, ua: str) -> bool:
    """
    Manual robots.txt evaluation, used instead of trusting urllib.robotparser.

    Parsing rules:
      * text after '#' on a line is a comment and is ignored;
      * consecutive ``User-agent`` lines form one group sharing the rules below them;
      * a group naming ``ua`` exactly (case-insensitive) replaces the ``*`` group;
        the ``*`` group is used only when no such group exists;
      * ``*`` in a path matches any characters and a trailing ``$`` anchors the end;
      * the longest matching pattern wins, and Allow wins a tie;
      * an empty ``Disallow:``/``Allow:`` value imposes no restriction.

    Args:
        robots_content: Raw text of robots.txt ('' or None means "no rules").
        url: Absolute URL being checked; everything after the host is matched.
        ua: User-agent token to evaluate the rules for.

    Returns:
        True if the URL may be fetched, False if a Disallow rule wins.
    """
    ua_lower = ua.lower()
    url_path = '/' + '/'.join(url.split('/')[3:])  # Path (and query) part of the URL

    # groups: list of (agent names, [(rule kind, path pattern), ...])
    groups = []
    current_agents, current_rules = [], []
    reading_agents = False  # True while inside a run of User-agent lines

    for raw_line in (robots_content or '').lstrip('\ufeff').splitlines():
        line = raw_line.split('#', 1)[0].strip()
        if ':' not in line:
            continue

        field, value = line.split(':', 1)
        field = field.strip().lower()
        value = value.strip()

        if field == 'user-agent':
            if not reading_agents:
                # First User-agent line after some rules: start a new group.
                current_agents, current_rules = [], []
                groups.append((current_agents, current_rules))
                reading_agents = True
            current_agents.append(value.lower())
        elif field in ('allow', 'disallow'):
            reading_agents = False
            if groups and value:
                current_rules.append((field, value))
        elif field == 'crawl-delay':
            # Belongs to the current group, so it also ends the User-agent run.
            reading_agents = False

    if any(ua_lower in agents for agents, _ in groups):
        selected = [rule for agents, rules in groups if ua_lower in agents for rule in rules]
    else:
        selected = [rule for agents, rules in groups if '*' in agents for rule in rules]

    # Most specific (longest) pattern wins; (length, is_allow) makes Allow win ties.
    best = None
    for rule_type, pattern in selected:
        if _robots_rule_matches(pattern, url_path):
            candidate = (len(pattern), rule_type == 'allow')
            if best is None or candidate > best:
                best = candidate

    # Default: allow if no rule matched
    return True if best is None else best[1]

def _extract_crawl_delay(robots_content: str, ua: str) -> float:
    """
    Extract the Crawl-delay (in seconds) that applies to ``ua`` from robots.txt text.

    Consecutive ``User-agent`` lines are treated as one group, text after '#'
    is ignored, and a delay declared for ``ua`` itself takes precedence over one
    declared for ``*`` regardless of their order in the file. Values that are
    not numbers, are negative, or are not finite are skipped.

    Args:
        robots_content: Raw robots.txt text ('' or None when unavailable).
        ua: User-agent token to look up (matched case-insensitively).

    Returns:
        The applicable delay, or 10.0 when none is declared.
    """
    default_delay = 10.0
    ua_lower = ua.lower()

    specific_delay = None
    wildcard_delay = None
    current_agents = []
    reading_agents = False  # True while inside a run of User-agent lines

    for raw_line in (robots_content or '').lstrip('\ufeff').splitlines():
        line = raw_line.split('#', 1)[0].strip()
        if ':' not in line:
            continue

        field, value = line.split(':', 1)
        field = field.strip().lower()
        value = value.strip()

        if field == 'user-agent':
            if not reading_agents:
                current_agents = []
                reading_agents = True
            current_agents.append(value.lower())
            continue

        if field in ('allow', 'disallow', 'crawl-delay'):
            reading_agents = False
        if field != 'crawl-delay':
            continue

        try:
            delay = float(value)
        except ValueError:
            continue
        # Rejects negatives, NaN and infinity, which would break time.sleep().
        if not (0 <= delay < float('inf')):
            continue

        if ua_lower in current_agents:
            if specific_delay is None:
                specific_delay = delay
        elif '*' in current_agents:
            if wildcard_delay is None:
                wildcard_delay = delay

    if specific_delay is not None:
        return specific_delay
    if wildcard_delay is not None:
        return wildcard_delay
    return default_delay  # Default delay

def crawl_delay_for(url: str) -> float:
    """Look up the cached crawl delay (seconds) for the host of ``url``; 10.0 if none is cached."""
    scheme_and_host = url.split('/')[:3]
    entry = robots_cache.get('/'.join(scheme_and_host), {})
    if 'delay' in entry:
        return entry['delay']
    return 10.0


### Utilities
Shared utility functions used throughout this notebook.

#### General

In [None]:
def load_cache() -> dict:
    """Return the crawl state stored at CACHE_PATH, or a fresh empty state if the file does not exist."""
    if CACHE_PATH.exists():
        with open(CACHE_PATH, "r", encoding="utf-8") as cache_file:
            return json.load(cache_file)
    return {"crawled": [], "failed": {}, "iter_state": {}, "lrl_iter_state": {}, "updated_at": None}

def save_cache(cache: dict):
    """
    Stamp ``cache`` with the current UTC time and persist it via ``_atomic_write``.

    ``cache["updated_at"]`` is overwritten in place with an ISO-8601 timestamp.
    ``datetime.timezone.utc`` is used instead of the ``datetime.UTC`` alias,
    which only exists on Python 3.11+; both denote the same timezone object.
    """
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    cache["updated_at"] = now_utc.isoformat()
    _atomic_write(cache)

def get_soup(url: str, max_retries: int = 3, backoff_factor: float = 2.0) -> BeautifulSoup:
    """
    Fetch ``url`` politely and return its HTML parsed with BeautifulSoup.

    Steps:
      1. Refuse the request when ``allowed(url)`` says robots.txt blocks it.
      2. Sleep for the host's crawl delay (see the note on delays above 10s below).
      3. GET the page, retrying on HTTP 429 and 503 with exponential back-off.

    Args:
        url: Absolute URL of the page to download.
        max_retries: Maximum number of GET attempts.
        backoff_factor: Base of the exponential back-off between attempts.

    Returns:
        The parsed document.

    Raises:
        requests.exceptions.RequestException: when robots.txt blocks the URL,
            the request fails, or every attempt is exhausted (HTTP errors are
            raised as the ``HTTPError`` subclass).
    """
    # Check robots.txt first
    if not allowed(url):
        print(f"[SKIP] Blocked by robots.txt: {url}")
        raise requests.exceptions.RequestException(f"Blocked by robots.txt: {url}")

    # Rate limiting with appropriate crawl-delay
    original_delay = crawl_delay_for(url)
    if original_delay > 10:
        # NOTE(review): delays above 10s are quartered, so a host asking for 12s
        # waits 3s while a host asking for 10s waits the full 10s - confirm intended.
        delay = original_delay / 4
        print(f"[DELAY] Original delay ({original_delay:.1f}s) is over 10s. Waiting for delay: {delay:.1f}s")
    else:
        delay = original_delay
        print(f"[DELAY] Original delay is {original_delay:.1f}s. Waiting for full duration.")
    # time.sleep() raises on negative values; never let a bad delay abort the fetch.
    delay = max(delay, 0.0)
    time.sleep(delay)

    # With a crawl delay of 0 the 429 back-off would otherwise be 0 seconds,
    # i.e. an immediate retry against a server that just asked us to slow down.
    rate_limit_base = max(delay, 1.0)

    print(f"Attempting to fetch: {url}")
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            res = requests.get(url, headers=HEADERS, timeout=15)
            if res.status_code == 429 and not is_last_attempt:  # Too Many Requests
                wait_time = (backoff_factor ** attempt) * rate_limit_base
                print(f"[WARN] Rate limited for {url}. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
                continue
            res.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            print(f"Successfully fetched: {url}")
            print(f"Response length: {len(res.text)}")
            return BeautifulSoup(res.text, "html.parser")
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 503 and not is_last_attempt:
                wait_time = backoff_factor ** attempt
                print(f"[WARN] Received 503 for {url}. Attempt {attempt + 1}/{max_retries}. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                print(f"[ERROR] Failed to fetch {url} after retries: {e}")
                raise
        except requests.exceptions.RequestException as e:
            # Any other requests-related error (connection errors, timeouts) is not retried.
            print(f"[ERROR] Failed to fetch {url}: {e}")
            raise
    # Reached only when max_retries <= 0 (the loop never ran).
    raise requests.exceptions.RequestException(f"Failed to fetch {url} after {max_retries} attempts.")


def safe_detect(text: str, hint: str) -> str:
    """
    Detect the language of ``text`` using a hybrid approach with a confidence threshold.

    1. Text shorter than 15 characters (after stripping) returns "unknown".
    2. The EN/YO ``detector`` (lingua-py) is asked for confidence values; if its
       top candidate scores >= 0.85 that language's ISO 639-1 code is returned.
    3. Otherwise, or if lingua-py raises, ``langdetect.detect`` is used.
    4. If langdetect also fails, ``hint`` (or "" when it is empty) is returned.

    The top confidence entry is read through its ``language`` / ``value``
    attributes when present and unpacked as a ``(language, value)`` pair
    otherwise, so both object-style and tuple-style results are handled instead
    of silently falling through to the hint.

    Args:
        text: Text whose language should be detected.
        hint: Language code to fall back to when detection fails.

    Returns:
        A lower-case language code, "unknown", or the hint / "".
    """
    if not text or len(text.strip()) < 15:
        return "unknown"

    # Set a high bar for confidence to avoid misclassification of HA/IG as YO.
    CONFIDENCE_THRESHOLD = 0.85

    try:
        confidence_values = detector.compute_language_confidence_values(text)
        if confidence_values:
            top = confidence_values[0]
            if hasattr(top, "language") and hasattr(top, "value"):
                top_language, confidence = top.language, top.value
            else:
                top_language, confidence = top

            if confidence >= CONFIDENCE_THRESHOLD:
                # If confidence is high, we trust lingua-py's specialized result.
                return top_language.iso_code_639_1.name.lower()
    except Exception as exc:
        # Do not let a lingua-py problem skip the fallback below.
        print(f"[LANG] lingua detection failed, falling back to langdetect: {exc}")

    # Fallback to langdetect if lingua-py is not confident or fails.
    # NOTE(review): confirm that langdetect's supported-language list actually
    # covers Hausa and Igbo; if it does not, this fallback cannot return them.
    try:
        return detect(text)
    except Exception:
        # Both libraries failed
        return hint or ""


def is_social(href: str) -> bool:
    """
    Tell whether ``href`` points at a specific social media post or claim.

    Archive links are unwrapped first: when ``extract_original_from_archive``
    yields something for the (stripped) href, that result is what gets checked;
    otherwise the stripped href itself is checked.

    True for:
    - direct social media post links
    - archived copies of social media posts (Wayback Machine, Archive.today, etc.)

    False for:
    - empty input, profile pages, platform homepages and non-social links
    """
    if not href:
        return False

    cleaned = href.strip()
    # Falls back to the link itself when it is not an archive link.
    target = extract_original_from_archive(cleaned) or cleaned
    return is_direct_social_link(target)


def extract_original_from_archive(href: str) -> str:
    """
    Extract the original URL from various web archive services.

    Returns:
        * the (URL-decoded) original URL for Wayback Machine and Google Cache links;
        * the marker string "ARCHIVE_LINK_DETECTED" for archive services whose
          links do not embed the original URL (archive.today family, Perma.cc,
          archive.org items that look social, and a few other services);
        * "" when ``href`` is empty or not an archive link.
    """
    if not href:
        return ""

    href_lower = href.lower()

    # Internet Archive (Wayback Machine)
    # Format: https://web.archive.org/web/TIMESTAMP/ORIGINAL_URL
    # The timestamp may carry a two-letter modifier such as "if_" or "id_".
    wayback_match = re.search(
        r'web\.archive\.org/web/\d+(?:[a-z]{2}_)?/(.*)', href, re.IGNORECASE
    )
    if wayback_match:
        # Handle cases where the original URL might be URL-encoded
        return unquote(wayback_match.group(1))

    # Archive.today/Archive.is (multiple domains)
    # Format: https://archive.today/ARCHIVE_ID or https://archive.is/ARCHIVE_ID
    # The original URL is not part of the link, so only the marker can be returned.
    if re.search(r'archive\.(today|is|fo|ph|md|vn)/([A-Za-z0-9]+)', href_lower):
        return "ARCHIVE_LINK_DETECTED"

    # Perma.cc
    # Format: https://perma.cc/ARCHIVE_ID
    # Matched on the lower-cased href, so the character class must be lower-case too.
    if re.search(r'perma\.cc/([a-z0-9-]+)', href_lower):
        return "ARCHIVE_LINK_DETECTED"

    # Google Cache
    # Format: https://webcache.googleusercontent.com/search?q=cache:ORIGINAL_URL
    cache_match = re.search(
        r'webcache\.googleusercontent\.com/search\?q=cache:(.*)', href, re.IGNORECASE
    )
    if cache_match:
        return unquote(cache_match.group(1))

    # Archive.org direct items
    # Format: https://archive.org/details/ITEM_ID
    archive_org_match = re.search(r'archive\.org/details/([^/]+)', href_lower)
    if archive_org_match:
        item_id = archive_org_match.group(1)
        # Only treat the item as relevant if its ID suggests social media content
        if any(platform in item_id for platform in ['twitter', 'facebook', 'instagram', 'tiktok', 'youtube']):
            return "ARCHIVE_LINK_DETECTED"

    # Other archive services (add more as needed)
    other_archives = [
        r'freezepage\.com',
        r'archive\.ph',
        r'arquivo\.pt',
        r'webarchive\.org\.uk',
        r'timetravel\.mementoweb\.org'
    ]
    if any(re.search(archive_pattern, href_lower) for archive_pattern in other_archives):
        return "ARCHIVE_LINK_DETECTED"

    return ""  # Not an archive link


def _mentions_social_domain(href: str, *domains: str) -> bool:
    """True when one of ``domains`` occurs in ``href`` as a whole domain name, not as the tail of a longer one."""
    return any(
        re.search(r'(?<![a-z0-9-])' + re.escape(domain) + r'(?![a-z0-9.-])', href)
        for domain in domains
    )


def is_direct_social_link(href: str) -> bool:
    """
    Check if a URL is a direct link to a specific social media post.

    Domains are matched as whole names, so "x.com" no longer matches
    "netflix.com"; sub-domains ("m.facebook.com") and URLs embedded inside
    another URL still match. Profile pages and platform homepages return False.

    Args:
        href: URL to test, or the marker "ARCHIVE_LINK_DETECTED" produced by
            ``extract_original_from_archive``.

    Returns:
        True for the archive marker and for links that carry a platform-specific
        post / video identifier; False otherwise (including empty input).
    """
    if not href:
        return False

    # Handle special archive detection marker
    if href == "ARCHIVE_LINK_DETECTED":
        return True  # detected it's an archive of social content

    href = href.lower().strip()

    # Check if it's even a social media domain
    social_domains = ("facebook.com", "twitter.com", "x.com", "tiktok.com",
                      "instagram.com", "youtube.com", "whatsapp.com", "threads.net",
                      "linkedin.com", "youtu.be", "fb.com", "wa.me")
    if not _mentions_social_domain(href, *social_domains):
        return False

    # Twitter/X: Must have "/status/" + tweet ID
    if _mentions_social_domain(href, "twitter.com", "x.com"):
        tweet_match = re.search(r'/status/(\d+)', href)
        if tweet_match:
            return len(tweet_match.group(1)) >= 15  # Valid tweet IDs are 15+ digits
        return False  # Twitter link without status = profile page

    # Instagram: "/p/", "/reel(s)/" or "/tv/" + post ID, with or without a trailing slash
    if _mentions_social_domain(href, "instagram.com"):
        return bool(re.search(r'/(?:p|reels?|tv)/[a-z0-9_-]+(?:[/?#]|$)', href))

    # YouTube: Must have a video ID
    if _mentions_social_domain(href, "youtube.com", "youtu.be"):
        video_patterns = [
            r'[?&]v=([a-z0-9_-]+)',      # ?v=VIDEO_ID or &v=VIDEO_ID
            r'youtu\.be/([a-z0-9_-]+)',  # youtu.be/VIDEO_ID
            r'/embed/([a-z0-9_-]+)',     # /embed/VIDEO_ID
            r'/shorts/([a-z0-9_-]+)',    # /shorts/VIDEO_ID
            r'/live/([a-z0-9_-]+)',      # /live/VIDEO_ID
        ]
        for pattern in video_patterns:
            match = re.search(pattern, href)
            if match:
                return len(match.group(1)) >= 10  # Valid YouTube video IDs are 11 chars typically
        return False  # YouTube link without video ID = channel/profile page

    # Facebook: Must have specific post indicators
    if _mentions_social_domain(href, "facebook.com", "fb.com"):
        post_patterns = [
            r'/posts/(?:\d+|pfbid[a-z0-9]+)',  # /posts/123456789 or /posts/pfbid...
            r'/permalink\.php\?story_fbid=',   # Permalink format
            r'/photo\.php\?fbid=',             # Photo posts
            r'/videos/\d+',                    # Video posts
            r'/story\.php\?story_fbid=',       # Story format
            r'/watch/?\?v=\d+',                # /watch/?v=123456789
            r'/reel/\d+',                      # Reels
        ]
        return any(re.search(pattern, href) for pattern in post_patterns)

    # TikTok: Must have specific video patterns
    if _mentions_social_domain(href, "tiktok.com"):
        video_patterns = [
            r'/@[^/]+/video/(\d+)',  # /@username/video/1234567890
            r'/video/(\d+)',         # /video/1234567890
        ]
        for pattern in video_patterns:
            match = re.search(pattern, href)
            if match:
                return len(match.group(1)) >= 15  # TikTok video IDs are typically 19 digits
        return False  # TikTok link without video ID = profile page

    # LinkedIn: Must have specific post/activity patterns
    if _mentions_social_domain(href, "linkedin.com"):
        post_patterns = [
            r'/posts/[^/]+-activity-\d+',         # /posts/username_activity-123456789
            r'/feed/update/urn:li:activity:\d+',  # Feed update format
            r'/pulse/[^/]+-[^/]+',                # Pulse articles
        ]
        return any(re.search(pattern, href) for pattern in post_patterns)

    # Threads: Must have specific post pattern
    if _mentions_social_domain(href, "threads.net"):
        return bool(re.search(r'/@[^/]+/post/[a-z0-9_-]+', href))

    # WhatsApp: Must have sharing parameters (status/message links)
    if _mentions_social_domain(href, "wa.me", "whatsapp.com"):
        return bool(re.search(r'(text=|phone=)', href))

    # If we get here, it's a social domain but not a specific content link
    return False


def get_archive_info(href: str) -> dict:
    """
    Extract archive information from a URL.

    Returns:
        A dict with the keys ``is_archived``, ``archive_service``,
        ``original_url``, ``archive_date`` and ``archive_id``. For links that
        are not recognised (or empty input) ``is_archived`` is False and the
        other fields are empty strings.

    Archive IDs are captured from the href as given, so their letter case is
    preserved rather than lower-cased.
    """
    info = {
        "is_archived": False,
        "archive_service": "",
        "original_url": "",
        "archive_date": "",
        "archive_id": ""
    }

    if not href:
        return info

    # Wayback Machine: accepts timestamps of 1-14 digits and an optional
    # two-letter modifier ("if_", "id_", ...), like extract_original_from_archive().
    wayback_match = re.search(
        r'web\.archive\.org/web/(\d{1,14})(?:[a-z]{2}_)?/(.*)', href, re.IGNORECASE
    )
    if wayback_match:
        info["is_archived"] = True
        info["archive_service"] = "Internet Archive (Wayback Machine)"
        info["archive_date"] = wayback_match.group(1)  # YYYYMMDDHHMMSS or a prefix of it
        info["original_url"] = unquote(wayback_match.group(2))
        return info

    # Archive.today variants
    archive_today_match = re.search(
        r'archive\.(today|is|fo|ph|md|vn)/([A-Za-z0-9]+)', href, re.IGNORECASE
    )
    if archive_today_match:
        info["is_archived"] = True
        info["archive_service"] = f"Archive.{archive_today_match.group(1).lower()}"
        info["archive_id"] = archive_today_match.group(2)
        return info

    # Perma.cc
    perma_match = re.search(r'perma\.cc/([A-Za-z0-9-]+)', href, re.IGNORECASE)
    if perma_match:
        info["is_archived"] = True
        info["archive_service"] = "Perma.cc"
        info["archive_id"] = perma_match.group(1)
        return info

    # Google Cache
    cache_match = re.search(
        r'webcache\.googleusercontent\.com/search\?q=cache:(.*)', href, re.IGNORECASE
    )
    if cache_match:
        info["is_archived"] = True
        info["archive_service"] = "Google Cache"
        info["original_url"] = unquote(cache_match.group(1))
        return info

    return info

def classify_platform(url: str) -> str:
    """
    Classify the platform a URL belongs to.

    Domains are matched case-insensitively and as whole domain names anywhere
    in the URL, so "netflix.com" is not mistaken for "x.com", while sub-domains
    and URLs embedded in archive links are still recognised. The short domains
    accepted by ``is_direct_social_link`` (youtu.be, fb.com, wa.me) map to their
    parent platform.

    Args:
        url: URL to classify (``None`` or "" is treated as unknown).

    Returns:
        A platform name such as "Facebook" or "Twitter/X", or "Web/other" when
        no known platform domain is found. When several platforms appear, the
        first one in the mapping order wins.
    """
    mapping = {
        "facebook.com": "Facebook", "fb.com": "Facebook",
        "twitter.com": "Twitter/X", "x.com": "Twitter/X",
        "tiktok.com": "TikTok", "instagram.com": "Instagram",
        "youtube.com": "YouTube", "youtu.be": "YouTube",
        "whatsapp.com": "WhatsApp", "wa.me": "WhatsApp",
        "threads.net": "Threads", "linkedin.com": "LinkedIn"
    }
    url_lower = (url or "").lower()
    for dom, name in mapping.items():
        # The domain must not be glued to other host-name characters on either side.
        if re.search(r'(?<![a-z0-9-])' + re.escape(dom) + r'(?![a-z0-9.-])', url_lower):
            return name
    return "Web/other"

# Topic -> keywords (English, Yoruba, Hausa). Order matters: classify_domain()
# returns the first topic with a matching keyword.
_DOMAIN_KEYWORDS = {
    "Politics": [
        "politics", "election", "government", "inec", "apc", "pdp", "osun", "ondo", "kogi",
        "lagos", "senate", "lawmaker", "house of reps", "presidency", "governor", "minister",
        "assembly", "tribunal", "manifesto", "campaign", "oshimole", "obasanjo", "zoning", "judiciary",
        # Yoruba
        "oselu", "idibo", "ijoba", "alakoso", "asoju", "aare", "gomina", "ibosipo",
        # Hausa
        "siyasa", "zabe", "gwamnati", "inukum", "yan majalisa", "shugaba", "sakataren", "gwamna"
    ],
    "Health": [
        "health", "covid-19", "ebola", "lassa fever", "hospital", "medical", "vaccine",
        "chikungunya", "cholera", "malaria", "nysc", "fertility", "pregnancy", "maternal",
        "nhis", "medicine", "immunization", "hiv", "baban gida", "diabetes", "cancer",
        # Yoruba
        "ilera", "iwosan", "itọju", "kokoro", "ajẹsára", "ile-iwosan", "tí obìnrin fi ń loyun", "àyà", "àìlera", "kokoro HIV", "arun inu ọkan",
        # Hausa
        "lafiya", "asibiti", "likita", "magani", "rigakafi", "ciki", "haihuwa", "jinya", "zazzabin cizon sauro", "zazzabin lassa", "cutar kanjamau"
    ],
    "Security": [
        "security", "boko haram", "insecurity", "police", "military", "efcc", "icpc", "ipob", "bandits",
        "ta'addanci", "kidnap", "robbery", "sars", "terrorism", "drug", "herdsmen", "amotekun", "vigilante",
        "dsd", "nscdc", "officer", "navy", "airforce", "crime", "violence", "bomb",
        # Yoruba
        " aabo", "olopa", "ọmọ ogun", "lẹtisi", "ole", "ikọlu", "olè", "ipaniyan", "ogun",
        # Hausa
        "tsaro", "yan sanda", "sojoji", "laifi", "barayi", "ta'addanci", "garkuwa da mutane", "fashi da makami"
    ],
    "Economy": [
        "economy", "business", "naira", "cbn", "fuel", "subsidy", "bvn", "nin", "customs", "kasuwar",
        "inflation", "market", "trade", "budget", "investment", "agriculture", "tax", "loan",
        "poverty", "fintech", "pension", "salary",
        # Yoruba
        "ilo-aje", "ọjà", "owo", "owo orílẹ̀-èdè", "ṣowo", "owo-ìbáyè", "ilana", "isowo", "ifowopamọ́",
        # Hausa
        "tattalin arziki", "kasuwanci", "naira", "man fetur", "tara", "haraji", "zuba jari", "kudi", "rancen", "talauci"
    ],
    "Social Issues": [
        "social", "education", "school", "university", "waec", "jamb", "strike", "teachers", "youth", "women",
        "children", "religion", "church", "mosque", "imam", "pastor", "gender", "marriage", "hijab", "aure", "mata",
        "rape", "abuse", "orphanage", "almajiri", "divorce", "nyanya", "out-of-school", "girl-child",
        # Yoruba
        "awujọ", "ẹkọ", "ile-iwe", "yunifasiti", "oluko", "ọmọde", "obinrin", "okunrin", "ijọ", "imọlẹ", "adúláwọ̀", "iyawo",
        # Hausa
        "zamantakewa", "ilimi", "makaranta", "dalibi", "malami", "matasa", "mata", "yara", "addini", "coci", "masallaci", "aure", "divorce", "hijabi"
    ],
    "Fact-Check/Media": [
        "fact-check", "media", "misinformation", "disinformation", "hoax", "fake news", "viral",
        "claim", "tweet", "whatsapp", "forward", "rumour", "spread", "photo", "video", "audio",
        "false", "debunk", "verify",
        # Yoruba
        "ayewo otito", "iro", "aworan", "fidio", "gbo", "igbega iro",
        # Hausa
        "tabbatar da gaskiya", "karya", "hoton karya", "bidiyon jabu", "jita-jita"
    ],
    "Foreign Affairs": [
        "international", "foreign", "russia", "ukraine", "israel", "iran", "finland", "america",
        "ghana", "ecowas", "un", "uk", "diaspora", "chad", "ivory coast", "diplomat", "japan",
        "embassy", "abroad", "passport", "visa",
        # Yoruba
        "káríayé", " orilẹ-ede", "ajo",
        # Hausa
        "kasashen waje", "jakadanci", "fasfo", "biza"
    ],
    "Environment": [
        "environment", "flood", "climate", "erosion", "rainfall", "desertification", "oil spill",
        "deforestation", "wildlife", "lagoon", "pollution", "waste", "drought", "bush burning",
        "weather", "temperature", "solar", "wind", "renewable",
        # Yoruba
        "ayika", "omilu", "ojo", "ifaseyin", "erukeru", "irinse", "imo-tutu", "ijamba",
        # Hausa
        "muhalli", "ambaliya", "guguwa", "ruwan sama", "daji", "gurbacewa", "daskararwa"
    ],
    "Technology/Innovation": [
        "technology", "tech", "startup", "innovation", "ict", "digital", "internet", "data", "ai",
        "blockchain", "cryptocurrency", "app", "software", "hardware", "solar", "gsm", "telco",
        "ncc", "npf", "nitel", "mtn", "glo", "etisalat", "5g",
        # Yoruba
        "imotuntun", "imoye", "ọjọgbọn", "ayelujara", "ọrọ-igbaniwọle",
        # Hausa
        "fasaha", "intanet", "sabunta", "wayar salula"
    ],
    "Scam & Fraud": [
        "scam", "419", "fraud", "phishing", "lottery", "winner", "urgent", "money", "fake", "blackmail",
        "atm", "card", "bank", "job offer", "investment", "advance fee", "testimony", "loan", "fraudster",
        # Yoruba
        "jegudujera", "ole", "owo-jebu", "owo-kibo", "iṣowo aṣejẹ", "itanjẹ",
        # Hausa
        "zamba", "damfara", "burtu", "karya", "banki", "katin"
    ],
    "Culture/History": [
        "culture", "history", "tradition", "language", "ethnic", "festival", "yoruba", "hausa", "igbo",
        "heritage", "royal", "oba", "emir", "sarki", "chief", "chieftaincy", "legend", "oral", "folklore",
        # Yoruba
        "asa", "itan", "adehun", "ede", "ajọyọ", "yoruba", "oba", "ojogbon",
        # Hausa
        "al'adu", "tarihi", "yare", "sarki", "sarauta", "biki", "bahaushe", "bature"
    ],
    "Sports/Entertainment": [
        "sports", "football", "olympics", "super eagles", "fifa", "entertainment", "music", "movie",
        "nollywood", "celebrity", "actor", "actress", "song", "album", "club", "premier league",
        # Yoruba
        "ere idaraya", "bola", "orin", "filimu", "fiimu", "ayeye", "osan",
        # Hausa
        "wasanni", "kwallon kafa", "waka", "fim", "wakoki"
    ],
    "Transportation": [
        "transportation", "road", "rail", "train", "lagos-ibadan", "flight", "plane", "airport",
        "danfo", "vehicle", "driver", "traffic", "accident", "bridge", "seaport", "bus", "ticket",
        # Yoruba
        "irin-ajo", "opopona", " ọkọ̀ ayọ́kẹ́lẹ́", "ona oko",
        # Hausa
        "jirgin kasa", "titi", "motar", "direba", "hanya", "jirgin sama"
    ],
    "Governance/Public Service": [
        "governance", "public service", "civil servant", "reform", "parastatal", "ministry",
        "commission", "project", "contract", "award", "policy", "ngo", "cso",
        # Yoruba
        "ijoba", "ise ijoba", "oludari", "amojuto", "ofisi", "agbanisiṣẹ",
        # Hausa
        "gudanarwa", "aikin gwamnati", "ma’aikata", "koma da tsarin", "sashi", "ma’aikata gwamnati"
    ],
    "Religion": [
        "religion", "islam", "christianity", "pastor", "imam", "church", "mosque", "pilgrimage",
        "ramadan", "hajj", "fast", "prayer",
        # Yoruba
        "ẹsin", "islam", "kristẹni", "alufa", "imamu", "ijo", "masalasi", "adua", "ireti",
        # Hausa
        "addini", "musulunci", "kiristanci", "liman", "fasto", "coci", "masallaci", "sallah", "addua"
    ],
    "Miscellaneous": [
        "miscellaneous", "other", "unknown", "general", "misc", "uncategorized",
        # Yoruba
        "miran", "aitan",
        # Hausa
        "wani", "sauran", "bambanci"
    ]
}


def _domain_keyword_pattern(keywords):
    """Compile one regex that finds any of ``keywords`` starting at a word boundary."""
    alternatives = []
    for keyword in keywords:
        term = keyword.strip().lower()  # text is lower-cased, so keywords must be too
        if not term:
            continue
        escaped = re.escape(term)
        if len(term) < 4:
            # Short keywords ("un", "ai", "nin") must be whole words, otherwise
            # they match inside unrelated words such as "fund" or "naira".
            escaped += r'(?!\w)'
        # Longer keywords may be followed by a suffix ("election" -> "elections").
        alternatives.append(escaped)
    return re.compile(r'(?<!\w)(?:' + '|'.join(alternatives) + r')')


# Compiled once instead of rebuilding the keyword table on every call.
_DOMAIN_PATTERNS = [
    (domain, _domain_keyword_pattern(keywords))
    for domain, keywords in _DOMAIN_KEYWORDS.items()
]


def classify_domain(tags: list, url: str, title: str) -> str:
    """
    Classify an article into a topic domain from keywords in its tags, URL and title.

    The three inputs are lower-cased and joined, then checked against each
    topic's keywords in the order of ``_DOMAIN_KEYWORDS``; the first topic with
    a hit is returned. Keywords must start at a word boundary; keywords shorter
    than four characters must also end at one.

    Args:
        tags: Iterable of tag strings (``None`` or a single string is accepted).
        url: Article URL (``None`` is treated as "").
        title: Article title (``None`` is treated as "").

    Returns:
        The matching topic name, or "Other" when no keyword matches.
    """
    if tags is None:
        tags = []
    elif isinstance(tags, str):
        tags = [tags]

    # Combine all text sources for a comprehensive check
    combined_text = " ".join([
        " ".join(str(tag) for tag in tags),
        url or "",
        title or "",
    ]).lower()

    for domain, pattern in _DOMAIN_PATTERNS:
        if pattern.search(combined_text):
            return domain
    return "Other"

# Case-insensitive whole-word match for "video", "watch" or "vlog".
VIDEO_REGEX = re.compile(r'\b(video|watch|vlog)\b', re.I)

def build_record(platform, url, claim, verdict, links, meta, article_body, domain, claim_language=None):
    """
    Build one structured fact-check record.

    Args:
        platform: Value stored under "platform".
        url: Value stored under "source_url".
        claim: Claim text, stored under "extracted_claim_text".
        verdict: Value stored under "verdict".
        links: Claim links, stored as given under "claim_links"; ``None`` is
            accepted and treated as "no links" when deriving platforms.
        meta: Value stored under "metadata".
        article_body: Value stored under "article_body".
        domain: Value stored under "domain".
        claim_language: Language code of the claim; when falsy it is detected
            from ``claim`` with ``safe_detect``.

    Returns:
        A dict with a fresh UUID4 "id". "claim_platforms" is the sorted list of
        distinct platforms of ``links`` (sorted so the output is identical from
        run to run), or ["unlinked"] when there are no links.
    """
    print(f"Passed claim language: {claim_language}")

    claim_platforms = sorted({classify_platform(link) for link in (links or [])}) or ["unlinked"]

    return {
        "id": str(uuid.uuid4()),
        "extracted_claim_text": claim,
        "verdict": verdict,
        "domain": domain,
        "claim_language": claim_language if claim_language else safe_detect(claim, claim_language),
        "claim_links": links,
        "claim_platforms": claim_platforms,
        "source_url": url,
        "article_body": article_body,
        "platform": platform,
        "metadata": meta,
    }

#### Filters and metrics utils class

In [None]:
class ContentFilter:
    """
    Decides whether a scraped article should be kept.

    Each class attribute is a compiled, case-insensitive regex for one kind of
    content to reject (round-ups, media posts, opinion, ads, obituaries) or for
    the Nigeria / other-country location check. Keywords are given in English
    alongside local-language equivalents.
    """

    # Digest / round-up style posts (substring match).
    roundup_patterns = re.compile(
        r"round-?up|weekly\s+digest|monthly\s+(review|highlights)|top\s+\d+\s+investigations|"
        r"takaitawa|kojọpọ|akopọ|nchịkọta|nyocha",
        re.I
    )

    # Video / photo posts; only checked against the title and URL.
    media_patterns = re.compile(
        r"\b(video|watch|vlog|picture|photo|youtube|vimeo|livestream|fact-check shows:|explainer video|"
        r"bidiyo|hoto|fidio|fíìmù|aworan|vidio|foto|ihe\s+nkiri)\b",
        re.I
    )

    # Words that mark an article as Nigeria-related.
    nigeria_kw = re.compile(
        r"\b(nigeria|abuja|lagos|kano|nigerian|najeriya|legos|kano|nàìjíríà|àbújá|èkó|naìjíríyà|àbùjà|ègósì)\b",
        re.I
    )

    # Other countries; a hit only excludes when no Nigeria keyword is present.
    exclude_kw = re.compile(
        r"\b(ghana|kenya|south\s+africa|uganda|sierra|cameroon|liberia|congo|kenyan|gana|kamaru|laberiya|kamẹroonì|làìbéríà|kamerun|laberia)\b",
        re.I
    )

    # The three patterns below are anchored on whole words with
    # (?<!\w)...(?!\w). Unanchored, they fired inside unrelated words that are
    # routine in fact-check prose ("attributed" -> tribute, "promote" -> promo),
    # silently discarding valid articles. Lookarounds are used rather than \b
    # so a keyword that ends in a combining diacritic still matches.
    opinion_patterns = re.compile(
        r"(?<!\w)(?:opinion|commentary|op-ed|editorial|my take|ra'ayi|ìmọ̀ràn|o rò pé|echiche|nkwado)(?!\w)",
        re.I
    )

    ad_patterns = re.compile(
        r"(?<!\w)(?:sponsored|advertisement|promo|in partnership with|paid for|tallafi|talla|ìpolówó|onigbọwọ|mgbasa ozi)(?!\w)",
        re.I
    )

    obituary_patterns = re.compile(
        r"(?<!\w)(?:obituary|tribute|condolence|remembering|passed away|rasuwa|ta'aziyya|iku|ìdúró-ìyìn|àbúrò|ọnwụ|nsọpụrụ|nwụọ)(?!\w)",
        re.I
    )

    def should_include(self, title: str, text: str, url: str) -> bool:
        """
        Return True when the article passes every exclusion rule.

        Args:
            title: Article title.
            text: Article body text.
            url: Article URL.

        Returns:
            False if the title/URL looks like a media post, if title, text or
            URL match the opinion, ad, obituary or round-up patterns, or if
            another country is mentioned without any Nigeria keyword;
            True otherwise.
        """
        title_and_url = f"{title} {url}"
        full_blob = f"{title} {text} {url}"

        # 1. Check for media, opinion, ads, or obituaries
        if self.media_patterns.search(title_and_url):
            return False
        if self.opinion_patterns.search(full_blob):
            return False
        if self.ad_patterns.search(full_blob):
            return False
        if self.obituary_patterns.search(full_blob):
            return False

        # 2. Check for round-up articles
        if self.roundup_patterns.search(full_blob):
            return False

        # 3. Check for non-Nigeria articles
        if self.exclude_kw.search(full_blob) and not self.nigeria_kw.search(full_blob):
            return False

        return True

class LRLMetrics:
    """
    Running tallies of claim languages, with a focus on low-resource languages.

    Attributes are updated one record at a time through ``update``.
    """

    def __init__(self):
        # total: records seen; lrl: records counted as low-resource-language.
        self.total = self.lrl = 0
        # Per-primary-language counts; "code" counts code-switched records and
        # "unknown" collects every primary language without its own key.
        self.lang_counts = {"ha":0,"yo":0,"ig":0,"en":0,"code":0,"unknown":0}

    def update(self, lang_info):
        """
        Fold one language-detection result into the counters.

        Args:
            lang_info: Mapping that may carry ``primary_language``,
                ``detected_lrls`` and ``is_code_switched``. ``None`` or an
                empty mapping is counted as an unknown-language record.
        """
        info = lang_info or {}
        self.total += 1
        primary = info.get("primary_language", "unknown")
        lrls    = info.get("detected_lrls", [])
        codesw  = info.get("is_code_switched", False)

        # "en" is already a key of lang_counts, so the separate branch the
        # original had for it could never run; one lookup covers every case.
        bucket = primary if primary in self.lang_counts else "unknown"
        self.lang_counts[bucket] += 1

        if lrls or primary in ("ha", "yo", "ig"):
            self.lrl += 1
        if codesw:
            self.lang_counts["code"] += 1

    def share(self):
        """Return the percentage of records counted as low-resource (0 when empty)."""
        return 0 if not self.total else (self.lrl / self.total) * 100

### Platforms Crawler and Parser

##### Platform: Dubawa **(dubawa.org)**

In [None]:
# Case-insensitive substring match for "roundup"/"round-up" or "investigations".
# iter_dubawa_article_urls applies it to listing titles to skip digest posts.
ROUNDUP_REGEX = re.compile(r"round-?up|investigations", re.I)

def iter_dubawa_article_urls(name: str, max_pages: int, cache: dict):
    """
    Yield URLs of Nigeria-related articles from Dubawa's fact-check listing.

    Walks ``/category/fact-check/page/<n>`` for at most ``max_pages`` pages,
    starting at the page stored in ``cache["iter_state"][name]["page"]``
    (default 1). Round-up and video posts are skipped by title/URL. Pagination
    stops early when a listing page cannot be fetched or after three
    consecutive pages without a Nigeria-related article.

    Args:
        name: Key under ``cache["iter_state"]`` holding this platform's state.
        max_pages: Maximum number of listing pages to fetch in this call.
        cache: Mutable state dict. ``page`` (the next page to fetch) is saved
            as soon as a listing page is fetched, i.e. before its articles are
            yielded; ``empty_pages`` is saved after each page is processed.

    Yields:
        str: Absolute article URL.
    """
    root = "https://dubawa.org"
    start_page = cache.get("iter_state", {}).get(name, {}).get("page", 1)

    # Enhanced Nigeria keywords (comprehensive list).
    # These are matched as plain substrings on purpose so derived forms such
    # as "nigerians" still match; the cost is occasional over-inclusion
    # (e.g. "obi" inside "mobile").
    essential_keywords = [
        # Core locations
        "nigeria", "nigerian", "abuja", "lagos", "kano", "kaduna", "port harcourt", "ibadan",
        "katsina", "maiduguri", "jos", "ilorin", "owerri", "enugu", "abeokuta", "sokoto",
        # Political figures (current and recent)
        "buhari", "tinubu", "osinbajo", "sanwo-olu", "el-rufai", "wike", "atiku", "obi",
        "shettima", "ganduje", "zulum", "makinde", "soludo", "uzodimma",
        # Key institutions
        "cbn", "central bank", "nnpc", "inec", "efcc", "presidency", "aso rock",
        "nass", "national assembly", "supreme court", "cjn", "icpc", "dss",
        # Economic/cultural
        "naira", "nollywood", "afrobeats", "boko haram", "ipob", "biafra",
        # States and major cities
        "cross river", "delta", "rivers", "bayelsa", "akwa ibom", "edo", "ondo",
        "osun", "oyo", "ogun", "ekiti", "kwara", "kogi", "benue", "plateau",
        "nasarawa", "niger", "fct", "kebbi", "zamfara", "sokoto", "katsina",
        "jigawa", "yobe", "borno", "adamawa", "taraba", "gombe", "bauchi"
    ]

    other_countries = [
        "south africa", "kenya", "ghana", "senegal", "cameroon", "chad",
        "niger republic", "benin republic", "togo", "ivory coast", "mali"
    ]
    # Whole-word match with an optional demonym/plural suffix ("ghanaian",
    # "kenyans", "togolese"). A bare substring test treated "malicious" as a
    # mention of Mali and so rejected genuine Nigerian articles.
    other_countries_re = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(c) for c in other_countries) + r")"
        r"(?:n|an|ian|ese|lese)?s?(?!\w)"
    )

    def enhanced_nigeria_check(title: str, preview_text: str = "") -> bool:
        """Nigeria detection with false positive filtering"""
        combined_text = f"{title} {preview_text}".lower()

        # If other countries mentioned and Nigeria not mentioned, likely not Nigeria-focused
        if other_countries_re.search(combined_text) and "nigeria" not in combined_text:
            return False

        # Check for any Nigeria keywords
        return any(keyword in combined_text for keyword in essential_keywords)

    # Multiple selectors for better article discovery
    article_selectors = [
        "h2.post-title a[href]",   # Primary selector
        "h3.post-title a[href]",   # Alternative
        ".entry-title a[href]",    # Fallback
        "article h2 a[href]",      # Another fallback
        ".post-header h2 a[href]"  # Additional fallback
    ]

    # Smart pagination state
    consecutive_empty_pages = cache.get("iter_state", {}).get(name, {}).get("empty_pages", 0)
    total_articles_found = 0
    nigeria_articles_found = 0

    for p in range(start_page, start_page + max_pages):
        print(f"[DEBUG] Fetching Dubawa listing page: {p}")

        try:
            soup = get_soup(f"{root}/category/fact-check/page/{p}")
            # Save next page after successful fetch
            cache.setdefault("iter_state", {}).setdefault(name, {})["page"] = p + 1
            save_cache(cache)
        except Exception as e:
            print(f"[ERROR] Could not fetch listing page {p} for Dubawa: {e}")
            break # Stop this platform if a listing page fails

        page_urls_found = 0
        page_nigeria_urls = 0

        # Try each selector; the first one that finds anything is the only one used
        for selector in article_selectors:
            articles = soup.select(selector)
            if not articles:
                continue

            print(f"[DEBUG] Using selector '{selector}' - found {len(articles)} articles")

            for a in articles:
                try:
                    title = a.get_text(strip=True)
                    url = urljoin(root, a["href"])

                    # Skip roundups and videos
                    if ROUNDUP_REGEX.search(title) or VIDEO_REGEX.search(title) or "/video" in url:
                        continue

                    page_urls_found += 1
                    total_articles_found += 1

                    # Get preview text if available.
                    # find_parent() takes a tag name, not a CSS selector, so
                    # the "post" container has to be looked up by class.
                    preview_text = ""
                    article_container = a.find_parent('article') or a.find_parent(class_='post')
                    if article_container:
                        preview_elem = article_container.select_one('.excerpt, .post-excerpt, p')
                        if preview_elem:
                            preview_text = preview_elem.get_text(strip=True)[:200]

                    # Nigeria keyword filtering
                    if enhanced_nigeria_check(title, preview_text):
                        page_nigeria_urls += 1
                        nigeria_articles_found += 1
                        print(f"[FOUND] Nigeria article: {title[:60]}...")
                        yield url

                except Exception as e:
                    print(f"[WARN] Error processing article link: {e}")
                    continue

            break  # Successfully used a selector, no need to try others

        print(f"[DEBUG] Page {p}: Found {page_urls_found} total articles, {page_nigeria_urls} Nigeria articles")

        # Smart pagination logic
        if page_nigeria_urls == 0:
            consecutive_empty_pages += 1
            print(f"[INFO] No Nigeria articles on page {p}. Empty pages: {consecutive_empty_pages}/3")
        else:
            consecutive_empty_pages = 0  # Reset counter when content found

        # Update cache with empty page tracking
        cache.setdefault("iter_state", {}).setdefault(name, {})["empty_pages"] = consecutive_empty_pages
        save_cache(cache)

        # Stop after 3 consecutive empty pages
        if consecutive_empty_pages >= 3:
            print(f"[INFO] Stopping pagination after {consecutive_empty_pages} consecutive empty pages for {name}.")
            break
    else:
        # Runs only when the loop was not cut short by a break. The original
        # tested for this inside the loop, where the condition could never hold.
        print(f"[INFO] Reached maximum pages ({max_pages}) for {name}. Stopping.")

    print(f"[SUMMARY] {name}: {total_articles_found} total articles, {nigeria_articles_found} Nigeria articles")


def parse_dubawa(url: str, lang_code: str = "en"):
    """
    Parse one Dubawa fact-check article into a single-element list.

    Fetches the page with ``get_soup`` and extracts, each with several fallback
    strategies: the body text, page metadata (authors, dates, tags, categories,
    images), the claim, the verdict and any social-media links. The result is
    assembled with ``build_record``.

    Args:
        url: URL of the Dubawa article.
        lang_code: Language code of the article. ``"en"`` uses the English
            verdict strategies; any other code uses the keyword tables for
            "ha", "yo" and "ig" (codes without a table yield "Unrated").

    Returns:
        list[dict]: One record on success. If the page cannot be fetched, or
        the record cannot be built, one dict with an ``error`` key instead.
    """
    print(f"[DEBUG] Parsing Dubawa article: {url} (Language: {lang_code})")

    try:
        soup = get_soup(url)
        if not soup:
            print(f"[ERROR] Failed to get soup for {url}")
            return [{"error": "Failed to fetch HTML", "url": url}]
    except Exception as e:
        print(f"[ERROR] Failed to fetch article: {e}")
        return [{"error": f"Fetch failed: {e}", "url": url}]

    # Placeholder text left in the page by the audio-player widget.
    audio_notice = 'Getting your Trinity Audio player ready'

    # ENHANCED BODY TEXT EXTRACTION
    def extract_body_text():
        """Extract body text with multiple fallback selectors"""
        body_selectors = [
            "article .entry-content",     # Primary Dubawa content
            "article .post-content",      # Post content
            ".single-post-content",       # Single post
            "article",                    # Article element
            ".content",                   # General content
            "main .post",                 # Main post area
            ".post-body"                  # Post body
        ]

        for selector in body_selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(" ", strip=True)
                cleaned_text = text.replace(audio_notice, ' ')
                if len(cleaned_text) > 100:  # Ensure substantial content
                    print(f"[DEBUG] Body text extracted using: {selector}")
                    return cleaned_text

        # Final fallback: the whole page (soup is known to be truthy here)
        return soup.get_text(" ", strip=True).replace(audio_notice, ' ')

    body_text = extract_body_text()
    print(f"[DEBUG] Extracted body text: {len(body_text)} characters")

    def _absolute_image_url(src):
        """Return src as an absolute URL, or None if it is neither http(s) nor root-relative."""
        if src.startswith('http'):
            return src
        if src.startswith('/'):
            return urljoin(url, src)
        return None

    def _first_date_value(selectors):
        """Return the first non-empty date string found by the selectors, else None."""
        for selector in selectors:
            try:
                elem = soup.select_one(selector)
                if elem:
                    if elem.name == 'meta':
                        date_str = elem.get('content')
                    elif elem.name == 'time':
                        date_str = elem.get('datetime') or elem.get_text(strip=True)
                    else:
                        date_str = elem.get_text(strip=True)

                    if date_str:
                        return date_str
            except Exception:
                continue
        return None

    # COMPREHENSIVE METADATA EXTRACTION
    def extract_comprehensive_metadata():
        """Extract comprehensive metadata including author, dates, images, etc."""
        meta = {
            "country": "NG",
            "language": lang_code,
            "extraction_timestamp": datetime.datetime.now().isoformat(),
            "source_url": url,
            "platform": "Dubawa"
        }

        # AUTHOR INFORMATION
        author_selectors = [
            ".author-name",                    # Direct author name
            ".post-author a",                  # Post author link
            ".byline a",                       # Byline
            'a[rel="author"]',                 # Author rel attribute
            ".entry-meta .author a",           # Entry meta author
            'meta[name="author"]',             # Meta author
            ".post-meta .author",              # Post meta author
            ".article-author a"                # Article author
        ]

        authors = []
        for selector in author_selectors:
            try:
                elements = soup.select(selector)
                for elem in elements:
                    if elem.name == 'meta':
                        author_text = elem.get('content', '').strip()
                    else:
                        author_text = elem.get_text(strip=True)

                    if author_text and author_text not in authors and len(author_text) < 100:
                        authors.append(author_text)
            except Exception:
                continue

        meta["authors"] = authors if authors else None
        meta["primary_author"] = authors[0] if authors else None

        # PUBLICATION DATE (key is only set when a value is found)
        published_at = _first_date_value([
            'meta[property="article:published_time"]',
            'time[datetime]',
            '.published-date',
            '.post-date',
            '.entry-date',
            'meta[name="publish-date"]'
        ])
        if published_at:
            meta["published_at"] = published_at

        # MODIFIED DATE (key is only set when a value is found)
        modified_at = _first_date_value([
            'meta[property="article:modified_time"]',
            'time[class*="modified"]',
            '.updated-date'
        ])
        if modified_at:
            meta["modified_at"] = modified_at

        # TAGS AND CATEGORIES
        tag_selectors = [
            'a[rel="tag"]',
            '.post-tags a',
            '.tags a',
            '.entry-tags a',
            'meta[name="keywords"]'
        ]

        tags = set()
        for selector in tag_selectors:
            try:
                if 'meta' in selector:
                    elem = soup.select_one(selector)
                    if elem and elem.get('content'):
                        tags.update([t.strip() for t in elem.get('content').split(',')])
                else:
                    for elem in soup.select(selector):
                        tag_text = elem.get_text(strip=True)
                        if tag_text:
                            tags.add(tag_text)
            except Exception:
                continue

        # Sorted so the stored list does not depend on set iteration order.
        meta["tags"] = sorted(tags)

        # CATEGORIES
        category_selectors = [
            '.post-category a',
            '.category a',
            '.entry-category a'
        ]

        categories = []
        for selector in category_selectors:
            try:
                for elem in soup.select(selector):
                    cat_text = elem.get_text(strip=True)
                    if cat_text and cat_text not in categories:
                        categories.append(cat_text)
            except Exception:
                continue

        meta["categories"] = categories if categories else []

        # IMAGE EXTRACTION
        image_selectors = [
            'article img[src]',
            '.post-content img[src]',
            '.entry-content img[src]',
            '.featured-image img[src]',
            'meta[property="og:image"]',
            'meta[name="twitter:image"]'
        ]

        image_urls = set()
        for selector in image_selectors:
            try:
                if 'meta' in selector:
                    elem = soup.select_one(selector)
                    if elem and elem.get('content'):
                        resolved = _absolute_image_url(elem.get('content'))
                        if resolved:
                            image_urls.add(resolved)
                else:
                    for img in soup.select(selector):
                        src = img.get('src') or img.get('data-src')
                        if src:
                            resolved = _absolute_image_url(src)
                            if resolved:
                                image_urls.add(resolved)
            except Exception:
                continue

        # Sorted so the stored list does not depend on set iteration order.
        meta["image_urls"] = sorted(image_urls)

        # FEATURED IMAGE
        featured_selectors = [
            'meta[property="og:image"]',
            '.featured-image img[src]',
            '.post-thumbnail img[src]'
        ]

        for selector in featured_selectors:
            try:
                if 'meta' in selector:
                    elem = soup.select_one(selector)
                    if elem and elem.get('content'):
                        meta["featured_image"] = elem.get('content')
                        break
                else:
                    img = soup.select_one(selector)
                    if img and img.get('src'):
                        resolved = _absolute_image_url(img.get('src'))
                        if resolved:
                            meta["featured_image"] = resolved
                        # Stop at the first image element even if its src
                        # could not be resolved.
                        break
            except Exception:
                continue

        # WORD COUNT
        if body_text:
            meta["word_count"] = len(body_text.split())

        # CONTENT TYPE
        title = soup.title.get_text() if soup.title else ""
        if any(term in title.lower() for term in ['fact-check', 'factcheck', 'dubawa']):
            meta["content_type"] = "fact-check"
        else:
            meta["content_type"] = "article"

        return meta

    meta = extract_comprehensive_metadata()

    # CLAIM EXTRACTION
    def extract_claim():
        """Enhanced claim extraction with multiple strategies"""

        # Strategy 1: Look for structured claim sections first
        claim_selectors = [
            '.claim',                         # Direct claim class
            '.fact-check-claim',             # Fact-check specific
            '.statement',                    # Statement being checked
            '[data-claim]',                  # Data attribute
            '.highlighted-claim',            # Highlighted claims
            '.post-claim'                    # Post claim
        ]

        for selector in claim_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    claim_text = element.get_text(strip=True)
                    if len(claim_text) > 10:
                        print(f"[DEBUG] Claim found via selector '{selector}'")
                        return claim_text
            except Exception:
                continue

        # Strategy 2: Enhanced strong element parsing (original strategy improved)
        try:
            for strong in soup.select("p strong, div strong, article strong, h3 strong, h4 strong"):
                txt = strong.get_text(strip=True).lower()
                if any(txt.startswith(indicator) for indicator in ["claim", "the claim", "allegation", "statement"]):
                    parent_text = strong.parent.get_text(strip=True)

                    # Extract text after the claim indicator. The marker is
                    # located case-insensitively but the slice is taken from
                    # the original text, so the claim keeps its casing
                    # (splitting a lowercased copy returned it lowercased).
                    for indicator in ["claim:", "the claim:", "allegation:", "statement:"]:
                        marker = re.search(re.escape(indicator), parent_text, re.IGNORECASE)
                        if marker:
                            claim = parent_text[marker.end():].strip(" –:-")
                            if len(claim) > 15:  # Ensure substantial claim
                                print(f"[DEBUG] Claim found via strong element")
                                return claim
        except Exception:
            pass

        # Strategy 3: Look for quoted content or blockquotes
        try:
            quote_selectors = ['blockquote', '.quote', 'q', '.claim-quote']
            for selector in quote_selectors:
                quote_elem = soup.select_one(selector)
                if quote_elem:
                    quote_text = quote_elem.get_text(strip=True)
                    if 50 < len(quote_text) < 500:  # Reasonable claim length
                        print(f"[DEBUG] Claim found in quote")
                        return quote_text
        except Exception:
            pass

        # Strategy 4: H1 fallback (original strategy)
        try:
            if soup.h1:
                h1_text = soup.h1.get_text(strip=True)
                # Skip if H1 looks like a generic title
                title_indicators = ['fact check:', 'dubawa:', 'analysis:', 'report:']
                if not any(indicator in h1_text.lower() for indicator in title_indicators):
                    print(f"[DEBUG] Using H1 as claim")
                    return h1_text
        except Exception:
            pass

        # Strategy 5: First substantial paragraph with claim indicators
        try:
            claim_indicators = ['claims that', 'alleged that', 'stated that', 'said that', 'according to']
            for para in soup.select('article p, .content p, .post-content p'):
                para_text = para.get_text(strip=True)
                if any(indicator in para_text.lower() for indicator in claim_indicators):
                    if 50 < len(para_text) < 400:
                        print(f"[DEBUG] Using paragraph with claim indicator")
                        return para_text
        except Exception:
            pass

        return "Claim not found"

    claim = extract_claim()

    # VERDICT EXTRACTION
    def extract_verdict():
        """
        Enhanced verdict extraction with comprehensive strategies.

        Returns a ``(verdict, context)`` tuple; ``("Unrated", ...)`` when no
        strategy produces a verdict.
        """

        # For LRL (Low Resource Languages)
        if lang_code != "en":
            # Comprehensive LRL keywords (expanded from original)
            lrl_verdict_keywords = {
                "ha": {
                    # Original keywords
                    "Gaskiya": "True", "Karya": "False", "Rabin Gaskiya": "Half True",
                    "Bata-gari": "Misleading", "Bata gari": "Misleading",
                    # Expanded Hausa keywords
                    "gaskiya": "True", "karya": "False", "rabin gaskiya": "Half True",
                    "gaskiyar lamari": "True", "ba gaskiya ba": "False", "karya ne": "False",
                    "gaskiya ne": "True", "sahih": "True", "banza": "False",
                    "labarin karya": "False", "labaran karya": "False", "yaudara": "Misleading",
                    "rudu": "Misleading", "kuskure": "False", "daidai": "True",
                    "hukunci": "Verdict", "yanke hukunci": "Verdict"
                },
                "yo": {
                    # Original keywords
                    "Òótọ́": "True", "Irọ́": "False", "Iro ni": "False", "Ooto": "True",
                    "Òótọ́-òótọ́": "Mostly True", "Èké": "Misleading", "Asinilona": "Misleading",
                    "Eke": "Misleading", "Aṣinilọna": "Misleading", "Beeni": "True",
                    # Expanded Yoruba keywords
                    "otito": "True", "iro": "False", "eke": "Misleading", "ooto": "True",
                    "o daju": "True", "ko si": "False", "iro lo": "False", "beeni": "True",
                    "ipinnu": "Verdict", "idajo": "Verdict"
                },
                "ig": {
                    # Original keywords
                    "Eziokwu": "True", "Ụgha": "False", "N'ụzọ dị ukwuu eziokwu": "Mostly True",
                    "Nduhie": "Misleading", "eziokwu": "True", "ụgha": "False",
                    # Expanded Igbo keywords
                    "ezi okwu": "True", "ugha": "False", "adighi ezi": "False",
                    "o ziri ezi": "True", "okwu ụgha": "False", "ezi": "True",
                    "mkpebi": "Verdict", "ikpe": "Verdict"
                }
            }

            if lang_code in lrl_verdict_keywords:
                # Entries mapped to "Verdict" are section labels, not ratings,
                # so they must never be returned as the verdict itself.
                # Longest keywords are tried first: matching is
                # case-insensitive, so in table order "Gaskiya" (True) would
                # always win over "Rabin Gaskiya" (Half True) and
                # "ba gaskiya ba" (False). The sort is stable, so equal-length
                # keywords keep their table order.
                rating_keywords = sorted(
                    (
                        (keyword, mapped_verdict)
                        for keyword, mapped_verdict in lrl_verdict_keywords[lang_code].items()
                        if mapped_verdict != "Verdict"
                    ),
                    key=lambda pair: len(pair[0]),
                    reverse=True,
                )

                # Strategy 1: Look in structured verdict sections
                verdict_patterns = [
                    r"(?:VERDICT|HUKUNCI|ÌPINNU|MKPEBI)[:\s–-]+(.*?)(?:\.|\n|$)",
                    r"(?:CONCLUSION|ƘARSHE|IPARI|NCHỊKỌTA)[:\s–-]+(.*?)(?:\.|\n|$)"
                ]

                for pattern in verdict_patterns:
                    try:
                        match = re.search(pattern, body_text, re.IGNORECASE | re.DOTALL)
                        if match:
                            verdict_text = match.group(1).strip()
                            for keyword, mapped_verdict in rating_keywords:
                                if re.search(rf"\b{re.escape(keyword)}\b", verdict_text, re.IGNORECASE):
                                    return mapped_verdict, f"Structured verdict: {verdict_text[:100]}"
                    except Exception:
                        continue

                # Strategy 2: Enhanced body text search with context.
                # The context pattern allows zero surrounding characters, so it
                # matches exactly when the bare keyword does.
                for keyword, mapped_verdict in rating_keywords:
                    try:
                        match = re.search(rf"(.{{0,50}}\b{re.escape(keyword)}\b.{{0,100}})",
                                          body_text, re.IGNORECASE)
                        if match:
                            return mapped_verdict, match.group(1).strip()
                    except Exception:
                        continue

        else:
            # Enhanced English verdict extraction
            # Strategy 1: Enhanced strong element parsing (original improved)
            try:
                for strong in soup.select("p strong, div strong, article strong, h3 strong, h4 strong"):
                    txt = strong.get_text(strip=True).lower()
                    if any(txt.startswith(indicator) for indicator in ["verdict", "the verdict", "conclusion", "finding", "result"]):
                        parent_text = strong.parent.get_text(strip=True)

                        # Extract verdict after indicator
                        for indicator in ["verdict:", "the verdict:", "conclusion:", "finding:", "result:"]:
                            if indicator in parent_text.lower():
                                parts = parent_text.lower().split(indicator, 1)
                                if len(parts) > 1:
                                    raw_verdict = parts[1].strip()
                                    # Split on punctuation to get first sentence
                                    if '.' in raw_verdict:
                                        verdict_text, _, context = raw_verdict.partition('.')
                                    elif '!' in raw_verdict:
                                        verdict_text, _, context = raw_verdict.partition('!')
                                    else:
                                        verdict_text, context = raw_verdict, ""

                                    if verdict_text.strip():
                                        return verdict_text.strip().capitalize(), context.strip()
            except Exception:
                pass

            # Strategy 2: Look for verdict in headings
            try:
                heading_selectors = ['h2', 'h3', 'h4', 'h5']
                for selector in heading_selectors:
                    headings = soup.select(selector)
                    for heading in headings:
                        heading_text = heading.get_text(strip=True).lower()
                        if any(word in heading_text for word in ['verdict', 'conclusion', 'finding', 'result']):
                            # Look for verdict in next sibling
                            next_elem = heading.find_next_sibling()
                            if next_elem:
                                verdict_text = next_elem.get_text(strip=True)
                                if 5 < len(verdict_text) < 200:
                                    return verdict_text, "Found via heading"
                            # Check the heading text itself
                            normalized = _africacheck_normalize_verdict(heading_text)
                            if normalized != "Unrated":
                                print(f"[DEBUG] Verdict found in heading text")
                                return normalized, heading_text[:200].strip()
            except Exception:
                pass

            # Strategy 3: Look for conclusion patterns
            try:
                conclusion_patterns = [
                    "the verdict:", "our verdict:", "verdict:", "conclusion:",
                    "in conclusion:", "finding:", "the finding:", "result:"
                ]

                body_lower = body_text.lower()
                for pattern in conclusion_patterns:
                    if pattern in body_lower:
                        verdict_section = body_lower.split(pattern, 1)[-1]
                        first_sentence = verdict_section.strip().split('.')[0]
                        if not first_sentence:
                            first_sentence = verdict_section.strip().split('!')[0]
                        if not first_sentence:
                            first_sentence = verdict_section.strip().split('\n')[0]

                        if first_sentence and len(first_sentence) > 5:
                            return first_sentence.strip(), f"Conclusion pattern: {pattern}"
            except Exception:
                pass

        return "Unrated", "No verdict found using any strategy"

    verdict, vctx = extract_verdict()

    # DOMAIN CLASSIFICATION
    tags = meta.get("tags", [])
    domain = classify_domain(tags, url, claim)
    print(f"[DEBUG] Domain classified as: {domain}")

    # SOCIAL LINKS EXTRACTION
    def extract_social_links():
        """Extract social links with enhanced error handling and deduplication"""
        try:
            links = []
            link_selectors = [
                "article a[href]",
                ".content a[href]",
                ".post-content a[href]",
                ".entry-content a[href]"
            ]

            for selector in link_selectors:
                for a in soup.select(selector):
                    href = a.get("href")
                    if href and is_social(href):
                        links.append(href)

            # Remove duplicates while preserving order
            unique_links = []
            seen = set()
            for link in links:
                if link not in seen:
                    unique_links.append(link)
                    seen.add(link)

            return unique_links
        except Exception as e:
            print(f"[WARNING] Link extraction failed: {e}")
            return []

    links = extract_social_links()

    # UPDATE METADATA
    meta["verdict_context"] = vctx
    meta["parsing_method"] = "enhanced_dubawa_extraction"
    meta["claim_extraction_success"] = claim != "Claim not found"
    meta["verdict_extraction_success"] = verdict != "Unrated"
    meta["social_links_count"] = len(links)

    print(f"[DEBUG] Extracted Claim: {claim[:100]}...")
    print(f"[DEBUG] Extracted Verdict: {verdict}")
    print(f"[DEBUG] Language: {lang_code}, Domain: {domain}")
    print(f"[DEBUG] Author: {meta.get('primary_author', 'Not found')}")
    print(f"[DEBUG] Links found: {len(links)}")
    print(f"[DEBUG] Images found: {len(meta.get('image_urls', []))}")

    # BUILD AND RETURN RECORD
    try:
        record = build_record("Dubawa", url, claim, verdict, links, meta, body_text, domain, claim_language=lang_code)
        return [record]
    except Exception as e:
        print(f"[ERROR] Record building failed: {e}")
        # Fallback record
        return [{
            "error": f"Record building failed: {e}",
            "url": url,
            "claim": claim,
            "verdict": verdict,
            "domain": domain,
            "language": lang_code,
            "metadata": meta
        }]

# def iter_dubawa_article_urls(name: str, max_pages: int, cache: dict):
#     """A stateful iterator that manages its own pagination via the cache."""
#     root = "https://dubawa.org"
#     start_page = cache.get("iter_state", {}).get(name, {}).get("page", 1)

#     for p in range(start_page, start_page + max_pages):
#         print(f"[DEBUG] Fetching Dubawa listing page: {p}")
#         try:
#             soup = get_soup(f"{root}/category/fact-check/page/{p}")
#             # After a successful fetch, save the *next* page number before processing.
#             # This ensures we don't re-crawl this listing page on restart.
#             cache.setdefault("iter_state", {}).setdefault(name, {})["page"] = p + 1
#             save_cache(cache)
#         except Exception as e:
#             print(f"[ERROR] Could not fetch listing page {p} for Dubawa: {e}")
#             break # Stop this platform if a listing page fails

#         urls_found = 0
#         for a in soup.select("h2.post-title a[href]"):
#             title = a.get_text(strip=True)
#             url = urljoin(root, a["href"])
#             if not (ROUNDUP_REGEX.search(title) or VIDEO_REGEX.search(title) or "/video" in url):
#                 urls_found += 1
#                 yield url

#         if urls_found == 0:
#             print(f"[INFO] No articles found on page {p}. Stopping pagination for {name}.")
#             break

# def parse_dubawa(url:str, lang_code: str = "en"):
#     print(f"[DEBUG] Parsing Dubawa article: {url} (Language hint: {lang_code})")
#     soup = get_soup(url)
#     article_element = soup.select_one("article")
#     body_text = article_element.get_text(" ", strip=True) if article_element else ""

#     links = [a["href"] for a in soup.select("article a[href]") if is_social(a["href"])]
#     meta = {
#         "published_at": (p.get("content") if (p := soup.find("meta", {"property": "article:published_time"})) else None),
#         "tags": [t.get_text(strip=True) for t in soup.select("a[rel='tag']")]
#     }

#     claim = soup.h1.get_text(strip=True) if soup.h1 else "Claim not found"
#     tags = meta.get("tags", [])
#     domain = classify_domain(tags, url, claim) # Classify the domain
#     verdict = "Unrated"
#     vctx = ""

#     if lang_code != "en":
#         # LRL Verdict Extraction Logic
#         lrl_verdict_keywords = {
#             "ha": {"Gaskiya": "True", "Karya": "False", "Rabin Gaskiya": "Half True", "Bata-gari": "Misleading"},
#             "yo": {"Òótọ́": "True", "Irọ́": "False", "Iro ni": "False", "Ooto": "True", "Òótọ́-òótọ́": "Mostly True", "Èké": "Misleading", "Asinilona": "Misleading", "Eke": "Misleading", "Aṣinilọna": "Misleading", "Beeni": "True", "Iro ni": "False"},
#             "ig": {"Eziokwu": "True", "Ụgha": "False", "N'ụzọ dị ukwuu eziokwu": "Mostly True", "Nduhie": "Misleading"}
#         }

#         if lang_code in lrl_verdict_keywords:
#             for keyword, mapped_verdict in lrl_verdict_keywords[lang_code].items():
#                 if re.search(f"\\b{keyword}\\b", body_text, re.IGNORECASE):
#                     verdict = mapped_verdict
#                     # Try to get context around the keyword
#                     match = re.search(f"(.{{0,50}}\\b{keyword}\\b.{{0,100}})", body_text, re.IGNORECASE)
#                     if match:
#                         vctx = match.group(1).strip()
#                     break
#         meta["verdict_context"] = vctx or "Verdict extracted using LRL keywords."
#         record = build_record("Dubawa", url, claim, verdict, links, meta, body_text, domain, claim_language=lang_code)
#     else:
#         # Existing logic for English pages
#         for strong in soup.select("p strong"):
#             txt = strong.get_text(strip=True).lower()
#             full = strong.parent.get_text(strip=True)
#             if txt.startswith("claim"):
#                 claim = full[len("claim:"):].strip(" –:-")
#             elif txt.startswith("verdict"):
#                 raw = full[len("verdict:"):].strip()
#                 if '.' in raw:
#                     verdict, _, vctx = raw.partition('.')
#                 else:
#                     verdict, _, vctx = raw.partition('!')
#                 verdict = verdict.strip().capitalize() or "Unrated"
#                 vctx = vctx.strip()

#         if not claim: claim = soup.h1.get_text(strip=True)
#         meta["verdict_context"] = vctx
#         record = build_record("Dubawa", url, claim, verdict, links, meta, body_text, domain)

#     return [record]

##### Platform: AfricaCheck **(africacheck.org)**

In [None]:
# --- HELPER FUNCTIONS (for AfricaCheck Parser) ---
from typing import Dict, Any, Optional

def _africacheck_normalize_verdict(text: str) -> str:
    """
    SIMPLIFIED verdict normalization for AfricaCheck's actual rating system.

    Based on research, AfricaCheck uses these main verdicts:
    - Correct
    - Incorrect
    - Misleading
    - Unproven
    - Mostly Correct
    """
    if not text or not isinstance(text, str):
        return "Unrated"

    # Clean and normalize text
    text_clean = re.sub(r'\s+', ' ', text.strip().lower())

    if len(text_clean) < 2:
        return "Unrated"

    # Direct matches for AfricaCheck's exact terms (case-insensitive)
    if text_clean == "correct":
        return "True"
    elif text_clean == "incorrect":
        return "False"
    elif text_clean == "misleading":
        return "Misleading"
    elif text_clean == "unproven":
        return "False"
    elif text_clean == "mostly correct":
        return "True"

    # Pattern matching for variations (simplified)
    if re.search(r'\b(mostly|largely)\s+(correct|true|accurate)\b', text_clean):
        return "True"
    elif re.search(r'\b(correct|true|accurate|right)\b', text_clean):
        return "True"
    elif re.search(r'\b(incorrect|false|wrong|fake)\b', text_clean):
        return "False"
    elif re.search(r'\bmisleading\b', text_clean):
        return "Misleading"
    elif re.search(r'\b(unproven|unverified|cannot.*verif|insufficient.*evidence)\b', text_clean):
        return "False"

    return "Unrated"


def _africacheck_extract_metadata(soup, url: str = None) -> Dict[str, Any]:
    """
    Comprehensively extracts metadata from Africa Check web pages with robust
    error handling and Nigeria-focused country detection.

    Args:
        soup: BeautifulSoup object of the parsed HTML page
        url (str, optional): URL of the page for country detection and URL resolution

    Returns:
        Dict[str, Any]: Comprehensive metadata dictionary with validated fields

    Extracted Metadata Fields:
        Core Information:
        - title, language, country, content_type
        - extraction_timestamp, source_url

        Publication Details:
        - published_at, modified_at, author,
        - word_count, schema_org_data

        Content Classification:
        - tags, categories, verdict, claim (for fact-checks)

        Media Assets:
        - image_urls, featured_image, videos, social_media_links
    """
    if not soup:
        return {"error": "Invalid soup object provided"}

    metadata = {}

    # Nigeria-focused Country Detection
    def detect_nigeria(url_str: Optional[str], soup_obj) -> str:
        """
        Detect if content is Nigeria-focused, otherwise default to Nigeria.
        This simplified version focuses only on Nigeria detection.
        """
        # Nigeria indicators in URL
        nigeria_indicators = [
            "nigeria", "ng", "lagos", "/ng/",
            "nigerian", "naija", "abuja"
        ]

        if url_str:
            url_lower = url_str.lower()
            for indicator in nigeria_indicators:
                if indicator in url_lower:
                    return "NG"

        # Check content for Nigeria-specific terms
        try:
            page_text = soup_obj.get_text().lower()
            nigeria_terms = ["nigeria", "nigerian", "lagos", "abuja", "naira"]

            for term in nigeria_terms:
                if term in page_text:
                    return "NG"

            # Check meta tags
            for meta in soup_obj.find_all('meta'):
                content = str(meta.get('content', '')).lower()
                for indicator in nigeria_indicators:
                    if indicator in content:
                        return "NG"
        except Exception:
            pass

        # Default to Nigeria since that's the scope
        return "NG"

    def safe_extract(func, default=None):
        """Safely execute extraction function with error handling"""
        try:
            return func()
        except Exception:
            return default

    def normalize_url(url_str: str, base_url: str = None) -> Optional[str]:
        """Normalize and validate URLs"""
        if not url_str:
            return None
        try:
            if url_str.startswith(('http://', 'https://')):
                return url_str
            elif url_str.startswith('//'):
                return 'https:' + url_str
            elif base_url and url_str.startswith('/'):
                return urljoin(base_url, url_str)
            elif base_url:
                return urljoin(base_url, url_str)
            return url_str
        except Exception:
            return None

    # METADATA EXTRACTION

    # Basic Information - Nigeria focused
    metadata['country'] = detect_nigeria(url, soup)
    metadata['extraction_timestamp'] = datetime.datetime.now().isoformat()
    metadata['source_url'] = url

    # Title extraction
    metadata['title'] = safe_extract(
        lambda: soup.find('title').get_text(strip=True) if soup.find('title') else None
    )

    # Language detection (default to English for Nigeria)
    metadata['language'] = safe_extract(lambda: (
        soup.get('lang') or
        soup.find('html', {'lang': True}).get('lang')
    ), default='en')

    # Content Type Detection
    def detect_content_type():
        """Detect the type of Africa Check content"""
        title = (metadata.get('title') or '').lower()
        url_path = (url or '').lower()

        if any(term in title or term in url_path for term in ['fact-check', 'factcheck', '/reports/']):
            return 'fact-check'
        elif '/blog/' in url_path:
            return 'blog'
        elif '/guide' in url_path or 'guide:' in title:
            return 'guide'
        elif 'press-release' in title or '/press-release' in url_path:
            return 'press-release'
        return 'article'

    metadata['content_type'] = detect_content_type()

    # Publication Date Extraction with validation
    date_selectors = [
        'time[datetime]',
        'meta[property="article:published_time"]',
        'meta[name="publish-date"]',
        '.publish-date',
        '.date-published',
        '.field--name-post-date'
    ]

    def extract_and_validate_date(selector):
        """Extract and validate date from selector"""
        element = soup.select_one(selector)
        if not element:
            return None

        date_str = (element.get('datetime') or
                   element.get('content') or
                   element.get_text(strip=True))

        if not date_str:
            return None

        # Validate and normalize date format
        try:
            # Handle common date formats
            if 'T' in date_str:
                # ISO format
                clean_date = date_str.replace('Z', '+00:00') if date_str.endswith('Z') else date_str
                parsed_date = datetime.datetime.fromisoformat(clean_date[:19])
                return parsed_date.isoformat()
            else:
                # Try other formats
                for fmt in ['%Y-%m-%d', '%d %B %Y', '%B %d, %Y']:
                    try:
                        parsed_date = datetime.datetime.strptime(date_str, fmt)
                        return parsed_date.isoformat()
                    except ValueError:
                        continue
            return date_str  # Return original if parsing fails
        except Exception:
            return date_str

    for selector in date_selectors:
        if not metadata.get('published_at'):
            metadata['published_at'] = safe_extract(lambda: extract_and_validate_date(selector))

    # Modified date
    modified_selectors = ['meta[property="article:modified_time"]', 'time[class*="modified"]']
    for selector in modified_selectors:
        if not metadata.get('modified_at'):
            metadata['modified_at'] = safe_extract(lambda: extract_and_validate_date(selector))

    # Author Information
    author_selectors = [
        '.field--name-field-author a',
        '.author-name',
        '.by-author',
        'meta[name="author"]',
        'meta[property="article:author"]',
        '.field--name-uid a',
        'article .author'
    ]

    authors = []
    for selector in author_selectors:
        elements = soup.select(selector)
        for elem in elements:
            author_text = (elem.get('content') if elem.name == 'meta'
                          else elem.get_text(strip=True))
            if author_text and author_text not in authors:
                authors.append(author_text)

    metadata['author'] = authors[0] if len(authors) == 1 else authors if authors else None

    # Tags extraction (comprehensive)
    tag_selectors = [
        '.field--name-field-tags a',
        '.tags a',
        'meta[name="keywords"]',
        '.post-tags a',
        '.article-tags a'
    ]

    tags = set()
    for selector in tag_selectors:
        if 'meta' in selector:
            meta_elem = soup.select_one(selector)
            if meta_elem and meta_elem.get('content'):
                tags.update([t.strip() for t in meta_elem.get('content').split(',')])
        else:
            for elem in soup.select(selector):
                tag_text = elem.get_text(strip=True)
                if tag_text:
                    tags.add(tag_text)

    metadata['tags'] = list(tags) if tags else []

    # Categories
    category_selectors = ['.field--name-field-category a', '.categories a', '.article-category']
    categories = []
    for selector in category_selectors:
        for elem in soup.select(selector):
            cat_text = elem.get_text(strip=True)
            if cat_text and cat_text not in categories:
                categories.append(cat_text)

    metadata['categories'] = categories if categories else []

    # Fact-checking specific fields
    if metadata['content_type'] == 'fact-check':
        # Verdict extraction
        verdict_selectors = ['.verdict', '.fact-check-verdict', '.rating', '.field--name-field-verdict']
        for selector in verdict_selectors:
            if not metadata.get('verdict'):
                metadata['verdict'] = safe_extract(
                    lambda: soup.select_one(selector).get_text(strip=True)
                )

        # Claim extraction
        claim_selectors = ['.claim', '.fact-check-claim', '.field--name-field-claim']
        for selector in claim_selectors:
            if not metadata.get('claim'):
                metadata['claim'] = safe_extract(
                    lambda: soup.select_one(selector).get_text(strip=True)
                )

    # Comprehensive Image Extraction
    image_selectors = [
        'article img[src]',
        '.field--name-field-image img[src]',
        '.featured-image img[src]',
        'img[src]',
        'meta[property="og:image"]',
        'meta[name="twitter:image"]'
    ]

    image_urls = set()

    for selector in image_selectors:
        if 'meta' in selector:
            meta_elem = soup.select_one(selector)
            if meta_elem and meta_elem.get('content'):
                normalized = normalize_url(meta_elem.get('content'), url)
                if normalized:
                    image_urls.add(normalized)
        else:
            for img in soup.select(selector):
                src = img.get('src') or img.get('data-src')
                if src:
                    normalized = normalize_url(src, url)
                    if normalized:
                        image_urls.add(normalized)

    metadata['image_urls'] = list(image_urls)

    # Featured Image
    featured_selectors = ['meta[property="og:image"]', '.featured-image img[src]']
    for selector in featured_selectors:
        if not metadata.get('featured_image'):
            if 'meta' in selector:
                elem = soup.select_one(selector)
                if elem:
                    metadata['featured_image'] = normalize_url(elem.get('content'), url)
            else:
                img = soup.select_one(selector)
                if img:
                    metadata['featured_image'] = normalize_url(img.get('src'), url)

    # Video URLs
    video_selectors = ['video[src]', 'iframe[src*="youtube"]', 'iframe[src*="vimeo"]']
    videos = []
    for selector in video_selectors:
        for elem in soup.select(selector):
            src = elem.get('src')
            if src:
                normalized = normalize_url(src, url)
                if normalized:
                    videos.append(normalized)

    metadata['videos'] = videos if videos else []

    # Word count
    article_content = soup.select_one('article, .content, .post-content, main')
    if article_content:
        text_content = article_content.get_text(strip=True)
        metadata['word_count'] = len(text_content.split()) if text_content else 0
    else:
        metadata['word_count'] = 0

    # Social media links
    social_selectors = [
        'a[href*="twitter.com"]', 'a[href*="facebook.com"]',
        'a[href*="linkedin.com"]', 'a[href*="instagram.com"]'
    ]

    social_links = {}
    for selector in social_selectors:
        platform = selector.split('"')[1].split('.')[0]
        links = [a.get('href') for a in soup.select(selector) if a.get('href')]
        if links:
            social_links[platform] = links

    metadata['social_media_links'] = social_links if social_links else {}

    # Clean up None values and empty collections
    cleaned_metadata = {}
    for key, value in metadata.items():
        if value is not None and value != [] and value != {}:
            cleaned_metadata[key] = value

    return cleaned_metadata

# --- MAIN ITERATOR & PARSER ---
def iter_africacheck_article_urls(name: str, max_pages: int, cache: dict):
    """Yield Nigeria-related Africa Check fact-check URLs, resuming from cached state.

    Pagination state is kept under ``cache["iter_state"][name]``:
    ``"page"`` is the next listing page to fetch and ``"empty_pages"`` is the
    running count of consecutive listing pages with no matching article. Both
    are written back through ``save_cache`` as the crawl progresses.

    Args:
        name: Platform key used to namespace the pagination state in ``cache``.
        max_pages: Maximum number of listing pages to fetch in this call.
        cache: Mutable cache dictionary; updated in place.

    Yields:
        Absolute article URLs (str) in the order they appear on each listing
        page, without repeats within a page.

    Iteration stops early when a listing page cannot be fetched or after three
    consecutive listing pages without a Nigeria-related article.
    """
    root = "https://africacheck.org"
    start_page = cache.get("iter_state", {}).get(name, {}).get("page", 0)

    # keywords
    essential_keywords = [
        # Core locations
        "nigeria", "nigerian", "abuja", "lagos", "kano", "kaduna", "port harcourt", "ibadan",
        # Political figures
        "buhari", "tinubu", "osinbajo", "sanwo-olu", "el-rufai", "wike", "atiku", "obi",
        # Key institutions
        "cbn", "central bank", "nnpc", "inec", "efcc", "presidency", "aso rock",
        # Economic/cultural
        "naira", "nollywood", "afrobeats"
    ]

    # Match keywords as whole words (optionally pluralised) rather than as raw
    # substrings: short entries such as "obi", "kano" or "cbn" otherwise fire
    # inside unrelated words like "Nairobi" or "mobile".
    keyword_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(keyword) for keyword in essential_keywords) + r")s?\b"
    )

    def enhanced_nigeria_check(container_text: str) -> bool:
        """Return True when the text mentions a Nigeria keyword and is not about another country only."""
        # Collapse whitespace so multi-word keywords match across line breaks.
        text_lower = re.sub(r"\s+", " ", container_text.lower())

        # Filter out other African countries if Nigeria not mentioned
        other_countries = ["south africa", "kenya", "ghana", "senegal"]
        if "nigeria" not in text_lower and any(country in text_lower for country in other_countries):
            return False

        return keyword_pattern.search(text_lower) is not None

    # Smart Pagination State
    consecutive_empty_pages = cache.get("iter_state", {}).get(name, {}).get("empty_pages", 0)

    for p in range(start_page, start_page + max_pages):
        print(f"[DEBUG] Fetching Africa Check listing page: {p}")
        try:
            soup = get_soup(f"{root}/fact-checks?page={p}")
            # Record the next page right after a successful fetch so that a
            # restart does not re-fetch this listing page.
            cache.setdefault("iter_state", {}).setdefault(name, {})["page"] = p + 1
            save_cache(cache)
        except Exception as e:
            print(f"[ERROR] Could not fetch listing page {p} for Africa Check: {e}")
            break

        # dict used as an ordered set: dedupes while keeping page order, so the
        # yield order is the same on every run.
        article_links = {}

        # Container Processing with Error Handling
        containers_found = 0
        for container in soup.select("article, .card, .promo"):
            try:
                containers_found += 1
                # Join text nodes with a space; without a separator, words from
                # adjacent tags are glued together ("South" + "Africa").
                container_text = container.get_text(" ", strip=True)

                if not enhanced_nigeria_check(container_text):
                    continue

                link_tag = container.select_one("a[href]")
                if not link_tag:
                    continue

                href = link_tag.get('href', '')
                if '/fact-checks/' not in href:
                    continue

                full_url = urljoin(root, href)
                link_title = link_tag.get_text(strip=True)
                # Skip video items
                if not (VIDEO_REGEX.search(link_title) or "/video" in full_url):
                    article_links[full_url] = None
                    print(f"[FOUND] Nigeria article: {link_title[:50]}...")

            except Exception as e:
                print(f"[WARNING] Error processing container: {e}")
                continue

        print(f"[DEBUG] Page {p}: Processed {containers_found} containers, found {len(article_links)} Nigeria articles")

        # Smart Pagination Logic
        if not article_links:
            consecutive_empty_pages += 1
            print(f"[INFO] No Nigeria articles on page {p}. Empty pages: {consecutive_empty_pages}/3")
        else:
            consecutive_empty_pages = 0  # Reset counter when content found

        # Update cache with empty page tracking
        cache.setdefault("iter_state", {}).setdefault(name, {})["empty_pages"] = consecutive_empty_pages
        save_cache(cache)

        # Stop after 3 consecutive empty pages instead of 1
        if consecutive_empty_pages >= 3:
            print(f"[INFO] Stopping pagination after {consecutive_empty_pages} consecutive empty pages for {name}.")
            break

        for url in article_links:
            yield url


def parse_africacheck(url: str, lang_code: str = "en"):
    """
    Enhanced AfricaCheck Parser with comprehensive claim and verdict extraction,
    robust error handling, and multilingual support.
    """
    print(f"[DEBUG] Parsing Africa Check article: {url} (Language: {lang_code})")

    try:
        soup = get_soup(url)
        if not soup:
            print(f"[ERROR] Failed to get soup for {url}")
            return [{"error": "Failed to fetch HTML", "url": url}]
    except Exception as e:
        print(f"[ERROR] Failed to fetch article: {e}")
        return [{"error": f"Fetch failed: {e}", "url": url}]

    # Step 1. EXTRACT METADATA FIRST
    try:
        meta = _africacheck_extract_metadata(soup, url)
        print(f"[DEBUG] Extracted metadata with {len(meta)} fields")
        # Ensure language is included in metadata
        meta['language'] = lang_code # Use the passed lang_code
    except Exception as e:
        print(f"[WARNING] Metadata extraction failed: {e}")
        meta = {"extraction_error": str(e), "language": lang_code} # Ensure language is present even on metadata error

    # Step 2. BODY TEXT EXTRACTION
    def extract_body_text():
        """Extract, clean, and truncate body text with multiple fallback selectors."""

        # Define the text where you want to cut off the content
        cutoff_phrase = "You can republish the text of this article free of charge"

        body_selectors = [
            "article",
            ".content-body",
            ".article-content",
            ".post-content",
            ".entry-content",
            "main"
        ]

        for selector in body_selectors:
            element = soup.select_one(selector)
            if element:
                # 1. Extract and perform initial cleaning
                text = element.get_text(" ", strip=True)
                cleaned_text = text.replace('\xa0', ' ').replace('\u200b', '')

                # 2. Split the text at the cutoff phrase and keep the first part
                # The .split(cutoff_phrase)[0] gets everything before the phrase.
                # The .strip() removes any trailing whitespace.
                final_text = cleaned_text.split(cutoff_phrase)[0].strip()

                if len(final_text) > 100:  # Ensure substantial content remains
                    return final_text

        # Final fallback for the whole page if no specific element is found
        fallback_text = soup.get_text(" ", strip=True) if soup else ""
        cleaned_fallback = fallback_text.replace('\xa0', ' ').replace('\u200b', '')

        # Apply the same cutoff logic to the fallback text
        final_fallback_text = cleaned_fallback.split(cutoff_phrase)[0].strip()

        return final_fallback_text

    # Your existing code to call the function
    try:
        # This function now returns the truncated text
        body_text = extract_body_text()
        print(f"[DEBUG] Extracted body text: {len(body_text)} characters")
    except Exception as e:
        print(f"[WARNING] Body extraction failed: {e}")
        body_text = ""

    # Step 3. CLAIM EXTRACTION
    def extract_claim():
        """Enhanced claim extraction with multiple strategies"""

        # Strategy 1: Look for claim-specific elements first
        claim_selectors = [
            '.claim',                    # Direct claim class
            '.fact-check-claim',         # Fact-check specific claim
            '.statement',               # Statement being checked
            '[data-claim]',             # Data attribute
            '.quote',                   # Quoted claim
            '.field--name-field-claim', # Field-specific claim
            'blockquote'                # Often claims are in blockquotes
        ]

        for selector in claim_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    claim_text = element.get_text(strip=True)
                    if len(claim_text) > 10:  # Ensure substantial claim
                        print(f"[DEBUG] Claim found via selector '{selector}'")
                        return claim_text
            except Exception:
                continue

        # Strategy 2: Extract from meta tags
        claim_meta_selectors = [
            'meta[name="claim"]',
            'meta[property="claim"]',
            'meta[name="description"]'  # Sometimes claim is in description
        ]

        for selector in claim_meta_selectors:
            try:
                meta_elem = soup.select_one(selector)
                if meta_elem and meta_elem.get('content'):
                    content = meta_elem.get('content').strip()
                    if len(content) > 20:
                        print(f"[DEBUG] Claim found in meta tag")
                        return content
            except Exception:
                continue

        # Strategy 3: Check if claim is in extracted metadata
        if isinstance(meta, dict) and 'claim' in meta and meta['claim']:
            print(f"[DEBUG] Claim found in metadata")
            return meta['claim']

        # Strategy 4: H1 fallback
        try:
            if soup.h1:
                h1_text = soup.h1.get_text(strip=True)
                # Skip if H1 looks like a generic title rather than claim
                title_indicators = ['fact check:', 'analysis:', 'investigation:', 'report:']
                if not any(indicator in h1_text.lower() for indicator in title_indicators):
                    print(f"[DEBUG] Using H1 as claim")
                    return h1_text
        except Exception:
            pass

        # Strategy 5: Look for first substantial paragraph as last resort
        try:
            first_para = soup.select_one('article p, .content p, p')
            if first_para:
                para_text = first_para.get_text(strip=True)
                if 30 < len(para_text) < 300:
                    print(f"[DEBUG] Using first paragraph as claim")
                    return para_text
        except Exception:
            pass

        return "Claim not found"

    try:
        claim = extract_claim()
    except Exception as e:
        print(f"[WARNING] Claim extraction failed: {e}")
        claim = "Claim extraction failed"

    # Step 4. VERDICT EXTRACTION
    def extract_verdict():
        """Enhanced verdict extraction with multilingual support"""

        verdict_context = ""

        # Strategy 1: Enhanced meta tags
        verdict_meta_selectors = [
            'meta[name="verdict"]',
            'meta[property="verdict"]',
            'meta[name="rating"]',
            'meta[property="rating"]',
            'meta[name="fact-check-rating"]'
        ]

        for selector in verdict_meta_selectors:
            try:
                meta_elem = soup.select_one(selector)
                if meta_elem and meta_elem.get('content'):
                    verdict_text = meta_elem.get('content').strip()
                    if verdict_text:
                        normalized = _africacheck_normalize_verdict(verdict_text)
                        if normalized != "Unrated":
                            print(f"[DEBUG] Verdict found in meta tag")
                            return normalized, f"Meta tag verdict: {verdict_text}"
            except Exception:
                continue

        # Strategy 2: Enhanced verdict elements
        verdict_selectors = [
            '.verdict', '.rating', '.fact-check-verdict', '.verdict-box',
            '.verdict-rating__rate',  # Original selector
            '.rating-result', '.fact-rating',
            '.field--name-field-verdict', '.verdict-text', '.rating-label',
            '[data-verdict]', '[data-rating]',
            '.ac-rating strong', # Specific Africa Check element
            '.factcheck-rating strong' # Another potential Africa Check element
        ]

        for selector in verdict_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    verdict_text = element.get_text(strip=True)
                    if verdict_text and len(verdict_text) < 100:  # Reasonable verdict length
                        normalized = _africacheck_normalize_verdict(verdict_text)
                        if normalized != "Unrated":
                            print(f"[DEBUG] Verdict found via element '{selector}'")
                            return normalized, f"Element verdict: {verdict_text}"
            except Exception:
                continue

        # Strategy 3: Check if verdict is in extracted metadata
        if isinstance(meta, dict) and 'verdict' in meta and meta['verdict']:
            try:
                normalized = _africacheck_normalize_verdict(meta['verdict'])
                if normalized != "Unrated":
                    print(f"[DEBUG] Verdict found in metadata")
                    return normalized, f"Metadata verdict: {meta['verdict']}"
            except Exception:
                pass

        # Strategy 4: Enhanced conclusion patterns (search within body text)
        conclusion_patterns = [
            "the verdict:", "our verdict:", "verdict:", "rating:",
            "in conclusion:", "conclusion:", "our rating:",
            "fact check verdict:", "our analysis:", "the rating:",
            "final verdict:", "overall:", "assessment:"
        ]

        body_lower = body_text.lower()
        for pattern in conclusion_patterns:
            if pattern in body_lower:
                verdict_section = body_lower.split(pattern, 1)[-1]
                # Extract first sentence or limited text after pattern
                first_sentence = verdict_section.strip().split('.')[0]
                if not first_sentence:
                    first_sentence = verdict_section.strip().split('!')[0]
                if not first_sentence:
                    first_sentence = verdict_section.strip().split('\n')[0]

                if first_sentence and len(first_sentence) > 5:
                    normalized = _africacheck_normalize_verdict(first_sentence)
                    if normalized != "Unrated":
                        return normalized, f"Conclusion pattern: {pattern} - {first_sentence[:100]}"


        # Strategy 5: Look for verdict in headings
        try:
            heading_selectors = ['h2', 'h3', 'h4', 'h5']
            for selector in heading_selectors:
                headings = soup.select(selector)
                for heading in headings:
                    heading_text = heading.get_text(strip=True).lower()
                    if any(word in heading_text for word in ['verdict', 'rating', 'conclusion']):
                        # Look for verdict in next sibling or within the heading itself
                        next_elem = heading.find_next_sibling()
                        if next_elem:
                            verdict_text = next_elem.get_text(strip=True)
                            normalized = _africacheck_normalize_verdict(verdict_text)
                            if normalized != "Unrated":
                                print(f"[DEBUG] Verdict found via heading's sibling")
                                return normalized, verdict_text[:200].strip()
                        # Check the heading text itself
                        normalized = _africacheck_normalize_verdict(heading_text)
                        if normalized != "Unrated":
                            print(f"[DEBUG] Verdict found in heading text")
                            return normalized, heading_text[:200].strip()

        except Exception:
            pass

        return "Unrated", "No verdict found using any strategy"

    try:
        verdict, vctx = extract_verdict()
    except Exception as e:
        print(f"[WARNING] Verdict extraction failed: {e}")
        verdict, vctx = "Unrated", f"Extraction failed: {e}"

    # Step 5. DOMAIN CLASSIFICATION
    try:
        # tags = meta.get("tags", []) if isinstance(meta, dict) else []
        categories = meta.get("categories", []) if isinstance(meta, dict) else []
        domain = classify_domain(categories, url, claim)
        print(f"[DEBUG] Domain classified as: {domain}")
    except Exception as e:
        print(f"[WARNING] Domain classification failed: {e}")
        domain = "Other"

    # Step 6. ENHANCED LINK EXTRACTION
    def extract_links():
        """Gather the hrefs of social links found inside the article/content area.

        Any failure is logged as a warning and an empty list is returned instead.
        """
        try:
            anchors = soup.select("article a[href], .content a[href]")
            candidate_hrefs = (anchor.get("href") for anchor in anchors)
            # Keep only non-empty targets that is_social() accepts.
            return [target for target in candidate_hrefs if target and is_social(target)]
        except Exception as e:
            print(f"[WARNING] Link extraction failed: {e}")
            return []

    links = extract_links()

    # Step 7. UPDATE METADATA AND BUILD RECORD
    try:
        # Safely update metadata
        if isinstance(meta, dict):
            meta["verdict_context"] = vctx
            # meta["language"] is already set in _africacheck_extract_metadata
            meta["parsing_method"] = "enhanced_extraction"
        else:
            # This case should be less likely now with the initial meta extraction handling
            meta = {
                "verdict_context": vctx,
                "language": lang_code, # Fallback language
                "parsing_method": "enhanced_extraction_fallback"
            }
    except Exception as e:
        print(f"[WARNING] Metadata update failed: {e}")

    print(f"[DEBUG] Extracted Claim: {claim[:100]}...")
    print(f"[DEBUG] Extracted Verdict: {verdict}")
    print(f"[DEBUG] Language: {meta.get('language', 'Unknown')}, Domain: {domain}") # Print language from meta
    print(f"[DEBUG] Links found: {len(links)}")

    # Step 8. BUILD AND RETURN RECORD
    try:
        # Use the language from the extracted meta or the passed lang_code as fallback
        record_language = meta.get('language', lang_code)
        record = build_record("Africa Check-NG", url, claim, verdict, links, meta, body_text, domain, claim_language=record_language)
        return [record]
    except Exception as e:
        print(f"[ERROR] Record building failed: {e}")
        # Fallback record structure
        return [{
            "error": f"Record building failed: {e}",
            "url": url,
            "claim": claim,
            "verdict": verdict,
            "domain": domain,
            "language": meta.get('language', lang_code if lang_code else "en") # Use language from meta or fallback
        }]

##### Platform: The FactCheckHub **(factcheckhub.com)**

In [None]:
# --- HELPER FUNCTIONS (for FactCheckHub Parser) ---
def _fch_normalize_verdict(text: str) -> str:
    """Normalizes FactCheckHub verdict text to a standard category."""
    text_lower = text.lower().strip()
    verdict_map = {
        "false": "False", "true": "True", "mostly true": "Mostly True",
        "mostly false": "Mostly False", "half true": "Half True",
        "misleading": "Misleading", "miscaptioned": "Misleading",
        "unproven": "Unproven", "inconclusive": "Unproven",
        # Add Hausa keywords
        "karya": "False", "gaskiya": "True", "rabin gaskiya": "Half True",
        "bata-gari": "Misleading", "bata gari": "Misleading",
        "labarin karya": "False", "labaran karya": "False",
        "gaskiyar lamari": "True", "ba gaskiya ba": "False",
        "karya ne": "False", "gaskiya ne": "True",
        "BINCIKE": "Investigated", "An binciki": "Investigated",
        "sahih": "True", "banza": "False"
    }
    for key, value in verdict_map.items():
        if key in text_lower:
            return value
    return "Unrated"


def _fch_extract_author_info(soup):
    """Extracts comprehensive author information from FactCheckHub articles."""
    author_info = {
        "name": None,
        "email": None,
        "twitter": None,
        "bio": None,
        "profile_url": None,
        "posts_count": None
    }

    # Try multiple selectors for author information
    author_selectors = [
        ".td-post-author-name a",
        ".author-name a",
        ".entry-author-display-name",
        "a[rel='author']",
        ".td-author-name a"
    ]

    author_element = None
    for selector in author_selectors:
        author_element = soup.select_one(selector)
        if author_element:
            break

    if author_element:
        author_info["name"] = author_element.get_text(strip=True)
        author_info["profile_url"] = urljoin("https://factcheckhub.com", author_element.get("href", ""))

    # Alternative: look for author in byline or post meta
    if not author_info["name"]:
        # Try to find author in byline patterns
        byline_patterns = [
            r"by\\s+([A-Za-z\\s]+?)(?:\\s*\\||$)",
            r"author:\\s*([A-Za-z\\s]+?)(?:\\s*\\||$)",
            r"written\\s+by\\s+([A-Za-z\\s]+?)(?:\\s*\\||$)"
        ]

        text_content = soup.get_text()
        for pattern in byline_patterns:
            match = re.search(pattern, text_content, re.IGNORECASE)
            if match:
                author_info["name"] = match.group(1).strip()
                break

    # Extract email if present in author bio section
    author_bio_element = soup.select_one(".td-author-description, .author-bio, .author-description")
    if author_bio_element:
        author_info["bio"] = author_bio_element.get_text(strip=True)
        # Look for email in bio
        email_match = re.search(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,})', author_info["bio"])
        if email_match:
            author_info["email"] = email_match.group(1)

        # Look for Twitter handle
        twitter_match = re.search(r'@([A-Za-z0-9_]+)', author_info["bio"])
        if twitter_match:
            author_info["twitter"] = twitter_match.group(1)

    return author_info


def _fch_extract_metadata(soup):
    """Extracts enhanced metadata for FactCheckHub articles."""
    metadata = {
        "image_urls": [urljoin("https://factcheckhub.com", img.get("src", ""))
                      for img in soup.select(".td-post-content img[src], .entry-content img[src]")
                      if img.get("src")],
        "country": "NG",
        "published_at": None,
        "tags": [],
        "categories": [],
        "reading_time": None,
        "verdict_context": "",
        "language_detected": "en"  # Default to English
    }

    # Extract publish date with multiple selectors
    date_selectors = [
        "time.entry-date",
        ".td-post-date time",
        ".published-date",
        "time[datetime]",
        ".entry-date"
    ]

    for selector in date_selectors:
        date_element = soup.select_one(selector)
        if date_element:
            datetime_attr = date_element.get("datetime")
            if datetime_attr:
                metadata["published_at"] = datetime_attr
                break
            # Fallback to text content parsing
            date_text = date_element.get_text(strip=True)
            if date_text:
                # Try to parse various date formats
                try:
                    from datetime import datetime
                    # Common formats used by FactCheckHub
                    date_formats = [
                        "%B %d, %Y",  # August 21, 2025
                        "%Y-%m-%d",   # 2025-08-21
                        "%d/%m/%Y",   # 21/08/2025
                    ]
                    for fmt in date_formats:
                        try:
                            parsed_date = datetime.strptime(date_text, fmt)
                            metadata["published_at"] = parsed_date.isoformat()
                            break
                        except ValueError:
                            continue
                except ImportError:
                    pass
                break

    # Extract tags and categories
    tag_selectors = [
        "a[rel='category tag']",
        ".td-post-category a",
        ".entry-categories a",
        ".post-categories a"
    ]

    for selector in tag_selectors:
        tags = soup.select(selector)
        if tags:
            metadata["tags"] = [tag.get_text(strip=True) for tag in tags]
            break

    # Extract categories (broader classification)
    category_selectors = [
        ".td-category a",
        ".entry-category a",
        ".post-category a"
    ]

    for selector in category_selectors:
        categories = soup.select(selector)
        if categories:
            metadata["categories"] = [cat.get_text(strip=True) for cat in categories]
            break

    # Extract reading time if available
    reading_time_element = soup.select_one(".reading-time, .read-time, .eta")
    if reading_time_element:
        reading_time_text = reading_time_element.get_text(strip=True)
        time_match = re.search(r'(\\d+)', reading_time_text)
        if time_match:
            metadata["reading_time"] = int(time_match.group(1))

    # Detect language based on content
    body_text = soup.get_text()
    if any(word in body_text.lower() for word in ["gaskiya", "karya", "bincike", "labarin"]):
        metadata["language_detected"] = "ha"  # Hausa
    elif any(word in body_text.lower() for word in ["òótọ́", "irọ́", "èké"]):
        metadata["language_detected"] = "yo"  # Yoruba
    elif any(word in body_text.lower() for word in ["eziokwu", "ụgha", "nduhie"]):
        metadata["language_detected"] = "ig"  # Igbo

    return metadata


# --- ENHANCED CATEGORY-BASED ITERATOR ---
def iter_fch_article_urls_by_category(name: str, max_pages: int, cache: dict, categories=None):
    """
    Crawl FactCheckHub category listing pages and yield one dict per article link.

    Per-category progress (the next page to fetch and the current run of
    consecutive empty/404 pages) is kept in ``cache`` under
    ``"{name}_category_state"`` and written out through ``save_cache`` so that
    a later call resumes where this one stopped. A category stops being
    crawled after 3 consecutive pages that are missing (404) or contain no
    article links.

    Args:
        name: Parser identifier
        max_pages: Maximum pages to crawl per category
        cache: Cache dictionary for state management
        categories: List of categories to crawl. If None, uses default categories.

    Yields:
        dict with keys ``url`` (absolute), ``title``, ``category`` and ``page``.
        Each URL is yielded at most once per listing page.
    """
    root = "https://factcheckhub.com"
    max_empty_pages = 3  # Stop after 3 consecutive empty pages

    # Default categories based on FactCheckHub structure
    if categories is None:
        categories = [
            "general", "news", "politics", "elections", "economy",
            "security", "health", "government-policy", "covid-19", "insights"
        ]

    cache_key = f"{name}_category_state"
    category_states = cache.get(cache_key, {})
    if not isinstance(category_states, dict):
        category_states = {}

    # Selectors for different page layouts. They overlap on purpose, which is
    # why links are de-duplicated per page below.
    article_selectors = [
        "h3.entry-title a",
        ".td-module-thumb a",
        ".item-details > h3 a",
        ".td-module-title a",
        ".entry-title a",
        ".post-title a",
        "h2.entry-title a",
        ".td-block-span12 .td-module-title a"
    ]

    def persist_state(category_name, category_state):
        """Write one category's progress into the cache and save it."""
        category_states[category_name] = category_state
        cache[cache_key] = category_states
        save_cache(cache)

    for category in categories:
        print(f"[DEBUG] Processing category: {category}")

        # Ensure the state for this category is a dictionary
        if not isinstance(category_states.get(category), dict):
            category_states[category] = {"page": 1, "empty_pages": 0}

        state = category_states[category]
        current_page = state.get("page", 1)
        consecutive_empty_pages = state.get("empty_pages", 0)

        pages_crawled = 0

        # range() already bounds the loop to max_pages iterations.
        for page in range(current_page, current_page + max_pages):
            if consecutive_empty_pages >= max_empty_pages:
                print(f"[INFO] Stopping pagination for {category} after {max_empty_pages} consecutive empty pages.")
                break

            category_url = f"{root}/category/{category}/page/{page}/"
            print(f"[DEBUG] Fetching {category} category page: {page} - {category_url}")

            try:
                soup = get_soup(category_url)
                # Advance the resume pointer as soon as the page is fetched.
                # The empty-page counter is deliberately NOT reset here: a page
                # that loads but lists no articles must still count as empty,
                # otherwise the stop threshold can never be reached.
                state["page"] = page + 1
                persist_state(category, state)

            except requests.exceptions.HTTPError as e:
                # e.response can be None, so read the status defensively.
                status_code = getattr(e.response, "status_code", None)
                if status_code == 404:
                    print(f"[INFO] Page {page} not found (404) for {category}. Incrementing empty page counter.")
                    consecutive_empty_pages += 1
                    state["empty_pages"] = consecutive_empty_pages
                    persist_state(category, state)
                    if consecutive_empty_pages >= max_empty_pages:
                        print(f"[INFO] Stopping pagination for {category} after {max_empty_pages} consecutive 404 pages.")
                        break
                    continue  # Skip to next page if 404

                print(f"[ERROR] Could not fetch {category} page {page}: {e}")
                # Give up on the category if its very first page fails;
                # otherwise try the following pages.
                if pages_crawled == 0:
                    break
                continue

            except Exception as e:
                print(f"[ERROR] Could not fetch {category} page {page}: {e}")
                if pages_crawled == 0:  # First page failed
                    break
                continue

            urls_found = 0
            seen_on_page = set()

            for selector in article_selectors:
                for link in soup.select(selector):
                    href = link.get("href")
                    if not href:
                        continue

                    url = urljoin(root, href)
                    if url in seen_on_page:
                        continue
                    # Record the URL before filtering, so a video rejected by
                    # its title is not re-admitted through an untitled
                    # thumbnail link to the same article.
                    seen_on_page.add(url)

                    title = link.get_text(strip=True)
                    if VIDEO_REGEX.search(title) or "/video" in href:
                        continue

                    urls_found += 1
                    yield {
                        "url": url,
                        "title": title,
                        "category": category,
                        "page": page
                    }

            pages_crawled += 1

            if urls_found == 0:
                # Page loaded but listed nothing: counts towards the stop threshold.
                consecutive_empty_pages += 1
                state["empty_pages"] = consecutive_empty_pages
                persist_state(category, state)
                print(f"[INFO] No articles found on {category} page {page}. Empty pages: {consecutive_empty_pages}/{max_empty_pages}.")
                if consecutive_empty_pages >= max_empty_pages:
                    print(f"[INFO] Stopping pagination for {category} after {max_empty_pages} consecutive empty pages.")
                    break
            elif consecutive_empty_pages:
                # Articles found: the run of empty pages is broken.
                consecutive_empty_pages = 0
                state["empty_pages"] = 0
                persist_state(category, state)

            print(f"[DEBUG] Found {urls_found} articles on {category} page {page}")


def iter_fch_article_urls(name: str, max_pages: int, cache: dict):
    """
    URL-only view over the category-based crawler.

    Kept so callers written against the older iterator, which produced plain
    URL strings, keep working unchanged.
    """
    article_stream = iter_fch_article_urls_by_category(name, max_pages, cache)
    yield from (entry["url"] for entry in article_stream)


def parse_fch(url: str, lang_code: str = "en"):
    """Fetch one FactCheckHub article and convert it into a fact-check record.

    The claim is taken from a "CLAIM ..." label in the article body (falling
    back to the page title). The verdict is derived from labelled sections of
    the body: English articles go through ``_fch_normalize_verdict``, while
    Hausa/Yoruba/Igbo articles use the per-language keyword tables below.

    Args:
        url: Article URL.
        lang_code: Language hint ("en", "ha", "yo" or "ig"). Any other
            non-"en" value leaves the verdict "Unrated".

    Returns:
        A one-element list holding the value returned by ``build_record``, or
        an empty list when the page cannot be fetched or has no article body.
    """
    print(f"[DEBUG] Parsing FactCheckHub article: {url} (Language hint: {lang_code})")

    try:
        soup = get_soup(url)
    except Exception as e:
        print(f"[ERROR] Failed to fetch {url}: {e}")
        return []

    # Extract main article content
    article_content = soup.select_one(".td-post-content, .entry-content, .post-content")
    body_text = article_content.get_text(" ", strip=True) if article_content else ""

    if not body_text:
        print(f"[WARNING] No content found for {url}")
        return []

    # Flags shared by every label-based search below.
    label_flags = re.DOTALL | re.IGNORECASE

    # NOTE: all patterns are raw strings with SINGLE backslashes, so "\s",
    # "\n", "\." and "\b" reach the regex engine as whitespace / newline /
    # literal dot / word boundary rather than as a literal backslash + letter.
    claim = ""
    claim_patterns = [
        r"\bCLAIM[:\s–-]+(.*?)(?:THE VERDICT|THE FINDINGS|VERIFICATION|FACT CHECK)",
        r"\bCLAIM[:\s–-]+(.*?)(?:\n\s*\n|\n.*?:)",  # Claim followed by new line
        r"(?:^|\n)\s*CLAIM[:\s–-]+(.*?)(?:\n|\.|$)"
    ]

    for pattern in claim_patterns:
        claim_match = re.search(pattern, body_text, label_flags)
        if claim_match:
            candidate_claim = claim_match.group(1).strip()
            if candidate_claim:  # an empty capture is not a usable claim
                claim = candidate_claim
                break

    # Fallback to title if no claim found
    if not claim:
        title_element = soup.select_one("h1.entry-title, h1.post-title, h1")
        claim = title_element.get_text(strip=True) if title_element else "Claim not found"

    # Extract tags for domain classification
    tags = [t.get_text(strip=True) for t in soup.select("a[rel='category tag']")]
    domain = classify_domain(tags, url, claim)

    verdict = "Unrated"
    vctx = ""

    # Multi-language verdict extraction
    if lang_code != "en":
        lrl_verdict_keywords = {
            "ha": {
                # Hausa verdict keywords
                "Gaskiya": "True", "Karya": "False", "Rabin Gaskiya": "Half True",
                "Bata-gari": "Misleading", "Bata gari": "Misleading",
                "labarin karya": "False", "labaran karya": "False",
                "gaskiyar lamari": "True", "ba gaskiya ba": "False",
                "karya ne": "False", "gaskiya ne": "True",
                "BINCIKE": "Investigated", "An binciki": "Investigated",
                "sahih": "True", "banza": "False"
            },
            "yo": {
                # Yoruba verdict keywords
                "Òótọ́": "True", "Irọ́": "False", "Òótọ́-òótọ́": "Mostly True",
                "Èké": "Misleading", "òòtọ́": "True", "irọ́": "False",
                "eke": "Misleading", "iro": "False", "otito": "True"
            },
            "ig": {
                # Igbo verdict keywords
                "Eziokwu": "True", "Ụgha": "False", "N'ụzọ dị ukwuu eziokwu": "Mostly True",
                "Nduhie": "Misleading", "eziokwu": "True", "ụgha": "False",
                "ugha": "False", "ezigbo eziokwu": "True"
            }
        }

        if lang_code in lrl_verdict_keywords:
            # Try real verdicts before the weak "Investigated" marker, and long
            # phrases before the short keywords they contain ("ba gaskiya ba"
            # before "Gaskiya").
            keyword_items = sorted(
                lrl_verdict_keywords[lang_code].items(),
                key=lambda item: (item[1] == "Investigated", -len(item[0])),
            )

            # Strategy 1: Look for structured verdict sections
            verdict_section_patterns = [
                r"\b(?:THE VERDICT|VERDICT|HUKUNCI|HUKUNCIN|CONCLUSION)[:\s–-]+(.*?)(?:\.|\n|$)",
                r"\b(?:FINDINGS|SAKAMAKON BINCIKE|BINCIKE|RESULT)[:\s–-]+(.*?)(?:\.|\n|$)",
                r"\b(?:FINAL|CONCLUSION|KASHE)[:\s–-]+(.*?)(?:\.|\n|$)"
            ]

            for pattern in verdict_section_patterns:
                # Examine every labelled section, not just the first hit.
                for verdict_match in re.finditer(pattern, body_text, label_flags):
                    verdict_text = verdict_match.group(1).strip()
                    for keyword, mapped_verdict in keyword_items:
                        # Look-arounds instead of "\b": keywords may end in a
                        # combining diacritic, which "\b" does not treat as
                        # part of a word.
                        if re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", verdict_text, re.IGNORECASE):
                            verdict = mapped_verdict
                            vctx = verdict_text[:200].strip()
                            break
                    if verdict != "Unrated":
                        break
                if verdict != "Unrated":
                    break

            # Strategy 2: Context-aware keyword search
            if verdict == "Unrated":
                body_lower = body_text.lower()
                for keyword, mapped_verdict in keyword_items:
                    keyword_lower = keyword.lower()
                    keyword_pos = body_lower.find(keyword_lower)
                    if keyword_pos == -1:
                        continue

                    # Get context around the (first) occurrence of the keyword
                    context_start = max(0, keyword_pos - 100)
                    context_end = min(len(body_text), keyword_pos + len(keyword) + 100)
                    context = body_text[context_start:context_end].strip()

                    # Accept only if the surrounding text looks like a verdict statement
                    if any(indicator in context.lower() for indicator in
                           ["verdict", "conclusion", "finding", "result", "hukunci", "sakamakon"]):
                        verdict = mapped_verdict
                        vctx = context
                        break

            # Strategy 3: Pattern-based inference for Hausa
            if verdict == "Unrated" and lang_code == "ha":
                body_lower = body_text.lower()
                if any(phrase in body_lower for phrase in
                       ["labaran karya", "karya ne", "ba gaskiya ba", "banza ne", "rudu ne"]):
                    verdict = "False"
                    vctx = "Inferred from Hausa false indicators"
                elif any(phrase in body_lower for phrase in
                         ["gaskiya ne", "gaskiyar lamari", "sahih ne", "daidai ne"]):
                    verdict = "True"
                    vctx = "Inferred from Hausa true indicators"
    else:
        # English verdict extraction from labelled sections
        verdict_patterns = [
            r"\b(?:THE VERDICT|VERDICT|CONCLUSION|RATING)[:\s–-]+(.*?)(?:\n|\.|$)",
            r"\b(?:FINDINGS|FACT CHECK RESULT|RESULT)[:\s–-]+(.*?)(?:\n|\.|$)",
            r"(?:^|\n)\s*(?:VERDICT|CONCLUSION)[:\s–-]+(.*?)(?:\n|\.|$)"
        ]

        for pattern in verdict_patterns:
            # Examine every labelled section: the first hit may be ordinary
            # prose ("the conclusion of the report ...") and not the verdict.
            for verdict_match in re.finditer(pattern, body_text, label_flags):
                verdict_text = verdict_match.group(1).strip()
                if not verdict_text:
                    continue
                candidate_verdict = _fch_normalize_verdict(verdict_text)
                if candidate_verdict != "Unrated":
                    verdict = candidate_verdict
                    vctx = verdict_text[:300].strip()
                    break
                if not vctx:
                    # Keep the first labelled text as context even when unrated.
                    vctx = verdict_text[:300].strip()
            if verdict != "Unrated":
                break

        # Alternative: Look for verdict in specific HTML elements
        if verdict == "Unrated":
            verdict_elements = soup.select("h3, h4, strong, b, .verdict, .conclusion")
            for element in verdict_elements:
                element_text = element.get_text(strip=True)
                if any(keyword in element_text.upper() for keyword in ["VERDICT", "CONCLUSION", "RATING"]):
                    next_element = element.find_next(['p', 'div', 'span'])
                    if next_element:
                        verdict_text = next_element.get_text(strip=True)
                        vctx = verdict_text[:300].strip()
                        verdict = _fch_normalize_verdict(verdict_text)
                        if verdict != "Unrated":
                            break

    print(f"[DEBUG] Extracted Claim: {claim[:100]}...")
    print(f"[DEBUG] Extracted Verdict: {verdict}")

    # Enhanced metadata extraction
    meta = _fch_extract_metadata(soup)
    meta["verdict_context"] = vctx

    # Extract author information
    meta["author"] = _fch_extract_author_info(soup)

    # Extract social media links
    links = []
    seen_links = set()
    social_selectors = [
        ".td-post-content a[href*='twitter.com']",
        ".td-post-content a[href*='facebook.com']",
        ".td-post-content a[href*='instagram.com']",
        ".td-post-content a[href*='whatsapp.com']",
        ".td-post-content a[href*='telegram.me']",
        ".td-post-content a[href*='t.me']",
        ".entry-content a[href*='twitter.com']",
        ".entry-content a[href*='facebook.com']"
    ]

    for selector in social_selectors:
        for link in soup.select(selector):
            href = link.get("href")
            # The selectors overlap, so keep the first occurrence of each link.
            if href and href not in seen_links and is_social(href):
                seen_links.add(href)
                links.append(href)

    # Build and return the record
    record = build_record(
        "FactCheckHub",
        url,
        claim,
        verdict,
        links,
        meta,
        body_text,
        domain,
        claim_language=lang_code
    )

    return [record]


# --- ALTERNATIVE SPECIALIZED FUNCTIONS ---

def iter_fch_by_author(author_slug: str, max_pages: int, cache: dict):
    """Yield article URLs from one author's listing pages on FactCheckHub.

    The next page to fetch is stored in ``cache`` under
    ``"fch_author_<slug>"`` and saved after every successful fetch. Iteration
    ends on the first fetch error, on the first page without article links,
    or after ``max_pages`` pages.
    """
    root = "https://factcheckhub.com"
    resume_key = f"fch_author_{author_slug}"
    page = cache.get(resume_key, 1)
    final_page = page + max_pages - 1

    while page <= final_page:
        author_url = f"{root}/author/{author_slug}/page/{page}/"
        print(f"[DEBUG] Fetching author page: {author_url}")

        try:
            soup = get_soup(author_url)
            cache[resume_key] = page + 1
            save_cache(cache)
        except Exception as e:
            print(f"[ERROR] Could not fetch author page {page}: {e}")
            return

        yielded_any = False
        for anchor in soup.select("h3.entry-title a, .td-module-title a"):
            target = anchor.get("href")
            if not target:
                continue
            yielded_any = True
            yield urljoin(root, target)

        # A page with no article links marks the end of the listing.
        if not yielded_any:
            return

        page += 1


def get_fch_categories():
    """Return the category slugs linked from the FactCheckHub home page.

    Slugs are listed once each, in order of first appearance. If the page
    cannot be fetched or parsed, a fixed default list is returned instead.
    """
    root = "https://factcheckhub.com"
    try:
        anchors = get_soup(root).select("a[href*='/category/']")

        # dict keys preserve insertion order, giving an ordered de-duplication
        slugs = {}
        for anchor in anchors:
            href = anchor.get("href", "")
            if "/category/" not in href:
                continue
            # Text after the last "/category/", up to the next slash
            slug = href.rpartition("/category/")[2].partition("/")[0]
            if slug:
                slugs.setdefault(slug, None)

        return list(slugs)
    except Exception as e:
        print(f"[ERROR] Could not fetch categories: {e}")
        return ["general", "news", "politics", "elections", "economy", "security", "health"]

##### Low-Resource Languages collector.

*This collector helps ensure the datasets include at least 30 rows of claims made in low-resource languages.*

In [None]:
# Listing pages for low-resource-language fact-checks, keyed by platform name
# and then by language code. A platform mapped to {} has no listing configured.
LRL_URL_PATTERNS = {
    "Dubawa": {
        code: f"https://dubawa.org/category/african-languages/{slug}/"
        for code, slug in (("ha", "hausa"), ("yo", "yoruba"), ("ig", "igbo"))
    },
    "Africa Check-NG": {},
    # FactCheckHub language listings are currently disabled:
    #     "ha": "https://factcheckhub.com/hausa-posts/",
    #     "yo": "https://factcheckhub.com/yoruba-posts/",
    #     "ig": "https://factcheckhub.com/igbo-posts/",
    "FactCheckHub": {},
}

def collect_lrl_articles(platform: str, max_pages: int = 20, cache: dict = None):
    """Yield parsed records from a platform's low-resource-language listing pages.

    For every language configured for ``platform`` in ``LRL_URL_PATTERNS`` the
    listing pages are requested one by one, starting after the page number
    stored under ``cache['lrl_iter_state']``. Each article link found on a
    page is handed to the platform's parser from ``PLATFORM_PARSERS`` and the
    resulting records are yielded individually.

    Args:
        platform: Platform name; looked up in ``LRL_URL_PATTERNS`` and
            ``PLATFORM_PARSERS``.
        max_pages: Maximum number of listing pages requested per language in
            this call.
        cache: Crawl-state dict that is read and updated in place. Only when
            it is ``None`` is a cache obtained from ``load_cache()``; an empty
            dict supplied by the caller is used as-is.

    Yields:
        dict: One parsed record at a time. A record without ``language_info``
        receives a default entry built from the listing's language code.

    Notes:
        * A language is abandoned after two consecutive pages that are missing
          (404) or contain no article links.
        * The page number is checkpointed *before* the page's articles are
          parsed, so a run interrupted mid-page resumes at the following page.
    """
    patterns = LRL_URL_PATTERNS.get(platform, {})
    # `is not None` rather than truthiness: an empty dict passed by the caller
    # must remain the object that receives the iterator state.
    CACHE = cache if cache is not None else load_cache()
    lrl_state = CACHE.setdefault('lrl_iter_state', {})
    print(f"[INFO] Loaded LRL iterator state: {lrl_state}")

    if not patterns:
        print(f"[INFO] No LRL patterns configured for {platform}")
        return

    max_empty_pages = 2  # Stop after 2 consecutive empty pages

    for lang_code, base in patterns.items():
        print(f"[LRL] Starting collection for {platform} - {lang_code}")

        # Test base URL first
        try:
            test_soup = get_soup(base)
            if not test_soup or len(test_soup.get_text(strip=True)) < 100:
                print(f"[SKIP] Base URL appears empty or invalid: {base}")
                continue
        except Exception as e:
            print(f"[SKIP] Base URL failed: {base} - {e}")
            continue

        consecutive_empty_pages = 0
        state_key = f"{platform.lower()}_{lang_code}"

        # Resume after the last checkpointed page; a missing key means page 1.
        start_page = lrl_state.get(state_key, 0) + 1

        # scheme://host of the listing, used to absolutise relative hrefs
        site_root = '/'.join(base.split('/')[:3])

        for p in range(start_page, start_page + max_pages):
            if platform == "FactCheckHub":
                url = f"{base}?page={p}"
            else:
                url = f"{base}page/{p}/"

            print(f"[LRL] Fetching {platform} {lang_code} page {p}: {url}")

            soup = None  # stays None when the page turns out to be missing
            try:
                soup = get_soup(url)
            except requests.exceptions.HTTPError as e:
                # e.response can be None, so read the status defensively.
                status = getattr(e.response, "status_code", None)
                if status != 404:
                    print(f"[ERROR] HTTP error for {url}: {e}")
                    continue
                print(f"[INFO] Page {p} not found (404) - trying alternative pagination")
                if platform == "FactCheckHub":
                    alt_url = f"{base}page/{p}/"
                    try:
                        soup = get_soup(alt_url)
                        url = alt_url
                    except Exception:
                        print(f"[INFO] Alternative pagination also failed for {lang_code} page {p}")
            except Exception as e:
                print(f"[ERROR] Failed to fetch {url}: {e}")
                continue

            if soup is None:
                # A missing page counts towards the empty-page limit.
                consecutive_empty_pages += 1
                if consecutive_empty_pages >= max_empty_pages:
                    break
                continue

            raw_links = []
            if platform == "Dubawa":
                raw_links.extend(a["href"] for a in soup.select("h2.post-title a[href]"))
            elif platform == "FactCheckHub":
                raw_links.extend(
                    a["href"] for a in soup.select("h2 a[href], h3 a[href], .post-title a[href]")
                )
                for article in soup.select("article"):
                    first_link = article.find("a", href=True)
                    if first_link:
                        raw_links.append(first_link["href"])

            # Absolutise, then de-duplicate while keeping page order so that
            # the crawl order is the same on every run.
            absolute_links = list(dict.fromkeys(
                link if link.startswith('http') else urljoin(site_root, link)
                for link in raw_links
            ))

            print(f"[LRL] Found {len(absolute_links)} article links on page {p}")

            if not absolute_links:
                consecutive_empty_pages += 1
                if consecutive_empty_pages >= max_empty_pages:
                    print(f"[INFO] {consecutive_empty_pages} consecutive empty pages, stopping {lang_code}")
                    break
                continue
            consecutive_empty_pages = 0

            # Save the current page number before processing articles
            lrl_state[state_key] = p
            save_cache(CACHE)

            for link in absolute_links:
                try:
                    if not link or link == base:
                        continue
                    parser = PLATFORM_PARSERS[platform]
                    records = parser(link, lang_code=lang_code)
                    for record in records:
                        if "language_info" not in record:
                            record["language_info"] = {
                                "primary_language": lang_code,
                                "detected_lrls": [lang_code],
                                "is_code_switched": False
                            }
                        yield record  # Yield each record individually

                except Exception as exc:
                    # One bad article must not end the whole listing.
                    print(f"[WARN] LRL parse failed for {link}: {exc}")
                    continue

        print(f"[LRL] Finished collection for {platform} - {lang_code}")

### Pipeline

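Phase 1: LRL-first crawl of each platform's dedicated low-resource-language (Hausa, Yoruba, Igbo) sections.

Phase 2: Main-content crawl of every platform to top up the dataset (any of EN, YO, HA, IG). Note that `target_lrl_share` is accepted by `crawl` but is not currently used to decide whether either phase runs.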

In [None]:
import signal, sys, traceback
# Registry of supported platforms. For each platform, "iter" is the callable
# that produces article URLs and "parser" is the callable that turns one
# article URL into records.
PLATFORM_CONFIGS = {
    "FactCheckHub": dict(iter=iter_fch_article_urls, parser=parse_fch),
    "Dubawa": dict(iter=iter_dubawa_article_urls, parser=parse_dubawa),
    "Africa Check-NG": dict(iter=iter_africacheck_article_urls, parser=parse_africacheck),
}

# Platform name -> parser callable, derived from the registry above.
PLATFORM_PARSERS = {
    platform_name: platform_cfg["parser"]
    for platform_name, platform_cfg in PLATFORM_CONFIGS.items()
}

def enhanced_language_detection(text:str):
    """Detect the primary language of ``text`` and flag Hausa/Yoruba/Igbo marker words.

    The primary language comes from ``safe_detect``. Independently, the
    lower-cased text is searched for a small set of whole-word markers per
    language; every language with at least one hit is reported. The markers
    are short function words, so this is a coarse heuristic, and some markers
    (e.g. "na") are shared between languages.

    Args:
        text: Claim text to analyse. ``None`` or an empty string is treated as
            empty text instead of raising.

    Returns:
        dict with keys:
            ``primary_language``: value returned by ``safe_detect(text, "")``.
            ``detected_lrls``: language codes ("ha", "yo", "ig") whose marker
                pattern matched, in that order.
            ``is_code_switched``: True when the primary language is "en" and
                at least one marker pattern matched.
    """
    patterns = {
        "ha": r"\b(ba|da|ya|ta|na|za|mu|ku|su|wannan|wancan)\b",
        "yo": r"\b(ni|ti|si|bi|wa|fi|kí|dí|ré|lè)\b",
        "ig": r"\b(na|ga|ka|ma|nke|ndi|nwa|otu|aha|eze)\b",
    }
    # Records may carry a missing/None claim; normalise so .lower() cannot fail.
    text = text or ""
    primary = safe_detect(text, "")
    lowered = text.lower()  # computed once instead of once per pattern
    detected = [lg for lg, pat in patterns.items() if re.search(pat, lowered)]
    return {
        "primary_language": primary,
        "detected_lrls": detected,
        "is_code_switched": bool(detected and primary == "en")
    }

def crawl(max_pages_per_platform:int=30, target_lrl_share:float=50.0):
    """Run the two-phase crawl and append every accepted record to the outputs.

    Phase 1 streams records from ``collect_lrl_articles`` for each platform in
    ``PLATFORM_CONFIGS``. Phase 2 walks each platform's main article iterator
    and parses every URL not yet listed in the cache. A record is kept when
    ``ContentFilter.should_include`` accepts it; kept records are tagged with
    ``enhanced_language_detection`` and written through ``write_outputs``.
    Progress lives in the dict returned by ``load_cache`` (keys ``crawled``,
    ``failed``, ``crawled_records``) and is persisted with ``save_cache``.

    While the crawl runs, SIGINT/SIGTERM set a stop flag and save the cache so
    the loops can end cleanly. The handlers that were installed before the
    call are put back when the function exits.

    Args:
        max_pages_per_platform: Page budget passed to each LRL collector and
            to each platform's URL iterator.
        target_lrl_share: Currently unused by the crawl logic; kept so that
            existing callers keep working.
    """
    cache = load_cache()
    stop_flag = False

    def _graceful_exit(signum, frame):
        nonlocal stop_flag
        stop_flag = True
        print("[INFO] Interrupt received, saving state and exiting...")
        save_cache(cache)

    # Remember the previous handlers so they can be restored afterwards;
    # otherwise later interrupts would keep being swallowed by _graceful_exit.
    previous_handlers = {}
    for sig in (signal.SIGINT, signal.SIGTERM):
        try:
            previous_handlers[sig] = signal.signal(sig, _graceful_exit)
        except ValueError:
            # signal.signal() may only be called from the main thread.
            print(f"[INFO] Could not install handler for signal {sig}; continuing without it")

    try:
        cf, metrics = ContentFilter(), LRLMetrics()

        # --- Phase 1: LRL-First Crawl ---
        # This phase prioritizes collecting articles from dedicated LRL sections.
        print("\n--- Phase 1: Starting LRL-First Crawl ---")
        for name in PLATFORM_CONFIGS:
            if stop_flag: break
            print(f"\n=== Collecting LRL articles for {name} ===")
            try:
                lrl_records_generator = collect_lrl_articles(name, max_pages=max_pages_per_platform, cache=cache)

                # Records of one article arrive back to back. Decide once per
                # article whether it is new, so that marking the URL as crawled
                # after its first record does not drop the remaining records.
                current_url = None
                current_url_is_new = False

                for rec in lrl_records_generator:
                    if stop_flag: break
                    url = rec.get("source_url")  # Use .get() for safer access
                    if not url:
                        continue
                    if url != current_url:
                        current_url = url
                        current_url_is_new = (
                            url not in cache["crawled"] and url not in cache["failed"]
                        )
                    if not current_url_is_new:
                        continue

                    # Check if claim_text exists before passing it
                    claim_text = rec.get("extracted_claim_text", "")
                    if cf.should_include(claim_text, rec.get("article_body", ""), url):
                        rec["language_info"] = enhanced_language_detection(claim_text)
                        metrics.update(rec["language_info"])
                        write_outputs([rec], append_mode=True)
                        if url not in cache["crawled"]:
                            cache["crawled"].append(url)
                        cache.setdefault("crawled_records", []).append(rec['id'])
                        save_cache(cache)  # Save state after a successful write
            except Exception as exc:
                print(f"[ERROR] LRL collection failed for {name}: {exc}")
                traceback.print_exc()  # Print traceback for debugging

        # --- Phase 2: Other Content Crawl (Topping up with any of EN, YO, HA, IG) ---
        if not stop_flag:
            print("\n--- Phase 2: Starting Eng Content Crawl for top-up ---")

        for name, cfg in PLATFORM_CONFIGS.items():
            if stop_flag: break

            pages_to_crawl = max_pages_per_platform
            if name == "Africa Check-NG":
                # The multiplier is currently 1, so the page budget is unchanged.
                pages_to_crawl *= 1
                print(f"[INFO] Multiplying pages for Africa Check-NG. Crawling up to {pages_to_crawl} pages.")

            print(f"\n=== Crawling main content for {name}: up to {pages_to_crawl} pages ===")

            for url in cfg["iter"](name, pages_to_crawl, cache):
                if stop_flag: break
                if url in cache["crawled"] or url in cache["failed"]: continue

                try:
                    records = cfg["parser"](url)
                    if not records:
                        cache["failed"][url] = "NoRecords"
                        continue

                    for record in records:
                        # Check if claim_text exists before passing it
                        claim_text = record.get("extracted_claim_text", "")
                        if cf.should_include(claim_text, record.get("article_body", ""), url):
                            record["language_info"] = enhanced_language_detection(claim_text)
                            metrics.update(record["language_info"])
                            write_outputs([record], append_mode=True)
                            cache.setdefault("crawled_records", []).append(record['id'])
                        else:
                            print(f"[SKIP] Filtered out record from {url}")

                    cache["crawled"].append(url)  # Mark URL as fully processed
                except Exception as exc:
                    cache["failed"][url] = f"{exc.__class__.__name__}: {exc}"
                    print(f"[ERROR] Processing failed for {url}: {exc}")
                    traceback.print_exc()  # Print traceback for debugging
                finally:
                    save_cache(cache)  # Save state after each URL is processed

            print(f"--- Finished {name} ---")

        total_records_saved = len(cache.get("crawled_records", []))
        print(f"\n✓ Finished: {total_records_saved} total fact-checks saved – Final LRL share {metrics.share():.1f}%")
    finally:
        for sig, handler in previous_handlers.items():
            # A handler that was not installed from Python is reported as None
            # and cannot be passed back to signal.signal().
            if handler is not None:
                signal.signal(sig, handler)

def write_outputs(items, append_mode=True):
    """Write records to ``factchecks.jsonl`` and ``factchecks.csv`` under ``PERSISTENT_PATH``.

    Args:
        items: Iterable of record dicts. Nothing happens when it is empty.
        append_mode: When True, both files are appended to and any record
            whose ``id`` already appears in the JSONL file is skipped. When
            False, both files are rewritten from scratch.

    Behaviour:
        * Records repeating an ``id`` earlier in the same batch are written
          only once.
        * The JSONL file receives the full record; the CSV receives only the
          columns listed in ``fields``, with dict/list values JSON-encoded.
        * Non-ASCII text is written unescaped in both files.
    """
    if not items:
        return

    print(f"[WRITE] Writing {len(items)} records...")
    jsonl_path = PERSISTENT_PATH / "factchecks.jsonl"
    csv_path = PERSISTENT_PATH / "factchecks.csv"
    # One mode for both files so the JSONL and the CSV never get out of step.
    mode = "a" if append_mode else "w"

    seen_ids = set()
    if append_mode and jsonl_path.exists():
        with open(jsonl_path, "r", encoding="utf-8") as jfh:
            for line in jfh:
                try:
                    seen_ids.add(json.loads(line.strip()).get("id"))
                except (json.JSONDecodeError, AttributeError):
                    # Blank, corrupt or non-object lines carry no usable id.
                    continue

    # Decide once which records are new, so both files receive exactly the
    # same rows and duplicates inside this batch are dropped as well.
    new_items = []
    for item in items:
        item_id = item.get("id")
        if item_id in seen_ids:
            continue
        if item_id is not None:
            seen_ids.add(item_id)
        new_items.append(item)

    with open(jsonl_path, mode, encoding="utf-8") as jfh:
        for item in new_items:
            jfh.write(json.dumps(item, ensure_ascii=False) + "\n")

    # --- Fields in crawled CSV data ---
    fields = ["id", "extracted_claim_text", "verdict", "domain", "claim_language",
              "claim_links", "claim_platforms",
              "source_url", "platform", "article_body", "metadata"]

    file_exists = csv_path.exists()
    with open(csv_path, mode, newline="", encoding="utf-8") as cfh:
        writer = csv.DictWriter(cfh, fieldnames=fields)
        # An append-mode handle starts at end of file, so 0 means "empty".
        if not file_exists or cfh.tell() == 0:
            writer.writeheader()
        for item in new_items:
            row = {
                key: (json.dumps(value, ensure_ascii=False)
                      if isinstance(value, (dict, list)) else value)
                for key, value in item.items() if key in fields
            }
            writer.writerow(row)

### Execute

In [None]:
if __name__ == "__main__":
    # Entry point: run the full crawl with a budget of 80 pages per platform
    # and an LRL share argument of 80.0.
    print("Starting crawl...")
    crawl(max_pages_per_platform=80, target_lrl_share=80.0)
    print("Crawl finished.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[DELAY] Original delay is 10.0s. Waiting for full duration.
Attempting to fetch: https://africacheck.org/fact-checks/meta-programme-fact-checks/no-video-does-not-show-protesters-burning-down-nyanya-police
Successfully fetched: https://africacheck.org/fact-checks/meta-programme-fact-checks/no-video-does-not-show-protesters-burning-down-nyanya-police
Response length: 107878
[DEBUG] Extracted metadata with 13 fields
[DEBUG] Extracted body text: 2949 characters
[DEBUG] Claim found in meta tag
[DEBUG] Domain classified as: Security
[DEBUG] Extracted Claim: Several social media posts from early August 2024 claim that protesters have burned down the Nyanya ...
[DEBUG] Extracted Verdict: Unrated
[DEBUG] Language: en, Domain: Security
[DEBUG] Links found: 7
Passed claim language: en
[SKIP] Filtered out record from https://africacheck.org/fact-checks/meta-programme-fact-checks/no-video-does-not-show-protesters-burning-down-nyanya-p