# SETUP

## Imports

In [1]:
import base64
import concurrent
import io
import json
import os
import re
import time
from typing import Optional

import ipynbname
import nltk
import numpy as np
import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from googleapiclient.discovery import build
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from openpyxl import load_workbook
from openpyxl.drawing.spreadsheet_drawing import OneCellAnchor, TwoCellAnchor
from PIL import Image, ImageOps

In [2]:
# Load variables from a local .env file into the process environment, then read the
# two credentials passed to the Google Custom Search client in `search`.
# os.getenv returns None when a variable is not set.
load_dotenv()
API_KEY = os.getenv("API_KEY")  # passed as developerKey when building the search client
CSE_ID = os.getenv("CSE_ID")    # passed as cx (search engine id) on each search request

## Pipeline Configuration

In [3]:
# Tunable pipeline settings, read once at the top of `pipeline`.
CONFIG = dict(
    OLLAMA_EMBED_MODEL="qwen3-embedding:0.6b",  # embedding model used for reranking
    OLLAMA_LLM_MODEL="qwen3:1.7b",              # text model used for premise generation
    OLLAMA_VLM_MODEL="qwen3-vl:2b",             # vision-language model used for image claims
    SEARCH_NUM_RESULTS=5,                       # URLs requested per search
    RERANK_TOP_K=2,                             # documents kept after reranking
    IMAGE_SIZE=(672, 672),                      # target size for image preprocessing
)

In [4]:
# Cache locations are resolved against the kernel's current working directory.
CACHE_DIR = os.path.join(os.getcwd(), "cache")
# JSON file read by load_web_cache and written by save_web_cache.
WEB_CACHE_FILE = os.path.join(CACHE_DIR, "web_content_cache.json")

## Ollama

In [5]:
# Wait for a local Ollama server: probe the /api/tags endpoint up to 30 times,
# pausing one second after every unsuccessful probe.
for i in range(30):
    try:
        r = requests.get("http://localhost:11434/api/tags", timeout=1)
        if r.status_code == 200:
            print("Ollama served at http://localhost:11434/")
            break
    except requests.RequestException:
        # connection refused / timed out: the server is not reachable yet
        pass
    # Back off after *any* failed probe. Previously a non-200 reply skipped the
    # sleep, so all 30 attempts could be spent within a fraction of a second.
    time.sleep(1)
else:
    # for/else: only reached when no probe hit `break`
    raise RuntimeError("Ollama failed to start.")

Ollama served at http://localhost:11434/


In [6]:
# pull the models named in CONFIG from https://ollama.com/library
# ! ollama pull qwen3-embedding:0.6b
# ! ollama pull qwen3:1.7b
# ! ollama pull qwen3-vl:2b

# CLAIM PROCESSING


In [7]:
# Download the NLTK resources used by preprocess_text (tokenizer data, stop-word list,
# WordNet) into a project-local folder and register that folder on NLTK's search path.
NLTK_DATA_DIR = os.path.join(os.getcwd(), "cache/nltk_data")
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
nltk.data.path.append(NLTK_DATA_DIR)
nltk.download("punkt", quiet=True, download_dir=NLTK_DATA_DIR)
nltk.download("punkt_tab", quiet=True, download_dir=NLTK_DATA_DIR)
nltk.download("stopwords", quiet=True, download_dir=NLTK_DATA_DIR)
# last expression of the cell: its return value is what the notebook displays
nltk.download("wordnet", quiet=True, download_dir=NLTK_DATA_DIR)

True

In [8]:
def query_ollama(payload, url="http://localhost:11434/api/generate", retries=3):
    """
    Get response from the Ollama server for LLM/VLM inference.

    Args:
        payload: JSON-serializable request body posted to `url`.
        url: Ollama endpoint to call.
        retries: Maximum number of attempts; a 2-second pause separates attempts.

    Returns:
        The decoded JSON response as a dict, or an empty dict when every attempt
        failed (or when `retries` is not positive). Never returns None, so callers
        can safely call `.get(...)` on the result.
    """
    for attempt in range(retries):
        try:
            response = requests.post(url, json=payload, timeout=60)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            # best-effort by design: report the final failure instead of raising
            if attempt == retries - 1:
                print(f"Ollama failed after {retries} attempts: {e}")
                return {}
            time.sleep(2)

    # Reached only when the loop body never ran (retries <= 0); the original fell
    # through here and implicitly returned None.
    return {}

## Text Processing

In [9]:
def preprocess_text(claim: str) -> str:
    """
    Apply basic preprocessing to convert a claim into a keyword-based search query.

    Steps: lowercase, replace punctuation with spaces, tokenize, drop English stop
    words and tokens of two characters or fewer, lemmatize, and de-duplicate while
    keeping first-seen order.

    Args:
        claim: Raw claim text.

    Returns:
        Space-separated keywords; may be an empty string if nothing survives filtering.
    """
    print(f"Generating query from text: '{claim}'")

    # normalization
    text = claim.lower()
    # Unicode-aware: keep letters/digits of any script and only blank out punctuation.
    # The previous ASCII-only class [^a-z0-9\s] also erased letters such as "ñ",
    # splitting non-English (e.g. Filipino) words into fragments.
    text = re.sub(r"[^\w\s]|_", " ", text)

    # tokenization
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(t) for t in tokens]
    # dict.fromkeys removes duplicates while preserving order
    lemmas = list(dict.fromkeys(lemmas))

    return " ".join(lemmas)

## Image Processing

In [10]:
def preprocess_image(image_path: str, target_size: tuple = (672, 672)) -> str:
    """
    Preprocesses an image for VLM analysis with resizing and letterboxing.

    The image is converted to RGB, padded with black to `target_size` (centered),
    and saved next to the original as "<name>_processed<ext>".

    Args:
        image_path: Path of the image to process.
        target_size: (width, height) of the padded output.

    Returns:
        Path of the processed image, or the unchanged `image_path` if anything fails.
    """
    try:
        # The context manager releases the underlying file handle deterministically;
        # convert() returns an independent in-memory copy that outlives the block.
        with Image.open(image_path) as source_img:
            img = source_img.convert('RGB')

        # image resampling
        processed_img = ImageOps.pad(img, target_size, method=Image.Resampling.LANCZOS, color=(0, 0, 0), centering=(0.5, 0.5))

        base, ext = os.path.splitext(image_path)
        output_path = f"{base}_processed{ext}"
        processed_img.save(output_path, quality=95)
        return output_path

    except Exception as e:
        # best-effort: fall back to the untouched original image
        print(f"Error preprocessing image '{image_path}': {e}")
        return image_path

In [11]:
def caption(image_path: str, text_claim: Optional[str], model: str) -> str:
    """
    Generate descriptive text claims from an image claim. If text claim already exists, add more context from image.

    Args:
        image_path: Path of the image sent (base64-encoded) to the model.
        text_claim: Optional accompanying text claim; None is treated as empty.
        model: Name of the Ollama vision-language model.

    Returns:
        The model's response text, stripped; empty string if the request failed.
    """
    print(f"Generating query from image: '{image_path}'")

    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    # A missing claim must not be rendered as the literal word "None" in the prompt.
    claim_text = text_claim or ""

    prompt = f"""
Transform this image/text claim into a concise fact-checking search query:

STEPS:
- Identify main entities (people, organizations, events)
- Remove question words (has, did, is, etc.)
- Remove opinions and emotional language
- Keep only factual core elements
- Join with spaces as a search phrase
- Only respond with the plain search query

TEXT CLAIM: "{claim_text}"

SEARCH QUERY:
"""

    response_json = query_ollama({"model": model, "prompt": prompt, "images": [image_b64], "stream": False})
    return response_json.get("response", "").strip()

# RETRIEVAL

In [12]:
def load_web_cache() -> dict[str, str]:
    """
    Read the on-disk web cache, creating the cache directory if needed.

    Returns the parsed JSON content of WEB_CACHE_FILE, or an empty dict when the
    file does not exist or cannot be decoded as JSON.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)

    # nothing cached yet
    if not os.path.exists(WEB_CACHE_FILE):
        return {}

    with open(WEB_CACHE_FILE, 'r', encoding='utf-8') as cache_file:
        try:
            cached = json.load(cache_file)
        except json.JSONDecodeError:
            print("Warning: Web cache file is corrupted. Starting new cache.")
            cached = {}
    return cached

def save_web_cache(cache: dict[str, str]):
    """
    Save cached articles to directory.

    The cache is written to a temporary sibling file first and then moved over
    WEB_CACHE_FILE, so an interrupted write cannot leave a truncated (corrupted)
    cache behind. Failures are reported and swallowed: caching is best-effort.

    Args:
        cache: Mapping to serialize as JSON.
    """
    tmp_path = f"{WEB_CACHE_FILE}.tmp"
    try:
        # do not rely on load_web_cache having created the directory earlier
        os.makedirs(os.path.dirname(WEB_CACHE_FILE) or ".", exist_ok=True)
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(cache, f, indent=4)
        # os.replace overwrites the destination in a single step
        os.replace(tmp_path, WEB_CACHE_FILE)
    except Exception as e:
        print(f"Error saving web cache: {e}")
        # do not leave a half-written temp file around
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass

In [13]:
def search(query: str, num_results: int) -> list[str]:
    """
    Look up a query with the Google Custom Search API.

    Sends `query` to the search engine identified by CSE_ID (authenticated with
    API_KEY), asking for `num_results` results, and returns the "link" field of
    every returned item as a list of URL strings.
    """
    print(f"Searching with query: '{query}'")

    cse_client = build("customsearch", "v1", developerKey=API_KEY)
    request = cse_client.cse().list(q=query, cx=CSE_ID, num=num_results)
    response = request.execute()

    # a response without "items" yields an empty list
    urls = [hit["link"] for hit in response.get("items", [])]

    print(f"Found {len(urls)} URLs")
    return urls

In [14]:
def fetch_text(url: str, cache: dict[str, str], cache_update_flag: list[bool]) -> str:
    """
    Fetch article text from a given URL.
    Return the string body text from the HTML content.

    Args:
        url: Page to fetch.
        cache: URL -> text mapping; consulted first and updated on a successful scrape.
        cache_update_flag: One-element list; element 0 is set to True when `cache`
            was modified, so the caller knows it must be persisted.

    Returns:
        The extracted text with whitespace collapsed, or an empty string on a
        non-200 response or any error.
    """
    if url in cache:
        print(f"Fetching article with URL: {url} (CACHED)")
        return cache[url]

    print(f"Fetching article with URL: {url} (SCRAPING)")
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        r = requests.get(url, headers=headers, timeout=5)
        if r.status_code != 200:
            return ""

        soup = BeautifulSoup(r.text, "html.parser")
        # drop non-content elements before extracting text
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()

        # prefer the most specific content container available, else the whole page
        content = soup.find('article') or soup.find('div', class_='entry-content') or soup.find('main') or soup
        text = content.get_text(separator=' ', strip=True)
        # Collapse runs of whitespace. The previous pattern r'\\s+' matched a literal
        # backslash followed by "s", so whitespace was never normalized.
        text = re.sub(r'\s+', ' ', text)

        if text:
            cache[url] = text
            cache_update_flag[0] = True

        return text

    except Exception as e:
        print(f"    Error while fetching: {e}")
        return ""

In [15]:
def retrieve(urls: list[str]) -> list[tuple[str, str]]:
    """
    Fetch the article text for each URL concurrently, using the on-disk web cache.

    Args:
        urls: URLs to fetch.

    Returns:
        A list of (url, text) tuples for every URL that yielded non-empty text,
        in completion order (not necessarily the order of `urls`).
    """
    # `import concurrent` alone does not load the `futures` submodule; import it
    # explicitly instead of relying on another library having done so.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    cache = load_web_cache()
    # one-element list so worker calls can signal "cache changed" back to this scope
    cache_update_flag = [False]

    documents = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {
            executor.submit(fetch_text, url, cache, cache_update_flag): url
            for url in urls
        }

        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                text = future.result()
                if text:
                    documents.append((url, text))
            except Exception as e:
                print(f"    Error processing {url}: {e}")

    # persist only when at least one new page was scraped
    if cache_update_flag[0]:
        print("Saving updated web content cache...")
        save_web_cache(cache)

    return documents

# EMBEDDING AND RERANKING



## Embedding

In [16]:
def embed(
    query: str, documents: list[tuple[str, str]], model_name: str
) -> tuple[torch.Tensor, torch.Tensor, list[str], list[str]]:
    """
    Generate embeddings for a query and a list of documents using Ollama.
    Return query_embeddings, document_embeddings, urls, document_texts

    Args:
        query: Text embedded as the query vector.
        documents: (url, text) tuples; each text is embedded separately.
        model_name: Name of the Ollama embedding model.

    Returns:
        query_embeddings: float32 tensor with one row.
        document_embeddings: float32 tensor with one row per successfully embedded document.
        urls, document_texts: URLs and texts of those documents, aligned with the rows.

    Raises:
        RuntimeError: if `documents` is empty, the query cannot be embedded, or no
            document yields a usable embedding.
    """
    if not documents:
        raise RuntimeError("No documents provided for embedding.")

    print(f"Embedding {len(documents)} documents...")

    def ollama_embed(text: str):
        """Return the embedding list for `text`, or [] on any failure."""
        try:
            res = requests.post(
                "http://localhost:11434/api/embeddings",
                json={"model": model_name, "prompt": text},
                # without a timeout a stalled server would block the pipeline forever;
                # generous because long article bodies can be slow to embed
                timeout=120,
            )
            # surface HTTP errors in the log instead of silently yielding []
            res.raise_for_status()
            data = res.json()
            return data.get("embedding", [])
        except Exception as e:
            print(f"    Error getting embedding: {e}")
            return []

    # query embedding
    print("Generating query embedding...")
    query_vec = ollama_embed(query)
    if not query_vec:
        raise RuntimeError("Query embedding failed.")

    expected_dim = len(query_vec)
    print(f"Query embedding dimension: {expected_dim}")

    # document embeddings
    doc_vecs = []
    valid_urls = []
    valid_texts = []

    for url, text in documents:
        vec = ollama_embed(text)
        if not vec:
            print(f"    WARNING: Empty embedding for document. Skipping")
            continue
        # every row must match the query dimension to be stackable and comparable
        if len(vec) != expected_dim:
            print(f"    WARNING: Embedding dimension mismatch ({len(vec)} vs {expected_dim}). Skipping.")
            continue
        doc_vecs.append(vec)
        valid_urls.append(url)
        valid_texts.append(text)

    if not doc_vecs:
        raise RuntimeError("No valid document embeddings generated.")
    print(f"Successfully embedded {len(doc_vecs)} documents.\n")

    query_embeddings = torch.tensor(np.array([query_vec]), dtype=torch.float32)
    document_embeddings = torch.tensor(np.array(doc_vecs), dtype=torch.float32)

    return query_embeddings, document_embeddings, valid_urls, valid_texts

## Reranking

In [17]:
def rerank(
    query_embeddings: Optional[torch.Tensor],
    document_embeddings: Optional[torch.Tensor],
    urls: list[str],
    document_texts: list[str],
    top_k: int = 3,
) -> list[tuple[str, str, float]]:
    """
    Rerank precomputed embeddings using cosine similarity.
    Return a list of (url, text, score) tuples sorted by relevance score.

    Args:
        query_embeddings: 2-D tensor whose first row is the query vector.
        document_embeddings: 2-D tensor with one row per document.
        urls: Document URLs, aligned with the rows of `document_embeddings`.
        document_texts: Document texts, aligned the same way.
        top_k: Maximum number of results; values below 1 return an empty list.

    Returns:
        Up to `top_k` (url, text, score) tuples, highest score first. Scores are
        plain Python floats.

    Raises:
        RuntimeError: if either embedding tensor is None.
    """
    if query_embeddings is None or document_embeddings is None:
        raise RuntimeError("Query or document embeddings not found.")
    print(f"Reranking {len(document_texts)} documents...")

    # Clamp the norms so an all-zero vector scores 0 instead of producing NaN
    # (0/0), which would make the sort order below undefined.
    eps = 1e-12
    query_norm = query_embeddings / query_embeddings.norm(dim=1, keepdim=True).clamp_min(eps)
    doc_norms = document_embeddings / document_embeddings.norm(dim=1, keepdim=True).clamp_min(eps)

    # dot product of unit vectors == cosine similarity; row 0 is the (single) query
    scores = torch.mm(query_norm, doc_norms.T)[0].cpu().tolist()

    ranked = list(zip(urls, document_texts, scores))
    ranked.sort(key=lambda x: x[2], reverse=True)

    # guard against a negative top_k silently dropping the last item via slicing
    return ranked[:max(top_k, 0)]

# PREMISE GENERATION

In [18]:
def generate_premise(claim: str, documents: list[str], model: str):
    """
    Ask the LLM to condense the retrieved evidence for a claim into one short premise.

    Each entry of `documents` becomes a "- " bullet in the prompt. Returns the
    model's response text with surrounding whitespace removed (empty string when
    the response carries no "response" field).
    """
    bullet_lines = []
    for document in documents:
        bullet_lines.append(f"- {document}")
    joined_documents = "\n".join(bullet_lines)

    prompt = f"""
You are a factual summarization assistant. Your task is to extract and summarize ONLY the factual content from the provided documents to create a premise for fact-checking.

STEPS:
- Read the claim and the supporting documents carefully
- Extract ONLY factual information from the documents that are relevant to verifying the claim
- Summarize these facts concisely into a single premise
- DO NOT include any analysis, conclusions, or opinions
- DO NOT reference the documents themselves or use phrases like "according to the articles"
- Present only the factual premise

CLAIM: "{claim}"

DOCUMENTS:
{joined_documents}

OUTPUT FORMAT:
Provide only the factual premise without any introductory text, bullet points, or numbering.
"""

    # non-streaming request: the whole answer arrives in a single JSON object
    payload = {"model": model, "prompt": prompt, "stream": False}
    reply = query_ollama(payload)
    premise = reply.get("response", "")
    return premise.strip()

# PIPELINE

In [19]:
def pipeline(text_claim: str, image_path: Optional[str]) -> tuple[str, list[str]]:
    """
    Complete IR pipeline: build a search query, retrieve, rerank, and summarize.

    Args:
        text_claim: The textual claim; may be empty (or None) for image-only claims.
        image_path: Optional path to an image accompanying the claim.

    Returns:
        (premise, urls): the generated premise and the URLs of the top_k reranked
        documents. When nothing can be searched or retrieved, the premise is a
        fixed "no documents" message and the URL list is empty.
    """
    no_documents_premise = "No documents related to the claim were found."

    # parameters
    ollama_emb_name = CONFIG["OLLAMA_EMBED_MODEL"]
    ollama_llm_name = CONFIG["OLLAMA_LLM_MODEL"]
    ollama_vlm_name = CONFIG["OLLAMA_VLM_MODEL"]
    num_results = CONFIG["SEARCH_NUM_RESULTS"]
    top_k = CONFIG["RERANK_TOP_K"]
    image_size = CONFIG["IMAGE_SIZE"]

    # tolerate a missing claim so the .strip() calls below cannot fail on None
    text_claim = text_claim or ""

    # preprocessing
    if image_path:
        # image preprocessing
        processed_image_path = preprocess_image(image_path, image_size)
        final_image_path = processed_image_path or image_path

        # image captioning
        image_claim = caption(final_image_path, text_claim, ollama_vlm_name)

        # vlm postprocessing; if the VLM returned nothing, fall back to the text claim
        query = preprocess_text(image_claim or text_claim)

        if not text_claim.strip():
            text_claim = image_claim # if image only, use query as text claim
    else:
        # text preprocessing
        query = preprocess_text(text_claim)

    # an empty query cannot be searched; report it like an empty result set
    if not query.strip():
        return no_documents_premise, []

    # retrieval
    urls = search(query, num_results)

    documents = retrieve(urls)
    if not documents:
        return no_documents_premise, []

    # embedding
    claim_embeddings, document_embeddings, urls, document_texts = embed(text_claim, documents, ollama_emb_name)

    # reranking
    best_docs = rerank(claim_embeddings, document_embeddings, urls, document_texts, top_k)

    # optional: show reranked document contents
    for url, text, score in best_docs:
        print(f"{url}\nScore: {score:.3f}\nText: {text[:301]}\n")

    # premise generation: pass only the document texts. Previously the whole
    # (url, text, score) tuples were interpolated into the prompt as tuple reprs.
    best_doc_texts = [text for _, text, _ in best_docs]
    premise = generate_premise(text_claim, best_doc_texts, ollama_llm_name)

    # get best document URLs
    best_doc_urls = [url for url, _, _ in best_docs]

    # return premise and best document URLs
    return premise, best_doc_urls

## Test Claim

**Notes**

*   Claims in English are processed better than claims in Filipino. Seek more robust (maybe multilingual-LLM-based) solutions as a possible optimization step.

*   With limited testing, decomposing claims into multiple subclaims has yet to prove useful. It multiplies the processing time (2-5x), but Google Search seems to be powerful enough with just one query.
   
*   Reranking scores improve (~20%) when using multilingual embedding models for claims/documents in Filipino. Multilingual models allow for shared embedding spaces across languages, e.g. mixed English/Filipino documents.

In [20]:
# Image-only example: the claim text is left empty so the pipeline derives it from the image.
claim = ""
image_path = "../datasets/documents/1.png"
# NOTE(review): presumably the reference fact-check article for this claim; it is not
# used below and is kept for manual comparison with the retrieved URLs.
link = "https://www.factrakers.org/post/vico-sotto-atasha-muhlach-pregnancy-hoax-resurfaces"

print("\nProcessing claim...\n")
generated_premise, retrieved_urls = pipeline(claim, image_path)
print("\nFinished processing!\n")

print(f"Generated Premise:\n{generated_premise}\n")
# Built by concatenation: reusing double quotes and a backslash inside an f-string
# expression only parses on Python 3.12+.
print("Retrieved URLs:\n" + "".join(f"{url}\n" for url in retrieved_urls))


Processing claim...

Generating query from image: '../datasets/documents/1_processed.png'
Generating query from text: 'AGA MULACH ACCEPTED ATASHA MUH-LACH'S PREGNANCY WITH VICO SOTTO'
Searching with query: 'aga mulach accepted atasha muh lach pregnancy vico sotto'
Found 5 URLs
Fetching article with URL: https://interaksyon.philstar.com/celebrities/2022/09/06/227243/what-le-bal-said-after-inviting-atasha-muhlach-to-paris-debutante-ball/ (SCRAPING)
Fetching article with URL: https://interaksyon.philstar.com/rumor-cop/2025/05/26/296882/fact-check-vico-sotto-atasha-muhlach-pregnancy-hoax-resurfaces/ (SCRAPING)
Fetching article with URL: https://www.factrakers.org/post/vico-sotto-atasha-muhlach-pregnancy-hoax-resurfaces (SCRAPING)
Fetching article with URL: https://interaksyon.philstar.com/celebrities/2022/11/28/235995/pinoy-pride-heart-evangelista-cheers-atasha-muhlach-for-representing-ph-in-paris-ball/ (SCRAPING)
Fetching article with URL: https://www.factrakers.org/post/message-on-coron

# DATASET

In [21]:
def get_df(path, output_dir="data/extracted_images"):
    """
    Load the active worksheet of an Excel workbook into a DataFrame and extract its
    embedded images.

    The first worksheet row supplies the column names; remaining rows are the data.
    Every embedded image is saved to `output_dir` as a PNG and, when its anchor row
    corresponds to a data row, its file path is stored in the "Image Path" column
    of that row.

    Args:
        path: Path of the .xlsx workbook.
        output_dir: Directory that receives the extracted images (created if missing).

    Returns:
        The DataFrame with an added "Image Path" column (None where no image matched).
    """
    from openpyxl.utils import get_column_letter

    wb = load_workbook(path)
    ws = wb.active

    os.makedirs(output_dir, exist_ok=True)

    # load text
    headers = [cell.value for cell in ws[1]]
    data = [list(row) for row in ws.iter_rows(min_row=2, values_only=True)]
    df = pd.DataFrame(data, columns=headers)
    df["Image Path"] = None

    # extract images and map to row numbers
    for idx, img in enumerate(ws._images):
        anchor = img.anchor
        ref = None      # cell reference such as "H2", used in the file name
        row_num = None  # 1-based worksheet row of the anchor cell

        if isinstance(anchor, str):
            ref = anchor
            digits = ''.join(filter(str.isdigit, ref))
            row_num = int(digits) if digits else None
        elif hasattr(anchor, "_from"):
            # anchor markers are 0-based; get_column_letter takes a 1-based index
            # and, unlike chr(col + 65), also handles columns beyond "Z"
            row_num = anchor._from.row + 1
            ref = f"{get_column_letter(anchor._from.col + 1)}{row_num}"

        filename = f"img_{idx+1}_{ref or 'unknown'}.png"
        filepath = os.path.join(output_dir, filename)
        with Image.open(io.BytesIO(img._data())) as pil_img:
            pil_img.save(filepath)

        # match image to row
        if row_num is None:
            continue
        # worksheet row 1 is the header, so worksheet row 2 is DataFrame index 0
        df_index = row_num - 2
        if 0 <= df_index < len(df):
            df.loc[df_index, "Image Path"] = filepath
        else:
            # assigning through .loc with an unknown label would append a phantom row
            print(f"Skipping image '{filename}': anchor row {row_num} is outside the data rows.")

    return df

In [22]:
def process_dataset(pipeline, df: pd.DataFrame) -> pd.DataFrame:
    """
    Run `pipeline` on every row of `df` and record the results.

    Args:
        pipeline: Callable taking (claim, image_path) and returning (premise, urls).
        df: DataFrame with "Hypothesis/Claims", "Image Path" and "Link" columns.

    Returns:
        The same DataFrame (modified in place) with three added columns:
        "Generated Premise", "Match" (True when the row's link equals one of the
        retrieved URLs, ignoring case and surrounding whitespace) and "Retrieved_URLs".
    """

    def process_row(row, pipeline) -> pd.Series:
        """Process a single row; never raises for bad cell values or pipeline errors."""
        claim = row.get("Hypothesis/Claims")
        image_path = row.get("Image Path")
        link = row.get("Link")

        # validate inputs
        if not isinstance(claim, str) or not claim.strip():
            return pd.Series({
                'Generated Premise': None,
                'Match': False,
                'Retrieved_URLs': []
            })
        if not (isinstance(image_path, str) and os.path.exists(image_path)):
            image_path = None

        # pipeline execution
        try:
            premise, urls = pipeline(claim, image_path)
            urls = urls or []
        except Exception as e:
            print(f"Error processing claim '{claim[:50]}...': {e}")
            premise, urls = None, []

        # check for matching urls
        match = False
        # An empty cell may arrive as NaN (a truthy float), so test the type rather
        # than truthiness before calling string methods on the link.
        if isinstance(link, str) and link.strip() and urls:
            normalized_link = link.lower().strip()
            match = any(
                isinstance(url, str) and normalized_link == url.lower().strip()
                for url in urls
            )

        # print premise
        print(f"Generated Premise: '{premise}'")
        print(f"---- ---- ---- ---- ---- ----")

        return pd.Series({
            'Generated Premise': premise,
            'Match': match,
            'Retrieved_URLs': urls
        })

    # process and merge original df
    result_df = df.apply(lambda row: process_row(row, pipeline), axis=1)
    df[["Generated Premise", "Match", "Retrieved_URLs"]] = result_df

    return df

In [23]:
# Load the evaluation spreadsheet (text columns plus embedded images) into a DataFrame.
input_path = "../datasets/sheets/dataset.xlsx"
input_df = get_df(input_path)

# Optional: uncomment to run on a small subset while iterating.
# # temp: only process first 10 rows
# input_df = input_df.head(10)

# Runs the full pipeline once per row; process_dataset adds its result columns to
# input_df in place and returns that same DataFrame.
output_df = process_dataset(pipeline, input_df)
print("\nDataset processing complete.")

Generating query from image: 'data/extracted_images\img_1_H2_processed.png'
Generating query from text: 'Guerrero Noticias Facebook page underwater wreckage deep-sea submersible Titan'
Searching with query: 'guerrero noticias facebook page underwater wreckage deep sea submersible titan'
Found 1 URLs
Fetching article with URL: https://www.rappler.com/newsbreak/fact-check/facebook-post-shows-no-real-images-titan-submersible-wreckage/ (SCRAPING)
Saving updated web content cache...
Embedding 1 documents...
Generating query embedding...
Query embedding dimension: 1024
Successfully embedded 1 documents.

Reranking 1 documents...
https://www.rappler.com/newsbreak/fact-check/facebook-post-shows-no-real-images-titan-submersible-wreckage/
Score: 0.777
Text: Disaster Fact Checks FACT CHECK: Facebook post shows no real images of Titan submersible wreckage Jun 23, 2023 11:04 PM PHT Rappler.com SUMMARY This is AI generated summarization, which may have errors. For context, always refer to the full a

In [24]:
# Write the results to ./outputs/<name>.xlsx, where <name> is the base name (without
# extension) of the path returned by ipynbname.path().
# NOTE(review): presumably that is this notebook's own file path — confirm against ipynbname docs.
nb_path = ipynbname.path()
pipeline_name = os.path.splitext(os.path.basename(str(nb_path)))[0]
OUTPUT_DIR = os.path.join(os.getcwd(), "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)
output_path = os.path.join(OUTPUT_DIR, f"{pipeline_name}.xlsx")
# index=False: do not write the DataFrame index as a column
output_df.to_excel(output_path, index=False)