**TODO:** Add a figure of the methodology framework.

# SETUP

## Imports

In [106]:
import base64
import io
import os
import re
import time
from typing import Optional

import nltk
import numpy as np
import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from googleapiclient.discovery import build
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from openpyxl import load_workbook
from openpyxl.drawing.spreadsheet_drawing import OneCellAnchor, TwoCellAnchor
from PIL import Image


In [107]:
# read the Google Custom Search credentials from the local .env file
load_dotenv()
API_KEY = os.environ.get("API_KEY")
CSE_ID = os.environ.get("CSE_ID")

## Ollama

In [108]:
# check for Ollama instance: poll the tags endpoint up to 30 times, pausing one
# second after EVERY unsuccessful attempt (connection error or non-200 reply) so
# the retry budget is not used up instantly while the server is still warming up
for _ in range(30):
    try:
        r = requests.get("http://localhost:11434/api/tags", timeout=1)
        if r.status_code == 200:
            print("Ollama served at http://localhost:11434/")
            break
    except requests.RequestException:
        # server not reachable yet; fall through to the shared sleep below
        pass
    time.sleep(1)
else:
    # the loop finished without a successful response
    raise RuntimeError("Ollama failed to start.")

Ollama served at http://localhost:11434/


In [109]:
# pull models from https://ollama.com/library
# ! ollama pull qwen3-embedding:0.6b
# ! ollama pull qwen3:0.6b
# ! ollama pull qwen2.5vl:3b

# CLAIM PROCESSING


In [110]:
# download nltk resources into a project-local cache directory
NLTK_DATA_DIR = os.path.join(os.getcwd(), "cache/nltk_data")
os.makedirs(NLTK_DATA_DIR, exist_ok=True)

# register the cache only once so re-running this cell does not keep
# appending duplicate entries to nltk's search path
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    # nltk.download reports failure through its return value; surface it
    # instead of silently continuing (best-effort: a cached copy may still work)
    if not nltk.download(resource, quiet=True, download_dir=NLTK_DATA_DIR):
        print(f"WARNING: could not download NLTK resource '{resource}'")

True

## Text Processing

In [111]:
def make_query(claim: str) -> str:
    """
    Turn a free-text claim into a space-separated keyword search query.

    The claim is lowercased, every character other than a-z, 0-9 and whitespace
    is replaced by a space, and the result is tokenized. English stop words and
    tokens of two characters or fewer are dropped, the remaining tokens are
    lemmatized, and duplicate lemmas are removed keeping first-occurrence order.
    """
    print(f"Generating query from text: '{claim}'")

    # normalization
    normalized = re.sub(r"[^a-z0-9\s]", " ", claim.lower())

    # tokenization, then stop-word / short-token filtering
    raw_tokens = word_tokenize(normalized)
    english_stops = set(stopwords.words("english"))
    keywords = []
    for token in raw_tokens:
        if token in english_stops or len(token) <= 2:
            continue
        keywords.append(token)

    # lemmatization with order-preserving de-duplication
    lemmatize = WordNetLemmatizer().lemmatize
    seen = set()
    unique_lemmas = []
    for token in keywords:
        lemma = lemmatize(token)
        if lemma not in seen:
            seen.add(lemma)
            unique_lemmas.append(lemma)

    return " ".join(unique_lemmas)

## Image Processing

In [112]:
def caption(image_path: str, text_claim: Optional[str], model: str) -> str:
    """
    Generate a descriptive text claim from an image claim using an Ollama vision model.

    If a non-blank text claim is supplied, the model is asked to enrich it with
    context from the image; otherwise (``None``, empty or whitespace-only) the
    model is asked to extract the statement to fact-check from the image alone.

    Args:
        image_path: Path of the image file; it is read and sent base64-encoded.
        text_claim: Optional accompanying text claim.
        model: Name of the Ollama model to query.

    Returns:
        The stripped ``response`` field of the Ollama reply, or an empty string
        when the reply has no such field.
    """
    print(f"Generating query from image: '{image_path}'")

    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    # text_claim is Optional: guard against None before calling .strip()
    if text_claim is not None and text_claim.strip():
        prompt = f"""
        From the provided image, add more context to the text statement for fact-checking.
        Keep the statement concise and optimal as a search query.
        Respond only with the new statement as plain text.

        Statement: {text_claim}
        """
    else:
        prompt = """
        From the provided image, extract the statements to be fact-checked.
        Keep the statement concise and optimal as a search query.
        Respond only with the statement as plain text.
        """

    # non-streaming request so the whole answer arrives in a single JSON body
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model, "prompt": prompt, "images": [image_b64], "stream": False},
    )

    return response.json().get("response", "").strip()

# RETRIEVAL

In [113]:
def search(query: str, num_results: int) -> list[str]:
    """
    Look up a query with the Google Custom Search API.

    Returns the "link" value of every item in the response, in response order;
    the list is empty when the response carries no "items".
    """
    print(f"Searching with query: '{query}'")

    cse_client = build("customsearch", "v1", developerKey=API_KEY)
    request = cse_client.cse().list(q=query, cx=CSE_ID, num=num_results)
    payload = request.execute()
    links = [hit["link"] for hit in payload.get("items", [])]

    print(f"Found {len(links)} URLs")
    return links

In [114]:
def fetch_text(url: str) -> str:
    """
    Download a page and return the text of its paragraph elements.

    The text of every matched <p> element is stripped and joined with single
    spaces. An empty string is returned when the status code is not 200, when
    no text is found, or when any exception is raised along the way.
    """
    print(f"Fetching article with URL: {url}")
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        if response.status_code == 200:
            page = BeautifulSoup(response.text, "html.parser")
            paragraphs = [
                node.get_text(strip=True)
                for node in page.select("article p, .entry-content p, p")
            ]
            body = " ".join(paragraphs)
            if body.strip():
                return body
            return ""
        print(f"    Failed to fetch: ({response.status_code})")
        return ""
    except Exception as e:
        print(f"    Error while fetching: {e}")
        return ""

In [None]:
def retrieve(urls: list[str]) -> list[tuple[str, str]]:
    """
    Fetch the article text behind each URL, in order.

    Returns (url, text) pairs for the URLs whose fetched text is not blank.
    """
    # lazily fetch one page at a time, then keep only the non-blank results
    fetched = ((url, fetch_text(url)) for url in urls)
    documents = [(url, body) for url, body in fetched if body.strip()]

    print(f"Successfully retrieved {len(documents)} documents.\n")
    return documents

# EMBEDDING AND RERANKING



## Embedding

In [116]:
def embed(
    query: str, documents: list[tuple[str, str]], model_name: str
) -> tuple[torch.Tensor, torch.Tensor, list[str], list[str]]:
    """
    Embed a query and a list of (url, text) documents through the local Ollama server.

    Documents whose embedding comes back empty, or whose dimension differs from
    the query embedding, are skipped with a warning.

    Returns (query_embeddings, document_embeddings, urls, document_texts), where
    the url/text lists contain only the documents that were embedded.
    Raises RuntimeError when no documents are given, when the query embedding
    fails, or when no document yields a usable embedding.
    """
    if not documents:
        raise RuntimeError("No documents provided for embedding.")

    print(f"Embedding {len(documents)} documents...")

    def request_embedding(text: str):
        """Ask Ollama for one embedding; return [] on any error or missing field."""
        payload = {"model": model_name, "prompt": text}
        try:
            reply = requests.post("http://localhost:11434/api/embeddings", json=payload)
            return reply.json().get("embedding", [])
        except Exception as e:
            print(f"    Error getting embedding: {e}")
            return []

    # query embedding
    print("Generating query embedding...")
    query_vec = request_embedding(query)
    if not query_vec:
        raise RuntimeError("Query embedding failed.")

    expected_dim = len(query_vec)
    print(f"Query embedding dimension: {expected_dim}")

    # document embeddings: collect (url, text, vector) for every usable result
    kept = []
    for url, text in documents:
        print(f"Embedding document with URL: {url}")
        vec = request_embedding(text)
        if vec and len(vec) == expected_dim:
            kept.append((url, text, vec))
        elif not vec:
            print("    WARNING: Empty embedding for document. Skipping")
        else:
            print(f"    WARNING: Embedding dimension mismatch ({len(vec)} vs {expected_dim}). Skipping.")

    if not kept:
        raise RuntimeError("No valid document embeddings generated.")
    print(f"Successfully embedded {len(kept)} documents.\n")

    valid_urls = [url for url, _, _ in kept]
    valid_texts = [text for _, text, _ in kept]
    doc_vecs = [vec for _, _, vec in kept]

    query_embeddings = torch.tensor(np.array([query_vec]), dtype=torch.float32)
    document_embeddings = torch.tensor(np.array(doc_vecs), dtype=torch.float32)

    return query_embeddings, document_embeddings, valid_urls, valid_texts

## Reranking

In [117]:
def rerank(
    query_embeddings: Optional[torch.Tensor],
    document_embeddings: Optional[torch.Tensor],
    urls: list[str],
    document_texts: list[str],
    top_k: int = 3,
) -> list[tuple[str, str, float]]:
    """
    Rerank precomputed embeddings using cosine similarity.

    Args:
        query_embeddings: 2-D tensor whose first row is the query embedding.
        document_embeddings: 2-D tensor with one row per document.
        urls: Document URLs, aligned with the rows of ``document_embeddings``.
        document_texts: Document texts, aligned the same way.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` (url, text, score) tuples sorted by descending score,
        with ``score`` as a plain Python float.

    Raises:
        RuntimeError: If either embedding tensor is ``None``.
    """
    if query_embeddings is None or document_embeddings is None:
        raise RuntimeError("Query or document embeddings not found.")
    print(f"Reranking {len(document_texts)} documents...")

    # Clamp the norms away from zero: an all-zero embedding would otherwise
    # divide by zero and yield NaN scores, which make the sort order undefined.
    eps = 1e-12
    query_norm = query_embeddings / query_embeddings.norm(dim=1, keepdim=True).clamp_min(eps)
    doc_norms = document_embeddings / document_embeddings.norm(dim=1, keepdim=True).clamp_min(eps)

    # row 0 holds the similarities of the (single) query against every document;
    # tolist() converts them to Python floats, matching the annotated return type
    scores = torch.mm(query_norm, doc_norms.T)[0].cpu().tolist()

    ranked = list(zip(urls, document_texts, scores))
    ranked.sort(key=lambda x: x[2], reverse=True)

    return ranked[:top_k]

# PREMISE GENERATION

In [118]:
def generate_premise(claim: str, documents: list[str], model: str):
    """
    Ask an Ollama model to condense retrieved evidence into a short premise.

    Each document is rendered as a "- " bullet inside the prompt, and the
    stripped "response" field of the reply is returned (empty string if absent).
    """
    bullets = (f"- {document}" for document in documents)
    joined_documents = "\n".join(bullets)

    prompt = f"""
    Claim: "{claim}"

    Documents:
    {joined_documents}

    From the fact-checking articles, extract the basis that supports or refutes the claim.
    Ignore documents that neither support nor refute the claim.
    Only give the evidence as plain text.
    """

    # non-streaming request: the full answer comes back in one JSON body
    payload = {"model": model, "prompt": prompt, "stream": False}
    reply = requests.post("http://localhost:11434/api/generate", json=payload)
    answer = reply.json().get("response", "")

    return answer.strip()

# PIPELINE

In [119]:
def pipeline(
    text_claim: str, image_path: Optional[str], retrieval_only: bool = True
) -> "list[str] | tuple[str, Optional[str]]":
    """
    Claim-processing pipeline: build a query, search, and optionally retrieve,
    rerank and summarise the evidence.

    Args:
        text_claim: The textual claim; may be empty when only an image is given.
        image_path: Optional path to an image claim. When set, the image is
            captioned and the caption is used to build the search query.
        retrieval_only: When True (the default, matching how the evaluation
            cells in this notebook call the function), stop after the search
            step and return the list of URLs. When False, run the full
            pipeline.

    Returns:
        ``retrieval_only=True``: the list of URLs returned by the search.
        ``retrieval_only=False``: a ``(premise, top_url)`` tuple; ``top_url`` is
        ``None`` when no document text could be retrieved.
    """
    # parameters
    ollama_emb_name = "qwen3-embedding:0.6b"
    ollama_llm_name = "qwen3:0.6b"
    ollama_vlm_name = "qwen2.5vl:3b"
    num_results = 5
    top_k = 1

    # treat a missing text claim like an empty one so .strip() is always safe
    text_claim = text_claim or ""

    # preprocessing
    if image_path:
        image_claim = caption(image_path, text_claim, ollama_vlm_name)
        query = make_query(image_claim)
        if not text_claim.strip():
            text_claim = image_claim  # if image only, use response as text claim
    else:
        query = make_query(text_claim)

    # retrieval
    urls = search(query, num_results)
    if retrieval_only:
        # explicit switch instead of an unconditional early return that left
        # the rest of the function unreachable
        return urls

    documents = retrieve(urls)
    if not documents:
        # same (premise, url) shape as the normal return below
        return "No documents related to the claim were found.", None

    # embedding
    claim_embeddings, document_embeddings, doc_urls, document_texts = embed(
        text_claim, documents, ollama_emb_name
    )

    # reranking
    best_docs = rerank(claim_embeddings, document_embeddings, doc_urls, document_texts, top_k)
    for url, text, score in best_docs:
        print(f"{url}\nScore: {score:.3f}\nText: {text[101:301]}\n")

    # premise generation: pass only the document texts, not the (url, text, score) tuples
    premise = generate_premise(text_claim, [text for _, text, _ in best_docs], ollama_llm_name)

    return premise, best_docs[0][0]

## Test Claim

**Notes**

*   Claims in English are processed better than claims in Filipino. Seek more robust (maybe multilingual-LLM-based) solutions as a possible optimization step.

*   With limited testing, decomposing claims into multiple subclaims has yet to prove useful. It multiplies the processing time (2-5x), while Google Search seems to be effective enough with just one query.
   
*   Reranking scores are better (~20%) when using multilingual embedding models for claims/documents in Filipino. Multilingual models allow for shared embedding spaces across languages, e.g. mixed English/Filipino documents.

For image input: show the image side by side with the output text.

For text input: print excerpts/snippets of the retrieved documents.

In [120]:
# single-claim smoke test: does the expected fact-check link show up in the results?
claim = "Vico Sotto and Atasha Mulach pregnancy"
image_path = "./datasets/documents/1.png"
link = "https://www.factrakers.org/post/vico-sotto-atasha-muhlach-pregnancy-hoax-resurfaces"

print("\nProcessing claim...\n")
urls = pipeline(claim, image_path)
print("\nFinished processing!\n")

print(f"MATCH: {link}" if link in urls else f"NO MATCH: {urls}")


Processing claim...

Generating query from image: './datasets/documents/1.png'
Generating query from text: 'Vico Sotto and Atasha Mulach pregnancy confirmed'
Searching with query: 'vico sotto atasha mulach pregnancy confirmed'
Found 5 URLs

Finished processing!

MATCH: https://www.factrakers.org/post/vico-sotto-atasha-muhlach-pregnancy-hoax-resurfaces


# DATASET

In [121]:
def get_df(path: str, output_dir: str = "datasets/extracted_images") -> pd.DataFrame:
    """
    Load the active worksheet of an Excel workbook into a DataFrame and extract
    its embedded images.

    The first sheet row supplies the column headers and every following row
    becomes a DataFrame row. Each embedded image is saved as a PNG in
    ``output_dir`` and, when its anchor points at a data row, the saved path is
    written to that row's "Image Path" column.

    Args:
        path: Path of the .xlsx workbook.
        output_dir: Directory the images are written to (created if missing).

    Returns:
        The DataFrame with an additional "Image Path" column (None where a row
        has no image).
    """
    # function-scope import keeps this block self-contained
    from openpyxl.utils import get_column_letter

    wb = load_workbook(path)
    ws = wb.active

    os.makedirs(output_dir, exist_ok=True)

    # load all text data
    headers = [cell.value for cell in ws[1]]
    data = [list(row) for row in ws.iter_rows(min_row=2, values_only=True)]
    df = pd.DataFrame(data, columns=headers)

    # column for image paths
    df["Image Path"] = None

    # extract images and map them to row numbers
    for idx, img in enumerate(ws._images):
        anchor = img.anchor
        sheet_row = None  # 1-based worksheet row of the image's top-left corner
        if isinstance(anchor, str):
            ref = anchor
            digits = "".join(filter(str.isdigit, ref))
            sheet_row = int(digits) if digits else None
        elif hasattr(anchor, "_from"):
            # anchor._from holds 0-based col/row; get_column_letter also handles
            # columns beyond Z, which chr(col + 65) did not
            sheet_row = anchor._from.row + 1
            ref = f"{get_column_letter(anchor._from.col + 1)}{sheet_row}"
        else:
            ref = None

        filename = f"img_{idx+1}_{ref or 'unknown'}.png"
        filepath = os.path.join(output_dir, filename)
        # context manager releases the decoded image once it has been saved
        with Image.open(io.BytesIO(img._data())) as pil_img:
            pil_img.save(filepath)

        # match image to row
        if sheet_row is None:
            continue
        row_idx = sheet_row - 2  # sheet row 2 is the first data row (index 0)
        if 0 <= row_idx < len(df):
            df.loc[row_idx, "Image Path"] = filepath
        else:
            # assigning through .loc with an unknown label would silently add
            # a new, otherwise empty row to the frame
            print(f"WARNING: image '{filename}' is anchored outside the data rows. Skipping.")

    return df

In [122]:
def process_dataset(pipeline, df: pd.DataFrame):
    """
    Run ``pipeline`` on every row of the dataset and record whether the
    expected link was retrieved.

    Args:
        pipeline: Callable ``pipeline(claim, image_path)`` returning the
            retrieved URLs for a claim.
        df: DataFrame read through the "Hypothesis/Claims", "Image Path" and
            "Link" columns.

    Returns:
        The same DataFrame with a "Match" column added in place: True/False per
        processed row, None for rows without a usable claim. A row whose
        pipeline call raises is reported and counted as False.
    """
    matches = []

    for _, row in df.iterrows():
        claim = row.get("Hypothesis/Claims")
        image_path = row.get("Image Path")
        link = row.get("Link")

        # rows without a textual claim are skipped and excluded from scoring
        if not isinstance(claim, str) or not claim.strip():
            matches.append(None)
            continue

        # only forward image paths that actually exist on disk
        if not (isinstance(image_path, str) and os.path.exists(image_path)):
            image_path = None

        try:
            url = pipeline(claim, image_path)
        except Exception as e:
            print(f"Error processing row: {e}")
            url = None

        # a failed pipeline call leaves url as None; `link in None` would raise
        # TypeError and abort the whole run, so treat it as "no match"
        match = bool(url) and isinstance(link, str) and link in url
        if match:
            print(f"MATCH: {link}\n--- --- --- --- ---\n")
        else:
            print(f"NO MATCH:\nOriginal: {link}\nRetrieved: {url}\n--- --- --- --- ---\n")
        matches.append(match)

    df["Match"] = matches
    return df

In [123]:
# load the dataset sheet (embedded images are extracted by get_df), then run the
# pipeline on every row; output_df carries the per-row "Match" column
input_df = get_df("datasets/sheets/Patunai-IR-image-augemented-dataset.xlsx")
output_df = process_dataset(pipeline, input_df)
print("\nDataset processing complete.")

Generating query from image: 'datasets/extracted_images\img_1_H2.png'
Generating query from text: 'A post on the Facebook page Guerrero Noticias features a series of pictures of underwater wreckage, which is allegedly related to the missing deep-sea submersible Titan.'
Searching with query: 'post facebook page guerrero noticias feature series picture underwater wreckage allegedly related missing deep sea submersible titan'
Found 1 URLs
MATCH: https://www.rappler.com/newsbreak/fact-check/facebook-post-shows-no-real-images-titan-submersible-wreckage/
--- --- --- --- ---

Generating query from text: 'As of writing (Jun 23, 2023), no images of the Titan’s wreckage and debris have been released to the media.'
Searching with query: 'writing jun 2023 image titan wreckage debris released medium'
Found 1 URLs
MATCH: https://www.rappler.com/newsbreak/fact-check/facebook-post-shows-no-real-images-titan-submersible-wreckage/
--- --- --- --- ---

Generating query from image: 'datasets/extracted_ima

In [124]:
# make sure the target directory exists before writing the results sheet
os.makedirs("datasets/output", exist_ok=True)
output_df.to_excel("datasets/output/retrieval_only_image_caption.xlsx", index=False)

## Retrieval Accuracy

In [125]:
def show_match_score(df: pd.DataFrame) -> float:
    """
    Print and return the retrieval accuracy stored in the "Match" column.

    Rows whose "Match" value is missing are ignored; the accuracy is the share
    of True values among the remaining rows.

    Args:
        df: DataFrame containing a "Match" column.

    Returns:
        The accuracy as a float in [0, 1]; 0.0 when no row has a "Match" value.

    Raises:
        ValueError: If the DataFrame has no "Match" column.
    """
    if "Match" not in df.columns:
        raise ValueError("DataFrame must contain a 'Match' column.")

    # drop unscored rows, then force a boolean dtype so mean()/sum() do not
    # depend on how the mixed True/False/None column was stored
    valid_matches = df["Match"].dropna().astype(bool)

    if len(valid_matches) == 0:
        print("No valid matches found in dataset.")
        return 0.0

    score = float(valid_matches.mean())
    print(f"Retrieval accuracy: {score:.2%} ({int(valid_matches.sum())}/{len(valid_matches)} correct)")
    # return the score on this path too, consistent with the empty case above
    return score

In [126]:
# print the retrieval accuracy over the rows that have a recorded "Match" value
show_match_score(output_df)

Retrieval accuracy: 68.00% (34/50 correct)
