In [None]:
# Clean out partial builds
!pip uninstall -y mim mmcv
!pip install -U pip setuptools wheel
!pip install "torch>=2.1" "mmcv==2.1.0" "mmdet==3.3.0" --no-cache-dir


Collecting mmcv==2.1.0
  Downloading mmcv-2.1.0.tar.gz (471 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mmdet==3.3.0
  Downloading mmdet-3.3.0-py3-none-any.whl.metadata (29 kB)
Collecting addict (from mmcv==2.1.0)
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting mmengine>=0.3.0 (from mmcv==2.1.0)
  Downloading mmengine-0.10.7-py3-none-any.whl.metadata (20 kB)
Collecting yapf (from mmcv==2.1.0)
  Downloading yapf-0.43.0-py3-none-any.whl.metadata (46 kB)
Collecting terminaltables (from mmdet==3.3.0)
  Downloading terminaltables-3.1.10-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading mmdet-3.3.0-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m126.3 MB/s[0m  [33m0:00:00[0m
[?25hDownloading mmengine-0.10.7-py3-none-any.whl (452 kB)
Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)
Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)
Downloading yapf-0.43.0-py3-none-an

In [2]:
import json

dataset_path = '/content/sample_data/1hop.json'

with open(dataset_path, 'r') as f:
    data = json.load(f)

print(f"Loaded {len(data)} entries.")
print("\nExample entry structure:\n")
print(json.dumps(data[0], indent=2)[:1000])  # Print first entry (truncated)


FileNotFoundError: [Errno 2] No such file or directory: '/content/sample_data/1hop.json'

In [None]:
import json
import os
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

class OneHopDataset(Dataset):
    def __init__(self, json_path, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])

        # Load JSON
        with open(json_path, 'r') as f:
            self.data = json.load(f)

        # Filter or validate entries if needed
        print(f"Loaded {len(self.data)} entries from {json_path}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # --- Parse fields ---
        question = item.get("question", "")
        answer = item.get("answer", "")
        image_filename = item.get("image", None)

        # --- Load image if available ---
        if image_filename:
            image_path = os.path.join(self.img_dir, image_filename)
            try:
                image = Image.open(image_path).convert("RGB")
            except FileNotFoundError:
                image = Image.new("RGB", (224, 224), color=(0, 0, 0))
        else:
            image = Image.new("RGB", (224, 224), color=(0, 0, 0))

        image = self.transform(image)

        return {
            "question": question,
            "answer": answer,
            "image": image
        }

In [None]:
from torch.utils.data import DataLoader

dataset = OneHopDataset(json_path='/content/sample_data/1hop.json', img_dir='/content/images')
loader = DataLoader(dataset, batch_size=2, shuffle=True)

for batch in loader:
    print(batch['image'].shape)
    print(batch['question'])
    break


In [None]:
from mmcv.transforms import Compose

pipeline = Compose([
    dict(type='Resize', scale=(1333, 800)),
    dict(type='RandomFlip', prob=0.5)
])

dataset = OneHopDataset('/content/sample_data/1hop.json', '/content/images', transform=pipeline)


Loaded 5884 entries from /content/sample_data/1hop.json


In [None]:
import json

# Load the dataset
dataset_path = '/content/sample_data/1hop.json'
with open(dataset_path, 'r') as f:
    data = json.load(f)

# Filter: keep entries that have non-empty text evidence
filtered_data = [entry for entry in data if entry.get('text_evidence')]

print(f"Original entries: {len(data)}")
print(f"Entries with text evidence: {len(filtered_data)}")

# Save the filtered dataset
filtered_path = '/content/sample_data/1hop_text_only.json'
with open(filtered_path, 'w') as f:
    json.dump(filtered_data, f, indent=2)


Original entries: 5884
Entries with text evidence: 2590


In [None]:
!pip install sentence-transformers faiss-cpu

from sentence_transformers import SentenceTransformer
import faiss
import json
import numpy as np

# Load your filtered dataset
dataset_path = '/content/sample_data/1hop_text_only.json'
with open(dataset_path, 'r') as f:
    data = json.load(f)

# Combine all text_evidence into a single list per entry
corpus = []
corpus_ids = []  # To track which entry each text belongs to
for idx, entry in enumerate(data):
    for text in entry['text_evidence']:
        corpus.append(text)
        corpus_ids.append(idx)

# Step 1: Create embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(corpus, show_progress_bar=True, convert_to_numpy=True)

# Step 2: Build FAISS index
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)  # Inner product for cosine similarity
faiss.normalize_L2(embeddings)  # Normalize for cosine similarity
index.add(embeddings)

print(f"Retriever built with {len(corpus)} text evidences.")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/81 [00:00<?, ?it/s]

Retriever built with 2590 text evidences.


In [None]:
def retrieve_topk(claim, model, index, corpus, corpus_ids, k=5):
    # Encode the query claim
    query_emb = model.encode([claim], convert_to_numpy=True)
    faiss.normalize_L2(query_emb)

    # Search FAISS index
    D, I = index.search(query_emb, k)  # D = scores, I = indices

    # Get the actual text using indices
    retrieved_texts = [corpus[i]['wiki_context'] if isinstance(corpus[i], dict) else corpus[i] for i in I[0]]
    retrieved_entry_ids = [corpus_ids[i] for i in I[0]]

    return retrieved_texts, retrieved_entry_ids, D[0]

# Example claim
claim = "Airport security measures include baggage screening and passenger checks."

top_texts, top_ids, scores = retrieve_topk(claim, model, index, corpus, corpus_ids, k=5)

# Print top texts with scores
for i, (text, score) in enumerate(zip(top_texts, scores)):
    print(f"{i+1}. Score: {score:.4f}")
    print(f"Text evidence: {text[:500]}...\n")  # show first 500 characters for readability


1. Score: 0.1838
Text evidence: ecfef2d41f2a858f0082ee4b0a5ae015...

2. Score: 0.1807
Text evidence: af8111d29390321ab7b3bf3cf5bbba70...

3. Score: 0.1766
Text evidence: afa4b8e19e0abb72602f2b3171810ef3...

4. Score: 0.1766
Text evidence: af91a721e80a160194a8a5d5f4bc980d...

5. Score: 0.1764
Text evidence: afdc647c080430a0342ec80dd7bf4bea...



In [None]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# -------------------------------
# Step 0: Load your cleaned 1hop dataset
# -------------------------------
dataset_path = '/content/sample_data/1hop.json'
with open(dataset_path, 'r') as f:
    data = json.load(f)

# Filter entries that have text evidence
data = [entry for entry in data if entry.get('text_evidence')]

print(f"Number of entries with text evidence: {len(data)}")

# -------------------------------
# Step 1: Build corpus and ID mapping
# -------------------------------
corpus = []      # actual wiki_context texts
corpus_ids = []  # corresponding text IDs (first ID in text_evidence)

for entry in data:
    for te_id in entry['text_evidence']:
        corpus.append(entry['wiki_context'])  # use full wiki_context or split if needed
        corpus_ids.append(te_id)

print(f"Corpus size: {len(corpus)}")

# -------------------------------
# Step 2: Encode texts with SentenceTransformer
# -------------------------------
model = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = model.encode(corpus, convert_to_numpy=True, show_progress_bar=True)

# Normalize embeddings for cosine similarity
faiss.normalize_L2(corpus_embeddings)

# -------------------------------
# Step 3: Build FAISS index
# -------------------------------
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product = cosine similarity (after normalization)
index.add(corpus_embeddings)
print(f"FAISS index built with {index.ntotal} vectors")

# -------------------------------
# Step 4: Retrieve top-k texts for a claim
# -------------------------------
def retrieve_topk(claim, model, index, corpus, corpus_ids, k=5):
    # Encode the query claim
    query_emb = model.encode([claim], convert_to_numpy=True)
    faiss.normalize_L2(query_emb)

    # Search FAISS index
    D, I = index.search(query_emb, k)  # D = similarity scores, I = indices
    retrieved_texts = [corpus[i] for i in I[0]]
    retrieved_entry_ids = [corpus_ids[i] for i in I[0]]

    return retrieved_texts, retrieved_entry_ids, D[0]

# Example usage
claim = "Airport security measures include baggage screening and passenger checks."
top_texts, top_ids, scores = retrieve_topk(claim, model, index, corpus, corpus_ids, k=5)

for i, (text, score, te_id) in enumerate(zip(top_texts, scores, top_ids)):
    print(f"{i+1}. Score: {score:.4f}, Text Evidence ID: {te_id}\n{text}\n")


Number of entries with text evidence: 2590
Corpus size: 2590


Batches:   0%|          | 0/81 [00:00<?, ?it/s]

FAISS index built with 2590 vectors
1. Score: 0.3660, Text Evidence ID: 577751c1aa35a3e3883ae21b5c39ee00
Billy Bishop Toronto City Airport (IATA: YTZ, ICAO: CYTZ) is a regional airport located on the Toronto Islands in Toronto, Ontario, Canada. It is often referred to as Toronto Island Airport and was previously known as Port George VI Island Airport and Toronto City Centre Airport. The airport's name honours Billy Bishop, the Canadian World War I flying ace and World War II Air Marshal. It is used by civil aviation, air ambulances, and regional airlines using turboprop planes. In 2022, it was ranked Canada's ninth-busiest airport.

2. Score: 0.3475, Text Evidence ID: d0db7027ad6a84b1bbe78d6d89fa14f9
Ronald Reagan Washington National Airport (IATA: DCA, ICAO: KDCA, FAA LID: DCA) is a public airport located in Crystal City, in Arlington County, Virginia, United States, 5 miles (8.0 km) from Washington, D.C. It is the closest airport to Washington, D.C., the nation's capital, the 24th-bu

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# 1️⃣ Embedding model for retrieval
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 2️⃣ LLM model for reasoning
llm_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(llm_name)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_name)
llm_pipe = pipeline("text2text-generation", model=llm_model, tokenizer=tokenizer)

# Now retrieve top-k evidence using the embedding model
claim = "Airport security measures include baggage screening and passenger checks."
top_texts, top_ids, scores = retrieve_topk(claim, embedding_model, index, corpus, corpus_ids, k=5)

# Pass retrieved evidence to LLM
prompt = build_prompt(claim, top_texts)
output = llm_pipe(prompt, max_length=256, do_sample=False)
print(output[0]['generated_text'])


Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (982 > 512). Running this sequence through the model will result in indexing errors
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


SUPPORTED because airport security measures include baggage screening and passenger checks. The airport's name honours Billy Bishop, the Canadian World War I flying ace and World War II Air Marshal. It is used by civil aviation, air ambulances, and regional airlines using turboprop planes. In 2022, it was ranked Canada's ninth-busiest airport. Ronald Reagan Washington National Airport (IATA: DCA, ICAO: KDCA, FAA LID: DCA) is a public airport located in Crystal City, in Arlington County, Virginia, United States, 5 miles (8.0 km) from Washington, D.C., the nation's capital, the 24th-busiest airport in the nation, the busiest airport in the Washington metropolitan area , and the second busiest in the Washington–Baltimore combined statistical area. The airport opened in 1941 and was originally named Washington National Airport. Part of the original terminal is still in use as Terminal 1. A larger second terminal, now known as Terminal 2, opened in 1997. In 1998, Congress passed and Preside