In [1]:
!pip install -q \
    pandas pyarrow datasets==2.19.* \
    sentence-transformers==2.7.0 \
    faiss-cpu==1.8.0 \
    rank_bm25==0.2.2 \
    nltk==3.8.1 \
    tqdm==4.66.1 \
    transformers==4.40.* \
    accelerate==0.29.* \
    bitsandbytes==0.43.* 

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m98.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   

In [2]:
import os
import json
import gc
import time
import psutil
from pathlib import Path
import warnings

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import nltk

from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util, CrossEncoder
import faiss

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# --- Basic Configuration ---
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false" 
nltk.download('punkt', quiet=True)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# For reproducibility with some operations
if torch.cuda.is_available() and hasattr(torch.backends.cudnn, 'deterministic'):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

Using device: cuda


In [6]:
DATA_DIR = Path("/kaggle/input/all-the-news-dataset") 
MUST_ENTS  = ["Ossetia", "Abkhazia", "Brexit", "COVID-19", "Ukraine War", "Gaza conflict"] 
TARGET_SZ  = 30_000 

In [7]:

use_cols   = {"title", "article", "publication", "url"} # Start with the most critical and clearly named ones

must_rows, other_rows = [], []

def csv_iter(data_dir, use_cols_set):
    for fp in data_dir.glob("*.csv"):
        try:
            
            df_peek = pd.read_csv(fp, nrows=0)
            current_file_cols = set(df_peek.columns)
            cols_to_read = list(use_cols_set.intersection(current_file_cols))
            if not cols_to_read: # Skip if no relevant columns
                continue
            
            # Check for 'article' vs 'content'
            if 'content' in cols_to_read and 'article' not in cols_to_read and 'article' in use_cols_set:
                # If 'content' exists and 'article' is desired but not in file, map later
                pass
            elif 'article' in cols_to_read and 'content' not in cols_to_read and 'content' in use_cols_set:
                 # If 'article' exists and 'content' is desired but not in file, map later
                pass


            yield from pd.read_csv(fp, usecols=cols_to_read,
                               dtype=str, low_memory=False,
                               chunksize=20_000) # Smaller chunksize for memory
        except Exception as e:
            print(f"Skipping file {fp} due to error: {e}")
            continue

for chunk_idx, chunk in enumerate(tqdm(csv_iter(DATA_DIR, use_cols), desc="Scanning CSVs")):
    # Standardize 'article' column name
    if "content" in chunk.columns and "article" not in chunk.columns:
        chunk = chunk.rename(columns={"content": "article"})
    elif "article" in chunk.columns and "content" not in chunk.columns:
        # We'll use 'article' as the standard internally
        pass
    elif "content" in chunk.columns and "article" in chunk.columns:
        # If both exist, prioritize 'article' or define a rule
    
        chunk['article'] = chunk['article'].fillna(chunk['content'])
        chunk = chunk.drop(columns=['content'], errors='ignore')


    if "article" not in chunk.columns: # Skip if no article content
        continue

    chunk = chunk.dropna(subset=["article"])
    chunk['title'] = chunk['title'].fillna("") # Fill NaN titles

    mask  = chunk["article"].str.contains("|".join(MUST_ENTS), case=False, na=False)
    must_rows.append(chunk[mask])
    other_rows.append(chunk[~mask])
    

if not must_rows and not other_rows:
    raise ValueError("No data loaded. Check DATA_DIR path and CSV files.")

must_df  = pd.concat(must_rows,  ignore_index=True) if must_rows else pd.DataFrame()
other_df = pd.concat(other_rows, ignore_index=True) if other_rows else pd.DataFrame()

print(f"⚑ forced-entities found: {len(must_df)} rows")
print(f"Other documents found: {len(other_df)} rows")

if len(must_df) >= TARGET_SZ:
    df = must_df.sample(n=TARGET_SZ, random_state=RANDOM_SEED, replace=False)
    print(f"Sampled 'must_df' down to {TARGET_SZ} rows as it met or exceeded target size.")
else:
    remaining_needed = TARGET_SZ - len(must_df)
    if len(other_df) >= remaining_needed:
        sample_other = other_df.sample(n=remaining_needed, random_state=RANDOM_SEED, replace=False)
        df = pd.concat([must_df, sample_other], ignore_index=True)
    else: # If other_df is too small to fill the remaining
        df = pd.concat([must_df, other_df], ignore_index=True)
        print(f"Warning: Could only gather {len(df)} rows, less than target {TARGET_SZ}, as 'other_df' was too small.")

# Create a unique ID for each document
df['doc_id'] = range(len(df))
df["text_to_embed"] = df["title"].fillna("") + ". " + df["article"].str.strip()

print(f"✅ Final subset shape: {df.shape}")
print(df.head())

# Clean up large intermediate dataframes
del must_rows, other_rows, must_df, other_df
gc.collect()

Scanning CSVs: 0it [00:00, ?it/s]

⚑ forced-entities found: 79165 rows
Other documents found: 2505000 rows
Sampled 'must_df' down to 30000 rows as it met or exceeded target size.
✅ Final subset shape: (30000, 6)
                                                   title  \
26980  Credit Suisse to cut jobs as it pares back in ...   
77592  Iconic photos: The USNS Comfort arriving in Ne...   
27824  New Manhattan condos break price records, for now   
43987  Twitter Will Let You Temporarily Follow Olympi...   
66340  Ford to shut Spanish factory for one week due ...   

                                                 article  \
26980  ZURICH (Reuters) - Credit Suisse (CSGN.S) is t...   
77592  Two days after President Donald Trump took par...   
27824  The numbers are soaring higher than the buildi...   
43987  Twitter is about to embark on a 17 day-long te...   
66340  MADRID (Reuters) - Ford (F.N) said on Sunday i...   

                                                     url    publication  \
26980  https://www.reuters

157

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter # Using LangChain's splitter

# --- Chunking Parameters ---
CHUNK_SIZE = 500  # Characters
CHUNK_OVERLAP = 50 # Characters

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    add_start_index=True,
)

all_chunks = []
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Chunking documents"):
    chunks = text_splitter.split_text(row['text_to_embed'])
    for chunk_text in chunks:
        all_chunks.append({
            'doc_id': row['doc_id'], # Link back to original document
            'original_title': row['title'],
            'original_url': row.get('url', ''), # Get URL if exists
            'chunk_text': chunk_text
        })

chunks_df = pd.DataFrame(all_chunks)
chunks_df['chunk_id'] = range(len(chunks_df)) # Unique ID for each chunk

print(f"Total documents: {len(df)}")
print(f"Total chunks created: {len(chunks_df)}")
print(chunks_df.head())


Chunking documents:   0%|          | 0/30000 [00:00<?, ?it/s]

Total documents: 30000
Total chunks created: 278947
   doc_id                                     original_title  \
0       0  Credit Suisse to cut jobs as it pares back in ...   
1       0  Credit Suisse to cut jobs as it pares back in ...   
2       0  Credit Suisse to cut jobs as it pares back in ...   
3       0  Credit Suisse to cut jobs as it pares back in ...   
4       0  Credit Suisse to cut jobs as it pares back in ...   

                                        original_url  \
0  https://www.reuters.com/article/us-britain-eu-...   
1  https://www.reuters.com/article/us-britain-eu-...   
2  https://www.reuters.com/article/us-britain-eu-...   
3  https://www.reuters.com/article/us-britain-eu-...   
4  https://www.reuters.com/article/us-britain-eu-...   

                                          chunk_text  chunk_id  
0  Credit Suisse to cut jobs as it pares back in ...         0  
1  talks with the European Union. UBS and Credit ...         1  
2  Chief Executive Tidjane Thia

In [9]:
# BM25 expects a list of lists of tokens
tokenized_chunks_for_bm25 = [doc.lower().split() for doc in tqdm(chunks_df['chunk_text'], desc="Tokenizing for BM25")]

start_time = time.time()
bm25 = BM25Okapi(tokenized_chunks_for_bm25)
end_time = time.time()

print(f"✅ Indexed {len(tokenized_chunks_for_bm25)} chunks using in-memory BM25")
print(f"⏱️ BM25 Indexing Time: {end_time - start_time:.2f} seconds")


Tokenizing for BM25:   0%|          | 0/278947 [00:00<?, ?it/s]

✅ Indexed 278947 chunks using in-memory BM25
⏱️ BM25 Indexing Time: 7.09 seconds


In [10]:
def search_bm25(query: str, k: int = 5):
    tokenized_query = query.lower().split()
    scores = bm25.get_scores(tokenized_query)
    
    # Get top k indices and their scores
    topk_indices_scores = sorted(zip(range(len(scores)), scores), key=lambda x: x[1], reverse=True)[:k]
    
    results = []
    for i, score in topk_indices_scores:
        chunk_info = chunks_df.iloc[i]
        results.append({
            'chunk_id': chunk_info['chunk_id'],
            'doc_id': chunk_info['doc_id'],
            'score': score,
            'text': chunk_info['chunk_text'],
            'title': chunk_info['original_title'],
            'url': chunk_info['original_url']
        })
    return results

# Test BM25
test_query_bm25 = "What happened in South Ossetia in 2008?"
bm25_results = search_bm25(test_query_bm25, k=3)
print(f"\nBM25 Results for: '{test_query_bm25}'")
for res in bm25_results:
    print(f"  Score: {res['score']:.4f} | Title: {res['title'][:50]}... | Chunk: {res['text'][:100]}...")


BM25 Results for: 'What happened in South Ossetia in 2008?'
  Score: 30.0291 | Title: Russia and Georgia tensions are rising: Here's why... | Chunk: pro-Russian) self-proclaimed republics of South Ossetia and Abkhazia. As such the parliamentary addr...
  Score: 29.6864 | Title: Front lines of a frozen conflict... | Chunk: By the numbers: There are roughly 240,000 people living in Abkhazia and 50,000 in South Ossetia. Ano...
  Score: 29.4312 | Title: Hundreds of Georgians demand release of doctor det... | Chunk: The United States and rights group Amnesty International called for Gaprindashvili’s immediate relea...


In [11]:
# --- Dense Retriever Parameters ---
EMBEDDING_MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1' 

embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=DEVICE)
print(f"Embedding model '{EMBEDDING_MODEL_NAME}' loaded. Max sequence length: {embedding_model.max_seq_length}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model 'multi-qa-MiniLM-L6-cos-v1' loaded. Max sequence length: 512


In [12]:
chunk_texts_to_embed = chunks_df['chunk_text'].tolist()

print(f"Generating embeddings for {len(chunk_texts_to_embed)} chunks...")
start_time = time.time()

chunk_embeddings = embedding_model.encode(
    chunk_texts_to_embed,
    batch_size=128, 
    show_progress_bar=True,
    convert_to_tensor=True,
    device=DEVICE
)
end_time = time.time()
print(f"✅ Embeddings generated. Shape: {chunk_embeddings.shape}")
print(f"⏱️ Embedding Time: {end_time - start_time:.2f} seconds")

# Move embeddings to CPU for FAISS if they were on GPU
chunk_embeddings_cpu = chunk_embeddings.cpu().numpy()
if DEVICE == "cuda":
    del chunk_embeddings # Free up GPU memory
    torch.cuda.empty_cache()
gc.collect()

Generating embeddings for 278947 chunks...


Batches:   0%|          | 0/2180 [00:00<?, ?it/s]

✅ Embeddings generated. Shape: torch.Size([278947, 384])
⏱️ Embedding Time: 447.59 seconds


17

In [13]:
embedding_dim = chunk_embeddings_cpu.shape[1]
# Using IndexFlatL2 for simplicity, but IndexFlatIP (Inner Product) is often better for cosine similarity with normalized embeddings
# If using cosine similarity, normalize embeddings before adding to FAISS and during query
# For dot product (often used by SBERT models like multi-qa-...), normalization might not be strictly needed if model output is already scaled

# Normalize embeddings for cosine similarity
faiss.normalize_L2(chunk_embeddings_cpu)

index_faiss = faiss.IndexFlatIP(embedding_dim) # IP (Inner Product) for normalized vectors is equivalent to cosine similarity
index_faiss.add(chunk_embeddings_cpu)

print(f"FAISS index built. Total vectors in index: {index_faiss.ntotal}")

# Save the index (optional, but good for large datasets)
# faiss.write_index(index_faiss, "news_chunks.faiss_index")
# chunks_df.to_csv("news_chunks_metadata.csv", index=False) # Save corresponding metadata

FAISS index built. Total vectors in index: 278947


In [14]:
def search_faiss(query: str, k: int = 5):
    query_embedding = embedding_model.encode(query, convert_to_tensor=True, device=DEVICE)
    query_embedding_cpu = query_embedding.cpu().numpy().reshape(1, -1) # Reshape for FAISS
    faiss.normalize_L2(query_embedding_cpu) # Normalize query embedding

    distances, indices = index_faiss.search(query_embedding_cpu, k)
    
    results = []
    for i in range(len(indices[0])):
        idx = indices[0][i]
        score = distances[0][i]
        chunk_info = chunks_df.iloc[idx]
        results.append({
            'chunk_id': chunk_info['chunk_id'],
            'doc_id': chunk_info['doc_id'],
            'score': score, # This is dot product; higher is better
            'text': chunk_info['chunk_text'],
            'title': chunk_info['original_title'],
            'url': chunk_info['original_url']
        })
    return results

# Test FAISS
test_query_faiss = "What were the latest developments in the Brexit negotiations last year?"
faiss_results = search_faiss(test_query_faiss, k=3)
print(f"\nFAISS Results for: '{test_query_faiss}'")
for res in faiss_results:
    print(f"  Score: {res['score']:.4f} | Title: {res['title'][:50]}... | Chunk: {res['text'][:100]}...")

if DEVICE == "cuda" and 'query_embedding' in locals():
    del query_embedding
    torch.cuda.empty_cache()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


FAISS Results for: 'What were the latest developments in the Brexit negotiations last year?'
  Score: 0.7010 | Title: 10 things you should know about the Brexit breakth... | Chunk: 10 things you should know about the Brexit breakthrough. Brexit negotiators claimed Friday they had ...
  Score: 0.6907 | Title: Sterling falls as traders prepare for EU summit ne... | Chunk: near their lowest since May 2017, according to the latest positioning data. Brexit negotiations have...
  Score: 0.6836 | Title: Sterling volatility measures fall to 15-month lows... | Chunk: no significant Brexit-related developments were expected this week. (Reporting by Saikat Chatterjee ...


In [15]:
# --- Reranker Parameters ---
CROSS_ENCODER_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2' # Small and fast
# CROSS_ENCODER_MODEL_NAME = 'cross-encoder/ms-marco-TinyBERT-L-2-v2' # Even smaller
# CROSS_ENCODER_MODEL_NAME = 'BAAI/bge-reranker-base' # Newer, often very good

reranker_model = CrossEncoder(CROSS_ENCODER_MODEL_NAME, device=DEVICE, max_length=512) # max_length depends on model
print(f"Reranker model '{CROSS_ENCODER_MODEL_NAME}' loaded.")

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Reranker model 'cross-encoder/ms-marco-MiniLM-L-6-v2' loaded.


In [19]:
def hybrid_search_and_rerank(query: str, bm25_k: int = 20, faiss_k: int = 20, rerank_top_n: int = 5):
    # 1. Get results from BM25
    bm25_res = search_bm25(query, k=bm25_k)
    
    # 2. Get results from FAISS
    faiss_res = search_faiss(query, k=faiss_k)
    
    # 3. Combine and deduplicate, preserving original scores
    combined_results_dict = {} # Use dict to deduplicate by chunk_id
    
    for res_item in bm25_res:
        chunk_id = res_item['chunk_id']
        if chunk_id not in combined_results_dict:
            combined_results_dict[chunk_id] = res_item.copy()
            combined_results_dict[chunk_id]['bm25_score'] = res_item['score']
            if 'score' in combined_results_dict[chunk_id]: combined_results_dict[chunk_id].pop('score')
        else:
            combined_results_dict[chunk_id]['bm25_score'] = res_item['score']


    for res_item in faiss_res:
        chunk_id = res_item['chunk_id']
        if chunk_id not in combined_results_dict:
            combined_results_dict[chunk_id] = res_item.copy()
            combined_results_dict[chunk_id]['faiss_score'] = res_item['score']
            if 'score' in combined_results_dict[chunk_id]: combined_results_dict[chunk_id].pop('score')
        else:
            combined_results_dict[chunk_id]['faiss_score'] = res_item['score']
            
    candidate_chunks = list(combined_results_dict.values())
    
    if not candidate_chunks:
        print("No candidate chunks to rerank.") # Added print
        return []
        
    # 4. Prepare pairs for reranker
    reranker_pairs = []
    for chunk_info in candidate_chunks:
        reranker_pairs.append([query, chunk_info['text']])
    
    # DEBUG: Print a sample pair
    if reranker_pairs:
        print(f"Sample reranker_pair[0]: Query='{reranker_pairs[0][0][:50]}...', Text='{reranker_pairs[0][1][:50]}...'")
            
   # 5. Rerank
    # Remove .device from this print statement:
    print(f"Reranking {len(reranker_pairs)} candidate pairs...") # Corrected print
    rerank_scores = [] # Initialize
    if reranker_pairs:
        try:
            rerank_scores = reranker_model.predict(
                reranker_pairs, 
                show_progress_bar=True, # Turn on progress bar for reranker
                batch_size=32 # Try a smaller batch size for reranker initially
            )
            # ---- DEBUG PRINTS ----
            print(f"DEBUG: Type of rerank_scores: {type(rerank_scores)}")
            if hasattr(rerank_scores, 'shape'):
                print(f"DEBUG: Shape of rerank_scores: {rerank_scores.shape}")
            else:
                print(f"DEBUG: Length of rerank_scores (if list): {len(rerank_scores) if isinstance(rerank_scores, list) else 'Not a list/array with len'}")
            print(f"DEBUG: First 5 rerank_scores: {rerank_scores[:5] if isinstance(rerank_scores, (list, np.ndarray)) and len(rerank_scores) > 0 else 'N/A or Empty'}")
            # ---- END DEBUG PRINTS ----
        except Exception as e:
            print(f"ERROR during reranker_model.predict: {e}")
            rerank_scores = [] # Ensure it's an empty list on error
    else:
        print("No pairs to rerank.")

    # 6. Add rerank scores and sort
    # Ensure rerank_scores is a list/array of numbers
    if not isinstance(rerank_scores, (list, np.ndarray)) or not all(isinstance(x, (int, float, np.number)) for x in rerank_scores):
        print(f"Warning: rerank_scores is not a list/array of numbers. Got: {rerank_scores}")
        # Fill with default low scores if it's not in the expected format
        rerank_scores = [-float('inf')] * len(candidate_chunks)


    for i, chunk_info in enumerate(candidate_chunks):
        if i < len(rerank_scores):
            chunk_info['rerank_score'] = float(rerank_scores[i]) # Ensure it's a float
        else:
            # This case should ideally not happen if len(rerank_scores) == len(candidate_chunks)
            print(f"Warning: Mismatch in lengths. i={i}, len(rerank_scores)={len(rerank_scores)}")
            chunk_info['rerank_score'] = -float('inf')

    reranked_results = sorted(candidate_chunks, key=lambda x: x.get('rerank_score', -float('inf')), reverse=True)
    
    return reranked_results[:rerank_top_n]

# Test Hybrid Search + Rerank (in the same cell or a new one)
test_query_hybrid = "Tell me about the impact of COVID-19 on global supply chains."
hybrid_reranked_results = hybrid_search_and_rerank(test_query_hybrid, bm25_k=10, faiss_k=10, rerank_top_n=3)

print(f"\nHybrid + Reranked Results for: '{test_query_hybrid}'")
for res in hybrid_reranked_results:
    # Prepare score strings carefully
    rerank_score_str = f"{res['rerank_score']:.4f}" if isinstance(res.get('rerank_score'), (int, float)) else "N/A"
    bm25_score_val = res.get('bm25_score')
    bm25_score_str = f"{bm25_score_val:.2f}" if isinstance(bm25_score_val, (int, float)) else "N/A"
    faiss_score_val = res.get('faiss_score')
    faiss_score_str = f"{faiss_score_val:.2f}" if isinstance(faiss_score_val, (int, float)) else "N/A"

    print(f"  Rerank Score: {rerank_score_str} | BM25: {bm25_score_str} | FAISS: {faiss_score_str}")
    print(f"  Title: {res['title'][:50]}... | Chunk: {res['text'][:150]}...")
    print(f"  URL: {res.get('url', 'N/A')}")
    print("-" * 30)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Sample reranker_pair[0]: Query='Tell me about the impact of COVID-19 on global sup...', Text='the United States and China specifically. It's a c...'
Reranking 20 candidate pairs...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG: Type of rerank_scores: <class 'numpy.ndarray'>
DEBUG: Shape of rerank_scores: (20,)
DEBUG: First 5 rerank_scores: [-3.0371718  6.472589  -1.5158455 -1.6148225 -1.8780392]

Hybrid + Reranked Results for: 'Tell me about the impact of COVID-19 on global supply chains.'
  Rerank Score: 6.4726 | BM25: 33.24 | FAISS: N/A
  Title: Coronavirus: At least 150 companies have warned in... | Chunk: return to full production, and those delays will have a significant impact on freight volumes moved in Q1." It added that is also may further impact g...
  URL: https://www.cnbc.com/2020/03/11/coronavirus-at-least-150-companies-have-warned-investors.html
------------------------------
  Rerank Score: 6.1507 | BM25: N/A | FAISS: N/A
  Title: COVID-19 is attacking our defense supply chains an... | Chunk: COVID-19 is attacking our defense supply chains and our nation's security | TheHill. COVID-19 is leading to rolling shutdowns of commerce across the c...
  URL: https://thehill.com/opinion/national-

In [21]:
# --- LLM Parameters ---
# We are switching to a smaller, very capable model that doesn't require complex quantization.
LLM_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# We don't need the BitsAndBytesConfig anymore, which was causing the error.

try:
    print(f"Loading LLM: {LLM_MODEL_NAME}...")
    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
    
    # Load the model directly to the device. 'torch_dtype="auto"' helps with performance.
    llm_model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_NAME,
        torch_dtype="auto", # Automatically uses bfloat16 on compatible GPUs
        device_map="auto",  # Automatically maps to GPU if available
        trust_remote_code=True # Required for Phi-3 model architecture
    )
    llm_model.eval() # Set to evaluation mode
    print("LLM loaded successfully.")

except Exception as e:
    print(f"Error loading LLM: {e}")
    print("This might be a VRAM issue. If so, try restarting the kernel and running again.")
    llm_model = None
    llm_tokenizer = None

# Set pad token if not set, common for generative models
if llm_tokenizer and llm_tokenizer.pad_token is None:
    # Llama-3 and Phi-3 use the EOS token as the PAD token.
    llm_tokenizer.pad_token = llm_tokenizer.eos_token
    if hasattr(llm_model, 'config'):
         llm_model.config.pad_token_id = llm_model.config.eos_token_id

Loading LLM: microsoft/Phi-3-mini-4k-instruct...


tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

LLM loaded successfully.


In [39]:
def format_rag_prompt(query: str, context_chunks: list, llm_tokenizer_for_chat=None):
    context_str = "\n\n---\n\n".join([chunk['text'] for chunk in context_chunks])

    # Truncate context if too long for the LLM (crude truncation)
    # A more sophisticated approach would tokenize and count, or summarize
    # max_context_len_chars = 7000 # Rough estimate, depends on LLM context window & tokenization
    # if len(context_str) > max_context_len_chars:
    #     context_str = context_str[:max_context_len_chars] + "..."


    # Using a chat format is generally better for instruct/chat models
    system_message = "You are a helpful AI assistant. Answer the user's QUESTION based *only* on the provided CONTEXT. If the context does not contain the answer, say 'I cannot answer the question based on the provided context.' Do not use any prior knowledge. Be concise and directly answer the question."
    
    user_message_content = f"CONTEXT:\n{context_str}\n\nQUESTION: {query}"

    if llm_tokenizer_for_chat: # For Hugging Face Transformers
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message_content}
        ]
        try:
            prompt = llm_tokenizer_for_chat.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            return prompt, system_message, user_message_content # Return components for Ollama
        except Exception as e:
            print(f"Error applying chat template: {e}. Falling back to basic prompt.")
            # Fallback basic prompt if chat template fails (less ideal)
            prompt = f"System: {system_message}\nUser: {user_message_content}\nAssistant:"
            return prompt, system_message, user_message_content
    else: # For manual construction (e.g. if directly using Ollama's non-chat or if tokenizer has no template)
         # This part is more for Ollama now, where we send system and user prompts separately
        return None, system_message, user_message_content


def generate_llm_answer(query: str, context_chunks: list):
    if not context_chunks:
        return "No relevant context found to answer the question.", []

    # Option A: Using Hugging Face Transformers LLM
    if 'llm_model' in globals() and llm_model is not None and 'llm_tokenizer' in globals() and llm_tokenizer is not None:
        formatted_prompt, _, _ = format_rag_prompt(query, context_chunks, llm_tokenizer_for_chat=llm_tokenizer)
        
        inputs = llm_tokenizer(formatted_prompt, return_tensors="pt", padding=True, truncation=True, max_length=4000).to(DEVICE)
        
        generation_args = {
            "max_new_tokens": 250,
            "temperature": 0.1,
            "do_sample": True,
            "top_p": 0.9,
            "eos_token_id": llm_tokenizer.eos_token_id,
            "pad_token_id": llm_tokenizer.pad_token_id if llm_tokenizer.pad_token_id is not None else llm_tokenizer.eos_token_id
        }
        
        with torch.no_grad():
            output_ids = llm_model.generate(**inputs, **generation_args)
        
        answer = llm_tokenizer.decode(output_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        
        # --- THIS IS THE CORRECTED RETURN STATEMENT ---
        return answer.strip(), [chunk['text'] for chunk in context_chunks]

    # Option B: Using Ollama Chat
    elif 'OLLAMA_CHAT_MODEL_NAME' in globals():
        _, system_prompt, user_prompt_content = format_rag_prompt(query, context_chunks)
        answer = query_ollama_chat(system_prompt, user_prompt_content, model_name=OLLAMA_CHAT_MODEL_NAME)
        
        # --- THIS IS THE CORRECTED RETURN STATEMENT FOR OLLAMA ---
        return answer.strip(), [chunk['text'] for chunk in context_chunks]
    
    else:
        return "LLM not available or configured.", []

# Test RAG pipeline
rag_query = "What are the main arguments for and against Brexit?"
# Get context from hybrid search
retrieved_context_chunks = hybrid_search_and_rerank(rag_query, bm25_k=15, faiss_k=15, rerank_top_n=3)

print(f"\n--- RAG Pipeline Test ---")
print(f"Query: {rag_query}")
if retrieved_context_chunks:
    print(f"Top {len(retrieved_context_chunks)} contexts passed to LLM:")
    for i, chunk_info in enumerate(retrieved_context_chunks):
        print(f"  Context {i+1} (Rerank Score: {chunk_info['rerank_score']:.3f}): {chunk_info['text'][:200]}...")
    
    # --- THIS IS THE CORRECTED LINE ---
    llm_answer, used_context_texts = generate_llm_answer(rag_query, retrieved_context_chunks)
    
    print(f"\nLLM Answer:\n{llm_answer}")
else:
    print("No context retrieved for the LLM.")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Sample reranker_pair[0]: Query='What are the main arguments for and against Brexit...', Text='EU referendum: why Brexit is a good idea. For the ...'
Reranking 30 candidate pairs...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG: Type of rerank_scores: <class 'numpy.ndarray'>
DEBUG: Shape of rerank_scores: (30,)
DEBUG: First 5 rerank_scores: [ 4.54578    5.7194633 -2.2526562 -1.3843509 -3.2141771]

--- RAG Pipeline Test ---
Query: What are the main arguments for and against Brexit?
Top 3 contexts passed to LLM:
  Context 1 (Rerank Score: 7.975): and national security a large part of the argument for remaining in. The strongest argument for Brexit is that it is the only way to restore sovereignty to Parliament and escape the jurisdiction of th...
  Context 2 (Rerank Score: 6.857): the government in protest over May’s handling of negotiations with the European Union. Here’s what’s going on: In June 2016 the UK held a referendum on whether to stay in the EU or cut ties with the p...
  Context 3 (Rerank Score: 5.719): of 2012, according to the Office for National Statistics. Now, economists say the Brexit vote will likely put the brakes on what little forward momentum was left for British economic growth. Th

In [29]:
import pickle

# --- Create a directory to save everything ---
ARTIFACTS_DIR = Path("/kaggle/working/rag_artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True)

print(f"Saving artifacts to: {ARTIFACTS_DIR}")

# 1. Save the final sampled DataFrame
df.to_parquet(ARTIFACTS_DIR / "final_df.parquet")
print("✅ Saved final_df.parquet")

# 2. Save the chunks DataFrame
chunks_df.to_parquet(ARTIFACTS_DIR / "chunks_df.parquet")
print("✅ Saved chunks_df.parquet")

# 3. Save the BM25 index object using pickle
with open(ARTIFACTS_DIR / "bm25_index.pkl", "wb") as f:
    pickle.dump(bm25, f)
print("✅ Saved bm25_index.pkl")


if 'chunk_embeddings_cpu' in locals():
    np.save(ARTIFACTS_DIR / "chunk_embeddings.npy", chunk_embeddings_cpu)
    print("✅ Saved chunk_embeddings.npy")
else:
    print("⚠️ 'chunk_embeddings_cpu' not found in memory, skipping save. Re-run embedding cell if needed.")


# 5. Save the FAISS index
faiss.write_index(index_faiss, str(ARTIFACTS_DIR / "news_chunks.faiss_index"))
print("✅ Saved news_chunks.faiss_index")

print("\nAll artifacts saved successfully!")

Saving artifacts to: /kaggle/working/rag_artifacts
✅ Saved final_df.parquet
✅ Saved chunks_df.parquet
✅ Saved bm25_index.pkl
✅ Saved chunk_embeddings.npy
✅ Saved news_chunks.faiss_index

All artifacts saved successfully!


In [31]:
import pickle


ARTIFACTS_DIR = Path("/kaggle/working/rag_artifacts") 
# --- Check if artifacts exist and load them ---
if ARTIFACTS_DIR.exists() and (ARTIFACTS_DIR / "final_df.parquet").exists():
    print(f"✅ Found artifacts directory. Loading pre-computed files from: {ARTIFACTS_DIR}")
    
    # 1. Load DataFrames
    df = pd.read_parquet(ARTIFACTS_DIR / "final_df.parquet")
    chunks_df = pd.read_parquet(ARTIFACTS_DIR / "chunks_df.parquet")
    print(f"Loaded {len(df)} documents and {len(chunks_df)} chunks.")

    # 2. Load BM25 index
    with open(ARTIFACTS_DIR / "bm25_index.pkl", "rb") as f:
        bm25 = pickle.load(f)
    print("Loaded BM25 index.")

    # 3. Load FAISS index
    index_faiss = faiss.read_index(str(ARTIFACTS_DIR / "news_chunks.faiss_index"))
    print(f"Loaded FAISS index with {index_faiss.ntotal} vectors.")
    
    # chunk_embeddings_cpu = np.load(ARTIFACTS_DIR / "chunk_embeddings.npy")
    # print("Loaded chunk embeddings.")

else:
    print(f"❌ Artifacts not found at {ARTIFACTS_DIR}. Running full data processing pipeline...")
    


✅ Found artifacts directory. Loading pre-computed files from: /kaggle/working/rag_artifacts
Loaded 30000 documents and 278947 chunks.
Loaded BM25 index.
Loaded FAISS index with 278947 vectors.


In [32]:
import os

artifacts_dir = "/kaggle/working/rag_artifacts"


output_zip_file = "/kaggle/working/rag_artifacts.zip"

print(f"Zipping the directory: {artifacts_dir}")


os.system(f"zip -r {output_zip_file} {artifacts_dir}")

print(f"Successfully created zip file at: {output_zip_file}")
print("You can now find 'rag_artifacts.zip' in the Output section of the Data sidebar and download it.")

Zipping the directory: /kaggle/working/rag_artifacts
  adding: kaggle/working/rag_artifacts/ (stored 0%)
  adding: kaggle/working/rag_artifacts/news_chunks.faiss_index (deflated 7%)
  adding: kaggle/working/rag_artifacts/bm25_index.pkl (deflated 69%)
  adding: kaggle/working/rag_artifacts/chunk_embeddings.npy (deflated 7%)
  adding: kaggle/working/rag_artifacts/final_df.parquet (deflated 12%)
  adding: kaggle/working/rag_artifacts/chunks_df.parquet (deflated 15%)
Successfully created zip file at: /kaggle/working/rag_artifacts.zip
You can now find 'rag_artifacts.zip' in the Output section of the Data sidebar and download it.
