In [None]:
# Authenticate with the Hugging Face Hub so gated model weights can be downloaded.
#
# SECURITY: never commit an access token to a notebook. The token is read from
# the HF_TOKEN environment variable, or requested interactively when it is not
# set. Any token that was previously hard-coded in this cell must be treated as
# leaked and revoked in the Hugging Face account settings.
import os
from getpass import getpass

from huggingface_hub import login

hugging_face_1bLLamaInstruct = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

login(hugging_face_1bLLamaInstruct)


In [None]:
# Install OpenJDK 21 and register it as the system default `java` / `javac`.
# NOTE(review): presumably needed by Pyserini's Java (Lucene) backend - confirm
# the JDK version required by the pinned Pyserini release.
!apt-get update
!apt-get install -y openjdk-21-jdk
# Register the JDK 21 binaries as alternatives (priority 1) ...
!update-alternatives --install /usr/bin/java java /usr/lib/jvm/java-21-openjdk-amd64/bin/java 1
!update-alternatives --install /usr/bin/javac javac /usr/lib/jvm/java-21-openjdk-amd64/bin/javac 1
# ... and select them explicitly so they win over any other installed JDK.
!update-alternatives --set java /usr/lib/jvm/java-21-openjdk-amd64/bin/java
!update-alternatives --set javac /usr/lib/jvm/java-21-openjdk-amd64/bin/javac


In [None]:
# Install the Python dependencies: the PyTorch stack, the CPU build of FAISS
# (installed without using pip's cache), and Pyserini pinned to 0.36.0.
# NOTE(review): torch/torchvision/torchaudio and faiss-cpu are not version-pinned,
# so a fresh environment may resolve different versions over time.
!pip install torch torchvision torchaudio
!pip install faiss-cpu --no-cache
!pip install pyserini==0.36.0

In [None]:
# Create the searcher over the prebuilt 'wikipedia-kilt-doc' index.
# `searcher` is the object every retrieval call in this notebook goes through.
# NOTE(review): presumably the prebuilt index is downloaded on first use - confirm.
from pyserini.search import SimpleSearcher
searcher = SimpleSearcher.from_prebuilt_index('wikipedia-kilt-doc')

# Open an index reader on the same prebuilt index; it is only used to print
# index statistics in the next cell.
from pyserini.index.lucene import IndexReader
index_reader = IndexReader.from_prebuilt_index('wikipedia-kilt-doc')


In [None]:
# Print the statistics reported by the index reader for 'wikipedia-kilt-doc'.
print(index_reader.stats())


In [None]:
# Read the CSV files
#
# train.csv: the "answers" column holds a JSON-encoded value per row and is
# decoded with json.loads while reading.
# test.csv: read as-is.

import pandas as pd
import json
import os
from pathlib import Path
df_train = pd.read_csv("./train.csv", converters={"answers": json.loads})
df_test = pd.read_csv("./test.csv")

# Report what was actually loaded (the message previously ended in a dangling colon).
print(f"\n‚úÖ Successfully loaded: {len(df_train)} training rows, {len(df_test)} test rows")


In [None]:
# A cell only renders its LAST bare expression, so a standalone
# `df_train.head()` followed by `df_test.head()` silently drops the first
# preview. Display both explicitly.
display(df_train.head(), df_test.head())


In [None]:
# ============================================================================
# IMPROVED Configuration - Optimized to reduce "I don't know" answers
# ============================================================================
# Every tunable used by the retrieval, generation and checkpointing cells
# below is defined here so a fresh kernel can run the notebook top to bottom.

# Data paths (adjust if your files are in a different location)
TRAIN_CSV = "./train.csv"
TEST_CSV = "./test.csv"
PREDICTIONS_CSV = "./predictions.csv"
CHECKPOINT_FILE = "./checkpoint.json"

# Retrieval parameters - OPTIMIZED
K = 30  # Number of passages retrieved per question (more passages = better recall)
RETRIEVAL_METHOD = "qld"  # "qld" (primary), "bm25" (optional), or "rrf" (fusion - recommended)
QLD_MU = 1000  # Standard value - tune between 500-2000
BM25_K1 = 1.2  # BM25 k1 parameter (standard: 1.2)
BM25_B = 0.75  # BM25 b parameter (standard: 0.75)
CONTEXT_LENGTH = 800  # Max characters kept per passage (longer passages are cut and "..." appended); <= 0 disables truncation
CONTEXT_TOP_N = 0  # Max number of passages handed to the LLM; 0 keeps every retrieved passage

# Advanced retrieval options (for RRF)
USE_RRF = False  # Set to True to use RRF even if RETRIEVAL_METHOD != "rrf"
RRF_K = 60  # Number of docs to retrieve from each method for RRF
RRF_FINAL_K = 30  # Final number of docs after RRF fusion

# LLM parameters
MAX_NEW_TOKENS = 256  # Upper bound on the number of tokens generated per answer
TEMPERATURE = 0.6  # Sampling temperature (only relevant while DO_SAMPLE is True)
TOP_P = 0.9  # Nucleus-sampling probability mass
DO_SAMPLE = True  # Sampling makes runs non-deterministic unless a seed is set elsewhere

# Processing
SAVE_CHECKPOINT_EVERY = 50  # Save checkpoint every N questions
RESUME_FROM_CHECKPOINT = True  # Resume if checkpoint exists

# Debug options
DEBUG_PRINT_CONTEXTS = True  # When True, print retrieved passages before sending to LLM

# The truncation note is derived from the value so the summary cannot contradict it.
_context_length_note = "max characters kept per passage" if CONTEXT_LENGTH > 0 else "no truncation"

print("‚úÖ OPTIMIZED Configuration loaded")
print(f"Key changes:")
print(f"  - K: 10 ‚Üí {K} (more passages for better recall)")
print(f"  - RETRIEVAL_METHOD: {RETRIEVAL_METHOD} (RRF combines QLD + BM25)")
print(f"  - QLD_MU: {QLD_MU} (standard value)")
print(f"  - BM25: k1={BM25_K1}, b={BM25_B} (standard values)")
print(f"  - CONTEXT_LENGTH: {CONTEXT_LENGTH} ({_context_length_note})")
print(f"  - TEMPERATURE: {TEMPERATURE} (more deterministic)")
print(f"  - MAX_NEW_TOKENS: {MAX_NEW_TOKENS} (shorter answers)")
if RETRIEVAL_METHOD == "rrf":
    print(f"  - RRF: Retrieving {RRF_K} docs from each method, fusing to {RRF_FINAL_K}")
print(f"\nRetrieval method: {RETRIEVAL_METHOD}, k={K}, QLD_mu={QLD_MU}")


In [None]:
# Import functions from rag_system.py
# Make sure rag_system.py is uploaded to your Colab environment
import sys
sys.path.append('/content/wet')
from rag_system import (
    get_context_qld,
    get_context_bm25,
    reciprocal_rank_fusion,
    create_message,
    extract_answer,
    print_contexts,
    normalize_answer,
    f1_score,
    metric_max_over_ground_truths,
    score,
    load_llm_pipeline
)
from pathlib import Path
import re
import string
from collections import Counter
from tqdm import tqdm

# Create a simple config class for compatibility with rag_system.py functions
class SimpleConfig:
    """Snapshot of the notebook's configuration variables as attributes.

    Exists for compatibility with rag_system.py functions that expect a
    config object. Values are copied from the notebook's module-level
    variables at construction time; later changes to those variables are not
    reflected in an existing instance.
    """
    def __init__(self):
        self.QLD_MU = QLD_MU
        self.BM25_K1 = BM25_K1
        self.BM25_B = BM25_B
        self.CONTEXT_LENGTH = CONTEXT_LENGTH
        # CONTEXT_TOP_N may be absent from the configuration cell; fall back to
        # 0 ("no limit") instead of raising NameError on construction.
        self.CONTEXT_TOP_N = globals().get("CONTEXT_TOP_N", 0)
        self.CHECKPOINT_FILE = Path(CHECKPOINT_FILE)
        self.RESUME_FROM_CHECKPOINT = RESUME_FROM_CHECKPOINT

# Wrapper function for get_context that uses notebook config
def get_context_wrapper(searcher, query, k, retrieval_method, context_top_n=None):
    """Retrieve passages for `query` and return their cleaned text.

    Args:
        searcher: Searcher object; must provide `set_qld`, `set_bm25` and `doc`.
        query: Question text to search for.
        k: Number of hits to request for the "qld" and "bm25" methods. The
            fusion path uses RRF_K / RRF_FINAL_K instead.
        retrieval_method: "qld", "bm25" or "rrf". Fusion is also used whenever
            the notebook-level USE_RRF flag is true.
        context_top_n: Optional cap on the number of passages returned. When
            None, the notebook-level CONTEXT_TOP_N is used if it is defined,
            otherwise 0. Values <= 0 mean "no cap".

    Returns:
        List of passage strings in hit order. Newlines are replaced by spaces
        and, when CONTEXT_LENGTH > 0, passages longer than CONTEXT_LENGTH
        characters are cut and suffixed with "...".

    Raises:
        ValueError: If `retrieval_method` is not one of the supported names.
    """
    if context_top_n is None:
        # CONTEXT_TOP_N may be missing from the configuration cell. A missing
        # value used to raise NameError on every call, which the training loop
        # swallowed and turned into "I don't know" for every question.
        context_top_n = globals().get("CONTEXT_TOP_N", 0)

    # Reciprocal Rank Fusion: combine QLD and BM25
    if retrieval_method == "rrf" or USE_RRF:
        # Retrieve from both methods (need to switch searcher settings)
        searcher.set_qld(mu=QLD_MU)
        qld_hits = get_context_qld(searcher, query, RRF_K, mu=QLD_MU)

        searcher.set_bm25(k1=BM25_K1, b=BM25_B)
        bm25_hits = get_context_bm25(searcher, query, RRF_K, k1=BM25_K1, b=BM25_B)

        # Fuse results using RRF
        # NOTE(review): RRF_K is used both as the per-method retrieval depth and
        # as the `k` argument of reciprocal_rank_fusion - confirm against
        # rag_system.py that this is intended.
        hits = reciprocal_rank_fusion([qld_hits, bm25_hits], k=RRF_K, final_k=RRF_FINAL_K)

    elif retrieval_method == "qld":
        searcher.set_qld(mu=QLD_MU)
        hits = get_context_qld(searcher, query, k, mu=QLD_MU)
    elif retrieval_method == "bm25":
        searcher.set_bm25(k1=BM25_K1, b=BM25_B)
        hits = get_context_bm25(searcher, query, k, k1=BM25_K1, b=BM25_B)
    else:
        raise ValueError(f"Unknown retrieval method: {retrieval_method}. Use 'qld', 'bm25', or 'rrf'")

    # Extract passage text. Each stored document is expected to be a JSON
    # object with a 'contents' field; documents that cannot be read are
    # reported and skipped rather than aborting the whole question.
    contexts = []
    for hit in hits:
        try:
            doc = searcher.doc(hit.docid)
            raw_json = doc.raw()
            data = json.loads(raw_json)
            contents = data['contents']

            # Clean and truncate if needed
            content = contents.replace('\n', ' ')
            if CONTEXT_LENGTH > 0 and len(content) > CONTEXT_LENGTH:
                content = content[:CONTEXT_LENGTH] + "..."

            contexts.append(content)
        except Exception as e:
            print(f"Warning: Could not retrieve document {hit.docid}: {e}")
            continue

    # Limit to top N contexts (if configured)
    if context_top_n > 0 and len(contexts) > context_top_n:
        contexts = contexts[:context_top_n]

    # Print contexts for debugging if enabled
    # Use tqdm.write() to ensure output appears above progress bar
    if DEBUG_PRINT_CONTEXTS:
        print_contexts(query, contexts, use_tqdm=True)

    return contexts

# Wrapper functions for checkpointing that use notebook config
def save_checkpoint_wrapper(predictions, processed_ids):
    """Persist predictions and processed question ids to CHECKPOINT_FILE as JSON.

    Args:
        predictions: Mapping of question id to predicted answer string.
        processed_ids: Iterable of question ids already handled (a list or a
            set; it is converted to a list because JSON cannot encode sets).

    The checkpoint is written to a temporary sibling file and then moved into
    place with os.replace, so an interruption mid-write cannot leave a
    truncated checkpoint behind.
    """
    import os  # local import keeps this function self-contained

    checkpoint = {
        "predictions": predictions,
        "processed_ids": list(processed_ids)
    }
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(checkpoint, f, indent=2)
    os.replace(tmp_path, CHECKPOINT_FILE)
    print(f"Checkpoint saved: {len(predictions)} predictions")

def load_checkpoint_wrapper():
    """Load predictions and processed ids from CHECKPOINT_FILE.

    Returns:
        A `(predictions, processed_ids)` tuple: a dict mapping question id to
        answer and a set of processed question ids. Returns `({}, set())` when
        RESUME_FROM_CHECKPOINT is false, the file does not exist, or the file
        cannot be read.

    JSON object keys are always strings, so integer question ids come back as
    "12" instead of 12. Keys are mapped back to their original type; otherwise
    a resumed run mixes str and int ids in `predictions`, which breaks the
    later `sort_values('id')`.
    """
    checkpoint_path = Path(CHECKPOINT_FILE)
    if checkpoint_path.exists() and RESUME_FROM_CHECKPOINT:
        try:
            with open(checkpoint_path, 'r') as f:
                checkpoint = json.load(f)

            processed_ids = set(checkpoint["processed_ids"])
            # processed_ids keep their JSON value type, so use them to recover
            # the exact original id for each stringified prediction key.
            id_by_text = {str(pid): pid for pid in processed_ids}

            def _restore_id(raw_key):
                if raw_key in id_by_text:
                    return id_by_text[raw_key]
                # Fall back to the int conversion the training loader uses.
                try:
                    return int(raw_key)
                except (TypeError, ValueError):
                    return raw_key

            predictions = {_restore_id(k): v for k, v in checkpoint["predictions"].items()}
            print(f"Checkpoint loaded: {len(predictions)} predictions")
            return predictions, processed_ids
        except Exception as e:
            print(f"Error loading checkpoint: {e}")
    return {}, set()

print("‚úÖ RAG system functions imported")


## Setup Retrieval and LLM

Configure the searcher, load the LLM pipeline, and collect the terminator token IDs used to stop generation.


In [None]:
# Set retrieval method
# Note: For RRF, we don't set a single method - we'll use both QLD and BM25
if RETRIEVAL_METHOD == "qld":
    searcher.set_qld(mu=QLD_MU)
elif RETRIEVAL_METHOD == "bm25":
    searcher.set_bm25(k1=BM25_K1, b=BM25_B)
elif RETRIEVAL_METHOD == "rrf":
    # RRF uses both methods, so we'll set both (will switch during retrieval)
    # Default to QLD for now, will switch to BM25 when needed
    searcher.set_qld(mu=QLD_MU)
    print("‚úÖ RRF mode: Will use both QLD and BM25 during retrieval")
else:
    # Fail fast on a misspelled method, before the expensive model load below,
    # instead of only failing later on the first retrieval call.
    raise ValueError(f"Unknown retrieval method: {RETRIEVAL_METHOD}. Use 'qld', 'bm25', or 'rrf'")

# Load LLM pipeline
print("Loading LLM pipeline...")
pipeline = load_llm_pipeline()
print("‚úÖ LLM pipeline loaded")

# Get terminators for LLM generation: generation stops at either the
# tokenizer's EOS token or the "<|eot_id|>" end-of-turn token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

print(f"‚úÖ Searcher configured with method: {RETRIEVAL_METHOD}")
if RETRIEVAL_METHOD == "rrf":
    print(f"   - RRF will retrieve {RRF_K} docs from each method, fuse to {RRF_FINAL_K}")
print("‚úÖ LLM terminators ready")


## Evaluate on Training Set (Run First!)

Evaluate the system on the training set to compute F1 score and compare with baseline.
**Only proceed to test data after you're satisfied with training results.**


In [None]:
# ============================================================================
# CLEANUP: remove stale checkpoint / prediction files before training evaluation
# ============================================================================
# NOTE(review): this unconditionally removes the training checkpoint, the test
# checkpoint and the predictions CSV, so later cells that resume from those
# files find nothing after a top-to-bottom run. Confirm that is intended while
# RESUME_FROM_CHECKPOINT is True.
from pathlib import Path
import os

_checkpoint_path = Path(CHECKPOINT_FILE)

# Files to delete
files_to_delete = [
    _checkpoint_path,                                  # test-run checkpoint
    _checkpoint_path.parent / "train_checkpoint.json",  # training checkpoint
    Path(PREDICTIONS_CSV),                             # predictions CSV
]

_rule = "=" * 80

print("üßπ Cleaning up old checkpoint and prediction files...")
print(_rule)

deleted_count = 0
for file_path in files_to_delete:
    # Nothing to remove: report it and move on.
    if not file_path.exists():
        print(f"  ‚ÑπÔ∏è  Not found (already clean): {file_path}")
        continue

    # A failed delete is reported but does not stop the remaining deletions.
    try:
        file_path.unlink()
        print(f"  ‚úÖ Deleted: {file_path}")
        deleted_count += 1
    except Exception as e:
        print(f"  ‚ö†Ô∏è  Error deleting {file_path}: {e}")

print(_rule)
if deleted_count <= 0:
    print("‚úÖ Cleanup complete. No files to delete. Ready for fresh training evaluation.")
else:
    print(f"‚úÖ Cleanup complete. Deleted {deleted_count} file(s). Ready for fresh training evaluation.")


In [None]:
# Process training questions for evaluation
#
# Steps: take a sample of the training set, resume from train_checkpoint.json
# when available, answer every remaining question (retrieve -> prompt ->
# generate -> extract), checkpoint periodically, then score the predictions
# against the gold answers and compare with the baseline F1.
from typing import Dict
from pathlib import Path

# Limit to the first TRAIN_SAMPLE_SIZE questions for faster evaluation
TRAIN_SAMPLE_SIZE = 1000
# Reference F1 the system is compared against
BASELINE_F1 = 11.62
df_train_sample = df_train.head(TRAIN_SAMPLE_SIZE).copy()

# Dictionary to store predictions: {question_id: predicted_answer_string}
predictions_train: Dict[int, str] = {}

# Load checkpoint if exists (for resuming interrupted evaluation)
train_checkpoint_file = Path(CHECKPOINT_FILE).parent / "train_checkpoint.json"
processed_train_ids = set()
# Ids whose processing raised. They receive the fallback answer so scoring
# still covers them, but they are NOT marked as processed, so a resumed run
# retries them instead of freezing the failure into the checkpoint.
failed_train_ids = []


def _save_train_checkpoint():
    """Write the current training predictions and processed ids to train_checkpoint_file."""
    train_checkpoint = {
        "predictions": predictions_train,
        "processed_ids": list(processed_train_ids)
    }
    with open(train_checkpoint_file, 'w') as f:
        json.dump(train_checkpoint, f, indent=2)


if RESUME_FROM_CHECKPOINT and train_checkpoint_file.exists():
    try:
        with open(train_checkpoint_file, 'r') as f:
            checkpoint = json.load(f)
            # JSON keys are strings; convert them back to integer ids
            predictions_train = {int(k): v for k, v in checkpoint.get("predictions", {}).items()}
            processed_train_ids = set(checkpoint.get("processed_ids", []))
            print(f"‚úÖ Resumed from checkpoint: {len(predictions_train)}/{len(df_train_sample)} predictions loaded")
    except Exception as e:
        print(f"‚ö†Ô∏è  Error loading checkpoint: {e}")
        print("   Starting fresh evaluation...")
        predictions_train = {}
        processed_train_ids = set()

print("=" * 80)
# Report the actual sample size rather than a hard-coded "1000"
print(f"Evaluating on Training Set (First {len(df_train_sample)} Questions)")
print("=" * 80)
print(f"Retrieval method: {RETRIEVAL_METHOD}, k={K}")
print(f"Total training questions in dataset: {len(df_train)}")
print(f"Processing sample size: {len(df_train_sample)} questions")
print(f"Already processed: {len(processed_train_ids)}")
print(f"Remaining: {len(df_train_sample) - len(processed_train_ids)}")
print("=" * 80)

for index, row in tqdm(df_train_sample.iterrows(), total=len(df_train_sample), desc="Processing training questions"):
    question = row['question']
    qid = row['id']

    # Skip if already processed (when resuming)
    if qid in processed_train_ids:
        continue

    # Only retrieval and generation are guarded. Checkpoint I/O is handled
    # separately below so a failed write can no longer overwrite a valid
    # answer with the fallback.
    try:
        # Retrieve context using wrapper function
        contexts = get_context_wrapper(searcher, question, k=K, retrieval_method=RETRIEVAL_METHOD)

        if not contexts:
            answer = "I don't know"
        else:
            # Create prompt and generate answer
            messages = create_message(question, contexts)
            outputs = pipeline(
                messages,
                max_new_tokens=MAX_NEW_TOKENS,
                eos_token_id=terminators,
                do_sample=DO_SAMPLE,
                temperature=TEMPERATURE,
                top_p=TOP_P,
            )
            # The last message of the returned conversation is the model's reply
            generated_text = outputs[0]["generated_text"][-1].get('content', '')
            answer = extract_answer(generated_text)
    except Exception as e:
        print(f"\n‚ö†Ô∏è  Error processing question {qid}: {e}")
        predictions_train[qid] = "I don't know"  # Fallback answer
        failed_train_ids.append(qid)
        continue

    predictions_train[qid] = answer
    processed_train_ids.add(qid)

    # Save checkpoint periodically (every N questions); a failed write is
    # reported but does not interrupt the evaluation.
    if len(predictions_train) % SAVE_CHECKPOINT_EVERY == 0:
        try:
            _save_train_checkpoint()
            print(f"\nüíæ Checkpoint saved: {len(predictions_train)}/{len(df_train_sample)} predictions")
        except Exception as e:
            print(f"\nWarning: could not write checkpoint {train_checkpoint_file}: {e}")

# Final checkpoint save
if predictions_train:
    _save_train_checkpoint()
    print(f"\nüíæ Final checkpoint saved: {len(predictions_train)} predictions")

# Make swallowed failures visible: many failures usually indicate a
# configuration or code error rather than hard questions.
if failed_train_ids:
    print(f"\nWarning: {len(failed_train_ids)} question(s) raised an error and received the fallback answer; "
          f"they will be retried when this cell is resumed.")

print("\n" + "=" * 80)
print("Formatting Predictions for Evaluation")
print("=" * 80)

# Format predictions: each answer becomes a one-element JSON array
df_pred_train = pd.DataFrame(list(predictions_train.items()), columns=['id', 'prediction'])
df_pred_train = df_pred_train.sort_values('id')
df_pred_train["prediction"] = df_pred_train["prediction"].apply(
    lambda x: json.dumps([x], ensure_ascii=False)
)

# Format ground truth (use same sample as predictions)
df_gold = df_train_sample.copy()
df_gold["answers"] = df_gold["answers"].apply(lambda x: json.dumps(x, ensure_ascii=False))

print(f"‚úÖ Formatted {len(df_pred_train)} predictions")
print(f"‚úÖ Formatted {len(df_gold)} ground truth answers")

# Evaluate
print("\n" + "=" * 80)
print("Evaluating Performance - Computing F1 Score")
print("=" * 80)
f1 = score(df_gold, df_pred_train)
print(f"\n{'='*80}")
print(f"üìä EVALUATION RESULTS")
print(f"{'='*80}")
print(f"‚úÖ F1 Score on training set: {f1:.2f}")
print(f"üìä Baseline F1: {BASELINE_F1}")
print(f"üìà Improvement: {f1 - BASELINE_F1:.2f} points")
print(f"{'='*80}")

if f1 < BASELINE_F1:
    print("\n‚ö†Ô∏è  WARNING: Your F1 score is below baseline!")
    print("   Consider adjusting parameters before running on test data.")
    print("   Suggested parameters to tune:")
    print("   - K (number of passages): Try 5, 10, 15, 20")
    print("   - QLD_MU: Try 500, 1000, 2000")
    print("   - TEMPERATURE: Try 0.3, 0.6, 0.9")
    print("   - CONTEXT_LENGTH: Try 400, 800, 1200")
else:
    print("\n‚úÖ Your F1 score is above baseline!")
    print("   You can proceed to test data processing.")
    print(f"   Expected test performance: Similar to training F1 ({f1:.2f})")


## Process Test Questions

Process all test questions and generate predictions. The system will:
1. Load the checkpoint if one exists (to resume a previous run)
2. Process all questions with progress bar
3. Save checkpoints periodically
4. Generate final predictions CSV

**Note**: Only run this after you're satisfied with training evaluation results!


In [None]:
# Load checkpoint if exists
predictions, processed_ids = load_checkpoint_wrapper()

# Ids whose processing raised. They receive the fallback answer so the final
# predictions cover every question, but they are not marked as processed, so a
# resumed run retries them.
failed_ids = []

# Process questions
print("=" * 80)
print("Processing Test Questions")
print("=" * 80)
print(f"Retrieval method: {RETRIEVAL_METHOD}, k={K}")
print(f"Total questions: {len(df_test)}")
print(f"Already processed: {len(processed_ids)}")
print(f"Remaining: {len(df_test) - len(processed_ids)}")
print("=" * 80)

for index, row in tqdm(df_test.iterrows(), total=len(df_test), desc="Processing"):
    qid = row['id']
    question = row['question']

    # Skip if already processed
    if qid in processed_ids:
        continue

    # One failing question must not abort the whole run: report it, store the
    # same fallback answer the training evaluation uses, and keep going.
    try:
        # Retrieve context using wrapper function
        contexts = get_context_wrapper(searcher, question, k=K, retrieval_method=RETRIEVAL_METHOD)

        if not contexts:
            answer = "I don't know"
        else:
            # Create prompt and generate answer
            messages = create_message(question, contexts)
            outputs = pipeline(
                messages,
                max_new_tokens=MAX_NEW_TOKENS,
                eos_token_id=terminators,
                do_sample=DO_SAMPLE,
                temperature=TEMPERATURE,
                top_p=TOP_P,
            )
            # The last message of the returned conversation is the model's reply
            generated_text = outputs[0]["generated_text"][-1].get('content', '')
            answer = extract_answer(generated_text)
    except Exception as e:
        print(f"\nWarning: error processing question {qid}: {e}")
        predictions[qid] = "I don't know"  # Fallback answer
        failed_ids.append(qid)
        continue

    predictions[qid] = answer
    processed_ids.add(qid)

    # Save checkpoint periodically
    if len(predictions) % SAVE_CHECKPOINT_EVERY == 0:
        save_checkpoint_wrapper(predictions, list(processed_ids))

# Final checkpoint save
save_checkpoint_wrapper(predictions, list(processed_ids))

# Make swallowed failures visible before the predictions are exported
if failed_ids:
    print(f"\nWarning: {len(failed_ids)} question(s) raised an error and received the fallback answer; "
          f"re-run this cell to retry them.")

print("\n‚úÖ Processing complete!")


## Format and Save Test Predictions


In [None]:
# Build the submission table: one row per question id, ordered by id
_prediction_rows = list(predictions.items())
df_prediction = pd.DataFrame(_prediction_rows, columns=['id', 'prediction']).sort_values('id')


def _as_json_array(answer):
    """Serialise a single answer as a one-element JSON array (required format)."""
    return json.dumps([answer], ensure_ascii=False)


df_prediction["prediction"] = df_prediction["prediction"].apply(_as_json_array)

# Write the submission file (without the DataFrame index) and report what was saved
df_prediction.to_csv(PREDICTIONS_CSV, index=False)
print(f"‚úÖ Predictions saved to {PREDICTIONS_CSV}")
print(f"Total predictions: {len(df_prediction)}")
