In [None]:
# Import necessary modules
import sys
sys.path.append('../src')

from data_collection import preprocess_corpus
from dense_retrieval import DenseRetriever
from sparse_retrieval import SparseRetriever
from rrf_fusion import reciprocal_rank_fusion
from generation import Generator
import json

In [None]:
# Load and preprocess data
# Corpus-size knobs: kept small so this test run stays quick.
N_FIXED_URLS = 10
N_RANDOM_URLS = 10

# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's default locale encoding.
with open('../data/fixed_urls.json', 'r', encoding='utf-8') as f:
    fixed_urls = json.load(f)

# Add placeholder "random" URLs.
# NOTE(review): these are synthesized "Random_<i>" page names, not randomly
# sampled articles -- confirm preprocess_corpus tolerates URLs that may not
# resolve to a real page.
random_urls = [
    f"https://en.wikipedia.org/wiki/Random_{i}" for i in range(N_RANDOM_URLS)
]
# Only the first N_FIXED_URLS curated URLs are used (assumes the JSON file
# holds a list -- slicing would fail on a dict).
all_urls = fixed_urls[:N_FIXED_URLS] + random_urls

corpus = preprocess_corpus(all_urls)
print(f"Corpus size: {len(corpus)}")

In [None]:
# Test dense retrieval
# Builds a dense index over `corpus` (created in the data-loading cell) and
# runs one sample query against it.
dense_retriever = DenseRetriever()
dense_retriever.build_index(corpus)
# `query` is reused by the sparse-retrieval and generation cells below, so
# this cell must be executed before them.
query = "What is artificial intelligence?"
# NOTE(review): the second argument is presumably the number of results to
# return (top-k) -- confirm against DenseRetriever.retrieve.
dense_results = dense_retriever.retrieve(query, 5)
print("Dense results:", dense_results)

In [None]:
# Test sparse retrieval
# Relies on `corpus` (data-loading cell) and `query` (dense-retrieval cell);
# run those cells first or this one raises NameError.
sparse_retriever = SparseRetriever()
sparse_retriever.build_index(corpus)
# Same literal 5 as the dense retriever call.
# NOTE(review): presumably the top-k result count -- confirm against
# SparseRetriever.retrieve.
sparse_results = sparse_retriever.retrieve(query, 5)
print("Sparse results:", sparse_results)

In [None]:
# Test RRF fusion
# Combines the result lists produced by the dense and sparse retrieval cells.
# NOTE(review): the generation cell unpacks each fused entry as a pair
# (`for chunk, _ in fused`), so the return value is assumed to be an iterable
# of 2-item entries -- confirm against reciprocal_rank_fusion.
fused = reciprocal_rank_fusion(dense_results, sparse_results)
print("Fused results:", fused)

In [None]:
# Test generation
# Depends on `fused` (RRF cell) and `query` (dense-retrieval cell).
generator = Generator()
# Keep only the first element of each fused entry as context and discard the
# second. Unpacking requires every entry to have exactly two items.
# NOTE(review): the discarded element is presumably the fusion score -- confirm.
context = [chunk for chunk, _ in fused]
answer = generator.generate(query, context)
print("Generated answer:", answer)