In [1]:
import os
import requests
import json
import numpy as np
import faiss
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from bs4 import BeautifulSoup
import xmltodict
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
# Device Configuration (Use MPS for Apple M1/M2, fallback to CPU)
device = torch.device("mps" if torch.backends.mps.is_built() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [3]:
def load_models():
    """Load models for embedding generation and reranking."""
    tokenizer = AutoTokenizer.from_pretrained("model_cache/BAAI_bge-m3_tokenizer")
    embedding_model = AutoModel.from_pretrained("model_cache/BAAI_bge-m3").to(device)
    rerank_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-large")
    rerank_model = AutoModelForSequenceClassification.from_pretrained("model_cache/BAAI_bge-reranker-large").to(device)

    rerank_tokenizer.save_pretrained("model_cache/BAAI_bge-reranker-large")

    return tokenizer, embedding_model, rerank_tokenizer, rerank_model
# tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-large")


tokenizer, embedding_model, rerank_tokenizer, rerank_model = load_models()



Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at model_cache/BAAI_bge-reranker-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def fetch_sitemap(sitemap_url: str) -> list:
    """Fetch URLs from a sitemap."""
    try:
        response = requests.get(sitemap_url, timeout=10)
        response.raise_for_status()
        sitemap_dict = xmltodict.parse(response.content)
        return [url['loc'] for url in sitemap_dict['urlset']['url']]
    except Exception as e:
        print(f"Error fetching sitemap: {e}")
        return []

In [8]:
def fetch_content(url: str) -> str:
    """Fetch and clean webpage content."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Clean HTML by removing unnecessary elements
        for tag in ["script", "style", "meta", "noscript", "header", "footer", "nav"]:
            for element in soup.find_all(tag):
                element.decompose()

        text = soup.get_text(separator=" ", strip=True)
        return " ".join(text.split())
    except Exception as e:
        print(f"Error fetching content from {url}: {e}")
        return ""

In [9]:
# Step 3: Chunk Text for Embedding
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 128) -> list:
    """Split text into overlapping chunks."""
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        chunks.append(tokenizer.convert_tokens_to_string(chunk))
    return chunks

In [10]:
# Step 4: Generate Embeddings and Sparse Weights (Using Your Provided Code)
def generate_embeddings(texts: list, batch_size: int = 32):
    """Generate embeddings and sparse weights using BGE-M3 and MPS."""
    embeddings = []
    
    # Generate embeddings
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=8192, return_tensors="pt").to(device)
        
        with torch.no_grad():
            outputs = embedding_model(**inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0].cpu().numpy()  # Get [CLS] token embeddings
            embeddings.extend(batch_embeddings)
    
    embeddings_array = np.array(embeddings)
    
    # Generate sparse weights using token frequencies
    sparse_weights = np.zeros((len(texts), tokenizer.vocab_size))
    
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating sparse weights"):
        batch = texts[i:i + batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, max_length=8192, return_tensors="pt").to(device)
        
        # Convert to token frequencies (simple TF scoring)
        for j, text_tokens in enumerate(tokens['input_ids'].cpu().numpy()):
            unique_tokens, counts = np.unique(text_tokens, return_counts=True)
            sparse_weights[i + j][unique_tokens] = counts
    
    # Normalize sparse weights
    sparse_weights = sparse_weights / (sparse_weights.sum(axis=1, keepdims=True) + 1e-8)
    
    return embeddings_array, sparse_weights

In [11]:
# Step 5: Build and Save FAISS Index
def build_and_save_index(embeddings: np.ndarray, index_path: str):
    """Build a FAISS index and save it to a file."""
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)
    faiss.write_index(index, index_path)
    print(f"FAISS index saved to {index_path}")


In [30]:
def normalize_scores(scores: np.ndarray):
    """Normalize the scores to the range [0, 1]."""
    min_score = np.min(scores)
    max_score = np.max(scores)
    
    # Avoid division by zero if max_score equals min_score
    if max_score != min_score:
        return (scores - min_score) / (max_score - min_score)
    else:
        return scores  # Return as is if scores have no variance

In [33]:
def hybrid_search(query: str, index_path: str, sparse_weights: np.ndarray, chunks: list, k: int = 10):
    """Perform hybrid search with normalized dense and sparse scores."""
    # Load FAISS index
    index = faiss.read_index(index_path)

    # Query embedding
    inputs = tokenizer(query, return_tensors="pt", max_length=8192, truncation=True).to(device)
    with torch.no_grad():
        query_embedding = embedding_model(**inputs).last_hidden_state[:, 0].cpu().numpy()

    faiss.normalize_L2(query_embedding)
    dense_scores, dense_indices = index.search(query_embedding, k)

    # Normalize dense scores
    dense_scores = normalize_scores(dense_scores[0])

    # Sparse scores
    query_tokens = inputs["input_ids"][0].cpu().numpy()
    sparse_query = np.bincount(query_tokens, minlength=tokenizer.vocab_size)
    sparse_query = sparse_query / (sparse_query.sum() + 1e-8)
    sparse_scores = np.dot(sparse_weights, sparse_query)

    # Normalize sparse scores
    sparse_scores = normalize_scores(sparse_scores)

    # Combine scores (weighted sum)
    dense_weight = 0.7
    sparse_weight = 0.3
    
    combined_scores = {}
    for idx, (score, dense_idx) in enumerate(zip(dense_scores, dense_indices[0])):
        combined_scores[dense_idx] = dense_weight * score + sparse_weight * sparse_scores[dense_idx]

    # Sort by combined score
    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Return top k results with the corresponding text and URLs
    return [(chunks[idx], score, chunk_to_url[idx]) for idx, score in sorted_results[:k]]


In [13]:
# Step 7: Rerank Results
def rerank_results(query: str, results: list):
    """Rerank results using the cross-encoder."""
    pairs = [(query, result[0]) for result in results]
    inputs = rerank_tokenizer(pairs, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = rerank_model(**inputs).logits.squeeze().cpu().numpy()
    reranked = sorted([(results[i][0], logits[i]) for i in range(len(results))], key=lambda x: x[1], reverse=True)
    return reranked


In [14]:
# Step 8: Cosine Similarity to Check Relevance
def compute_similarity(query_embedding: np.ndarray, document_embeddings: np.ndarray) -> np.ndarray:
    """Compute cosine similarity between the query and document embeddings."""
    return cosine_similarity(query_embedding, document_embeddings)



In [15]:
def check_query_relevance(query: str, reranked_results: list, threshold: float = 0.5, k: int = 5):
    """Check the relevance of the query using cosine similarity."""
    
    # Step 1: Generate the embedding for the query
    query_inputs = tokenizer(query, return_tensors="pt", max_length=8192, truncation=True).to(device)
    with torch.no_grad():
        query_embedding = embedding_model(**query_inputs).last_hidden_state[:, 0]
        query_embedding = query_embedding.detach().cpu().numpy()  # Detach and convert to numpy
    
    # Step 2: Get the embeddings for the top k documents from reranked_results
    top_docs = [result[0] for result in reranked_results[:k]]
    top_doc_embeddings = np.array([
        embedding_model(**tokenizer(doc, return_tensors="pt", truncation=True, padding=True, max_length=8192).to(device)).last_hidden_state[:, 0]
        .detach().cpu().numpy()
        for doc in top_docs
    ])
    
    # Step 3: Ensure embeddings are 2D for cosine similarity
    query_embedding = query_embedding.reshape(1, -1)  # Reshape to (1, hidden_size)
    top_doc_embeddings = top_doc_embeddings.reshape(k, -1)  # Reshape to (k, hidden_size)

    # Step 4: Compute cosine similarity between the query and the top k documents
    similarities = compute_similarity(query_embedding, top_doc_embeddings)

    # Step 5: Check the highest similarity score
    max_similarity = max(similarities[0])  # Get the highest similarity score from the result
    print(f"Max similarity score: {max_similarity:.4f}")

    if max_similarity < threshold:
        print("Query is classified as irrelevant.")
        return "Sorry, your query seems unrelated to the documentation. Please refine your query."
    else:
        print("Query is classified as relevant.")
        return reranked_results  # Return the reranked results if relevant



In [17]:
sitemap_url = "https://nextjs.org/sitemap.xml"
urls = fetch_sitemap(sitemap_url)

In [19]:
chunks, chunk_to_url = [], {}
for url in tqdm(urls, desc="Processing URLs"):
    content = fetch_content(url)
    if content:
        url_chunks = chunk_text(content)
        chunk_to_url.update({len(chunks) + i: url for i in range(len(url_chunks))})
        chunks.extend(url_chunks)

Processing URLs:   1%|          | 6/568 [00:01<01:49,  5.15it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8676 > 8192). Running this sequence through the model will result in indexing errors
Processing URLs: 100%|██████████| 568/568 [02:27<00:00,  3.85it/s]


In [22]:
# Generate embeddings and sparse weights
embeddings, sparse_weights = generate_embeddings(chunks)

Generating embeddings: 100%|██████████| 77/77 [06:39<00:00,  5.19s/it]
Generating sparse weights: 100%|██████████| 77/77 [00:01<00:00, 59.48it/s]


In [24]:
print(f"Shape of sparse_weights: {sparse_weights.shape}")
print(f"First few rows of sparse_weights: {sparse_weights[:5]}")

Shape of sparse_weights: (2462, 250002)
First few rows of sparse_weights: [[0.00194175 0.00194175 0.00194175 ... 0.         0.         0.        ]
 [0.00194175 0.         0.00194175 ... 0.         0.         0.        ]
 [0.00194175 0.00194175 0.00194175 ... 0.         0.         0.        ]
 [0.00194175 0.25825243 0.00194175 ... 0.         0.         0.        ]
 [0.00194175 0.00194175 0.00194175 ... 0.         0.         0.        ]]


In [26]:
import scipy.sparse

# Convert the sparse_weights to a sparse matrix format
sparse_weights_sparse = scipy.sparse.csr_matrix(sparse_weights)

# Save the sparse matrix in .npz format
scipy.sparse.save_npz("sparse_weights.npz", sparse_weights_sparse)

print("Sparse weights saved successfully!")

Sparse weights saved successfully!


In [27]:
# Load FAISS index
index_path = "faiss_index.bin"
index = faiss.read_index(index_path)
print("FAISS index loaded successfully!")

# Load Sparse Weights from .npz file
sparse_weights_loaded = scipy.sparse.load_npz("sparse_weights.npz")
print("Sparse weights loaded successfully!")

# Convert sparse matrix to dense numpy array if needed for compatibility
# If you want to use sparse format, make sure your hybrid_search function is compatible with sparse matrices
sparse_weights_dense = sparse_weights_loaded.toarray()

FAISS index loaded successfully!
Sparse weights loaded successfully!


In [32]:
# Now you can use these loaded values in your pipeline
query = "What is next"  # An irrelevant query

# Call the hybrid_search function with the loaded FAISS index and sparse weights
results = hybrid_search(query, index_path, sparse_weights_dense, chunks)

# Perform reranking
reranked_results = rerank_results(query, results)

# Check relevance based on similarity
final_results = check_query_relevance(query, reranked_results)

# Display results or fallback message
if isinstance(final_results, str):
    print(final_results)  # Print fallback message if irrelevant
else:
    print("\nSearch Results:")
    for text, score in final_results:
        print(f"\nScore: {score:.4f}")
        print(f"Text: {text[:200]}...")

Max similarity score: 0.5519
Query is classified as relevant.

Search Results:

Score: -0.0056
Text: ly appreciate your feedback. If you are experiencing any issues with Next.js, please open an issue , or start a new discussion , and we will investigate....

Score: -0.0166
Text: , and other external system synchronization. Since these tasks are not directly related to the response, the user should not have to wait for them to complete. Deferring the work after responding to t...

Score: -0.0666
Text: green for performant applications. The following options are available for the next build command: Option Description -h, --help Show all available options. [directory] A directory on which to build t...

Score: -0.0666
Text: green for performant applications. The following options are available for the next build command: Option Description -h, --help Show all available options. [directory] A directory on which to build t...

Score: -0.0846
Text: to do next, we recommend the following sec