In [19]:
# === Step 0. Import & Force MPS ===
import warnings; warnings.filterwarnings("ignore")

In [20]:
import torch
from sentence_transformers import CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [21]:
# Pick the compute device for the reranker.
# Apple's Metal Performance Shaders (MPS) backend is preferred, since this
# notebook targets macOS. When MPS is unavailable (non-Apple hardware, or a
# PyTorch build without MPS support) fall back to CUDA, then CPU, so the
# notebook still runs end to end instead of stopping at this cell.
# getattr() guards PyTorch versions where `torch.backends.mps` does not exist.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
    print("⚠️ MPS not available (check macOS + PyTorch MPS support); falling back to CUDA")
else:
    device = "cpu"
    print("⚠️ MPS not available (check macOS + PyTorch MPS support); falling back to CPU")
print(f"✅ Device fixed to: {device}")

✅ Device fixed to: mps


In [22]:
print("\n=== Step 1. Example Data ===")

# Toy corpus: five short texts that the retrieval stages below rank.
docs = [
    "New York pizza restaurant guide: Brooklyn, Manhattan highlights",
    "Chicago deep-dish pizza vs New York thin crust",
    "How to book a table at a famous pizza place in NYC",
    "Los Angeles vegan food tour",
    "Brooklyn hidden pizza gems for locals"
]

# Search request that every document is scored against.
query = "New York best pizza recommendation"

# Echo the inputs, numbering each document by its position in `docs`.
print(f"- Query: {query}")
print("\n".join(f"  [{i}] {d}" for i, d in enumerate(docs)))


=== Step 1. Example Data ===
- Query: New York best pizza recommendation
  [0] New York pizza restaurant guide: Brooklyn, Manhattan highlights
  [1] Chicago deep-dish pizza vs New York thin crust
  [2] How to book a table at a famous pizza place in NYC
  [3] Los Angeles vegan food tour
  [4] Brooklyn hidden pizza gems for locals


In [23]:
print("\n=== Step 2.1: TF-IDF transformation & similarity calculation ===")

# Vectorizer over single words and adjacent word pairs (ngram_range=(1, 2)).
tfv = TfidfVectorizer(min_df=1, ngram_range=(1, 2))

# The query is appended as the final row, so the documents and the query are
# fitted together and share one vocabulary and one set of weights.
corpus = docs + [query]
X = tfv.fit_transform(corpus)

# Every row except the last is a document; the last row is the query.
X_docs = X[:-1]
X_q = X[-1]

# The similarity matrix has one row per document and a single column (the
# query); flatten it into a 1-D vector of per-document scores.
sims = cosine_similarity(X_docs, X_q).ravel()
print("Similarity scores:", sims)



=== Step 2.1: TF-IDF transformation & similarity calculation ===
Similarity scores: [0.18674964 0.17132846 0.0237282  0.         0.03217088]


In [25]:
print("\n=== Step 2.2: Select top-k candidates ===")

# Number of first-stage hits handed to the reranker.
k = 3

# argsort is ascending, so sorting the negated scores lists the highest
# similarity first; keep the first k positions as plain Python ints.
topk_idx = np.argsort(-sims)[:k].tolist()

# One (index, document, similarity score) tuple per retained document.
candidates = []
for i in topk_idx:
    candidates.append((i, docs[i], float(sims[i])))

# Show each candidate alongside its TF-IDF similarity.
for i, d, s in candidates:
    print(f"  idx={i} | tfidf_sim={s:.4f} | {d}")



=== Step 2.2: Select top-k candidates ===
  idx=0 | tfidf_sim=0.1867 | New York pizza restaurant guide: Brooklyn, Manhattan highlights
  idx=1 | tfidf_sim=0.1713 | Chicago deep-dish pizza vs New York thin crust
  idx=4 | tfidf_sim=0.0322 | Brooklyn hidden pizza gems for locals


In [26]:
# Name of the Cross-Encoder model used as the (lightweight) reranker.
model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"

print("\n=== Step 3. Reranker (Cross-Encoder, MPS) ===")

# Announce the model and target device before the load starts.
print(f"- Loading model: {model_name} (device={device})")

# Instantiate the Cross-Encoder on the device chosen earlier in the notebook.
reranker = CrossEncoder(model_name, device=device)

# Reached only if the constructor above did not raise.
print("✅ Reranker loaded successfully")



=== Step 3. Reranker (Cross-Encoder, MPS) ===
- Loading model: cross-encoder/ms-marco-MiniLM-L-6-v2 (device=mps)
✅ Reranker loaded successfully


In [27]:
print("\n=== Step 4. Reranking Execution ===")

# The reranker takes (query, document) pairs: pair the query with the text
# (second field) of every candidate.
pairs = [(query, cand[1]) for cand in candidates]

# Relevance score for each pair; the zip below relies on `scores` being
# aligned with `pairs`, and therefore with `candidates`.
scores = reranker.predict(pairs)

# Extend each candidate to (index, document, tfidf score, rerank score) ...
reranked = [
    (i, d, tfidf_s, float(s))
    for (i, d, tfidf_s), s in zip(candidates, scores)
]
# ... then order in place by rerank score; the negated key puts the highest first.
reranked.sort(key=lambda row: -row[3])

# Report both scores for every candidate in the new order.
for i, d, tfidf_s, rr_s in reranked:
    print(f"  idx={i} | tfidf={tfidf_s:.4f} | rerank={rr_s:.4f} | {d}")

# The first entry after sorting is the final answer; keep its document text.
best_doc = reranked[0][1]
print("\n✅ Final selected document:", best_doc)



=== Step 4. Reranking Execution ===
  idx=0 | tfidf=0.1867 | rerank=1.1501 | New York pizza restaurant guide: Brooklyn, Manhattan highlights
  idx=1 | tfidf=0.1713 | rerank=-1.2569 | Chicago deep-dish pizza vs New York thin crust
  idx=4 | tfidf=0.0322 | rerank=-3.9426 | Brooklyn hidden pizza gems for locals

✅ Final selected document: New York pizza restaurant guide: Brooklyn, Manhattan highlights
