# Search API

## Import dependencies & config

In [12]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Default path of the FAISS index file; Retriever loads it with faiss.read_index.
INDEX_PATH = "qa_faiss_index_trans.index"
# Default path of the JSON file Retriever loads as its id_map
# (index ID -> original entry).
IDMAP_PATH = "id_mapping.json"
# Default SentenceTransformer model name Retriever uses to encode queries.
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"

## Define Retriever class

In [14]:
class Retriever:
    """Top-K semantic search over a prebuilt FAISS index of Q&A entries.

    A query string is embedded with a SentenceTransformer model, the FAISS
    index is searched with that embedding, and each hit's row id is mapped
    back to its original entry through ``id_map``.
    """

    def __init__(self, index_path=INDEX_PATH, id_map_path=IDMAP_PATH, model_name=MODEL_NAME):
        """
        Initialize Retriever:
        - Load the SentenceTransformer model
        - Load the FAISS index
        - Load the id_map (index ID -> original entry mapping)

        Args:
            index_path: Path of the FAISS index file.
            id_map_path: Path of the UTF-8 JSON file holding the id_map.
                ``search`` looks entries up with ``id_map.get(str(row_id))``,
                so it is expected to be a JSON object keyed by stringified
                row ids.
            model_name: Model name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        self.index = faiss.read_index(index_path)
        with open(id_map_path, "r", encoding="utf-8") as f:
            self.id_map = json.load(f)
        # Vector width of the index; query embeddings are validated against it.
        self.dim = self.index.d

    def _encode_query(self, query: str) -> np.ndarray:
        """
        Encode the query into a vector representation.
        - Embeddings are L2-normalized (``normalize_embeddings=True``)
        - NOTE(review): this only matches the index if the index was built
          with the same model and the same normalization -- confirm against
          the index-building code.

        Returns:
            A C-contiguous float32 array of shape (1, self.dim).

        Raises:
            ValueError: If the embedding width differs from the index width.
        """
        vec = self.model.encode([query], normalize_embeddings=True, convert_to_numpy=True)
        # FAISS expects contiguous float32 input.
        vec = np.ascontiguousarray(vec, dtype="float32")
        if vec.ndim != 2 or vec.shape[1] != self.dim:
            raise ValueError(
                f"Query embedding has shape {vec.shape}, but the index expects "
                f"vectors of dimension {self.dim}; was the index built with a "
                f"different model?"
            )
        return vec

    def search(self, query: str, k: int = 5) -> list:
        """
        Search the most similar Top-K entries for the input query.
        Steps:
        - Encode query
        - Run FAISS search
        - Map back the top results using id_map

        Args:
            query: Text to search for.
            k: Maximum number of results; values <= 0 yield an empty list.

        Returns:
            List of dictionaries with rank, score, question, answer, source,
            link, in the order FAISS returned them. Fewer than ``k`` items
            are returned when the index has fewer than ``k`` neighbours.
            ``score`` is the raw value reported by the index; whether larger
            or smaller is better depends on the index metric (for a distance
            metric, smaller means closer).
        """
        if k <= 0:
            return []

        qvec = self._encode_query(query)
        scores, indices = self.index.search(qvec, k)

        results = []
        for idx, score in zip(indices[0], scores[0]):
            idx = int(idx)
            if idx < 0:
                # FAISS pads missing neighbours with -1; those are not real
                # entries and must not be reported as (empty) results.
                continue
            # Entries missing from id_map fall back to empty fields.
            item = self.id_map.get(str(idx), {})
            results.append({
                "rank": len(results) + 1,
                "score": float(score),
                "question": item.get("question", ""),
                "answer": item.get("answer", ""),
                "source": item.get("source", ""),
                "link": item.get("link", ""),
            })
        return results


## Test case

In [18]:
# Build a retriever from the default index, id map and model.
retriever = Retriever()

# Sample Chinese query; fetch the three closest entries.
query = "墨尔本七夕节去哪里玩"
results = retriever.search(query, k=3)

# Print one block per hit, followed by a 50-character dashed divider.
divider = "-" * 50
for hit in results:
    print(f"[{hit['rank']}] score={hit['score']:.4f}")
    print("A:", hit['answer'])
    print("Source:", hit['source'])
    print("Link:", hit['link'])
    print(divider)

[1] score=6.3013
A: 墨尔本密室逃脱
Source: Tripadvisor
Link: httpscn.tripadvisor.comAttractions-g255100-Activities-c56-t208-Melbourne_Victoria.html
--------------------------------------------------
[2] score=6.7289
A: 
Source: 搜狐
Link: httpswww.sohu.coma816084395_121124027
--------------------------------------------------
[3] score=6.9468
A: 墨尔本射击场
Source: 亿忆
Link: httpswww.yeeyi.comnewsdetails306347
--------------------------------------------------
