In [None]:
import nltk
# Fetch the Punkt tokenizer resources used by the nltk.word_tokenize calls
# later in the notebook. Both package names are requested in turn.
for resource in ('punkt', 'punkt_tab'):
    download_ok = nltk.download(resource)
download_ok  # final expression: shows the status returned by the last download

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Suha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Suha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [None]:
"""
This notebook builds hybrid retrieval components for the SHL recommender:
- Dense embeddings (SentenceTransformer + FAISS)
- Sparse index (BM25)
- Hybrid scoring function

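Output (paths relative to this notebook):
 - ../data/embeddings.faiss
 - ../data/meta_catalog.json (JSON Lines: one catalog record per line)
 - ../data/bm25.pkl
 - ../data/sample_results.csv (preview of the example query results)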
"""

import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import nltk
import json
import pickle

# Tokenizer data for the nltk.word_tokenize call in the BM25 section below.
nltk.download('punkt')

# ============================
# 1. Read the cleaned catalog
# ============================
CATALOG_PATH = '../data/catalog_clean.csv'
df = pd.read_csv(CATALOG_PATH)
print(f"Loaded {len(df)} assessments from {CATALOG_PATH}")

# ============================
# 2. Dense side: sentence embeddings stored in a FAISS index
# ============================
print("Creating sentence embeddings (MiniLM-L6-v2)...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Rows without a description are embedded as the empty string.
texts = df['description'].fillna('').tolist()
embeddings = model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# In-place L2 normalisation: with unit-length rows, the inner product used by
# IndexFlatIP behaves as cosine similarity.
faiss.normalize_L2(embeddings)

# Flat inner-product index sized to the embedding width; vectors are added in
# catalog row order.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)

# Persist the index next to the catalog rows it was built from
# (the metadata is written as one JSON record per line).
faiss.write_index(index, '../data/embeddings.faiss')
df.to_json('../data/meta_catalog.json', orient='records', lines=True)
print("FAISS index and metadata saved.")

# ============================
# 3. Sparse side: BM25 over lower-cased, word-tokenized descriptions
# ============================
print("Building BM25 index...")
tokenized = []
for text in texts:
    tokenized.append(nltk.word_tokenize(text.lower()))
bm25 = BM25Okapi(tokenized)

with open('../data/bm25.pkl', 'wb') as bm25_file:
    pickle.dump(bm25, bm25_file)
print("BM25 index saved.")

# ----------------------------
# Define Hybrid Search Function
# ----------------------------

def _min_max_scale(scores):
    """Rescale a 1-D score array to [0, 1]; a constant (or empty) array maps to zeros."""
    scores = np.asarray(scores, dtype=np.float64)
    if scores.size == 0:
        return scores
    lowest = scores.min()
    span = scores.max() - lowest
    if not span > 0:  # constant vector (or NaN span): no ranking information
        return np.zeros_like(scores)
    return (scores - lowest) / span


def hybrid_search(query, top_k=10, w_dense=0.55, w_sparse=0.45, normalize=False):
    """
    Rank catalog rows by a weighted sum of dense (FAISS) and sparse (BM25) scores.

    Uses the module-level ``df``, ``bm25``, ``model`` and ``index`` objects.
    Assumes FAISS ids and BM25 document positions are both row positions in
    ``df`` (true when all three are built from the same frame in the same order).

    Args:
        query: Free-text query string.
        top_k: Maximum number of rows to return; values <= 0 yield an empty frame.
        w_dense: Weight applied to the FAISS similarity scores.
        w_sparse: Weight applied to the BM25 scores.
        normalize: If True, min-max rescale each score vector to [0, 1] before
            weighting so both weights act on comparable scales. The default
            (False) combines the raw scores.

    Returns:
        DataFrame with the columns ``name``, ``url``, ``description`` and
        ``test_type`` for the best-scoring rows, best first. Rows with equal
        scores keep their catalog order.
    """
    columns = ['name', 'url', 'description', 'test_type']
    n_docs = len(df)
    if top_k <= 0 or n_docs == 0:
        return df.iloc[0:0][columns]

    # Sparse (BM25): one score per document, in document order.
    q_tokens = nltk.word_tokenize(query.lower())
    sparse_scores = np.asarray(bm25.get_scores(q_tokens), dtype=np.float64)

    # Dense (FAISS): search() returns hits ordered best-first together with the
    # ids of the matching documents, so the scores must be scattered back to
    # document order before they can be added to the BM25 scores.
    q_emb = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    hit_scores, hit_ids = index.search(q_emb, n_docs)
    hit_scores, hit_ids = hit_scores[0], hit_ids[0]
    # FAISS pads missing hits with id -1; also ignore ids beyond the catalog.
    in_range = (hit_ids >= 0) & (hit_ids < n_docs)
    # Documents without a hit keep -1.0, the lowest inner product possible
    # between unit-length vectors.
    dense_scores = np.full(n_docs, -1.0, dtype=np.float64)
    dense_scores[hit_ids[in_range]] = hit_scores[in_range]

    if normalize:
        dense_scores = _min_max_scale(dense_scores)
        sparse_scores = _min_max_scale(sparse_scores)

    # Combine; a stable sort on the negated scores gives descending order,
    # keeps ties in catalog order, and places NaN scores last.
    final_scores = w_dense * dense_scores + w_sparse * sparse_scores
    top_ids = np.argsort(-final_scores, kind='stable')[:top_k]
    return df.iloc[top_ids][columns]

# Smoke test: run one representative hiring query through the hybrid retriever
# and show the name and test type of the five best matches.
print("\nExample query test:")
example = "Hiring for a Java developer who can collaborate with teams"
results = hybrid_search(query=example, top_k=5)
preview = results.loc[:, ['name', 'test_type']]
print(preview)

# Keep a CSV copy of the full result rows (without the index column) for inspection.
results.to_csv(path_or_buf='../data/sample_results.csv', index=False)
print("Hybrid search test completed. Sample results saved.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Suha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loaded 24 assessments from ../data/catalog_clean.csv
Creating sentence embeddings (MiniLM-L6-v2)...


Batches: 100%|██████████| 1/1 [00:00<00:00, 11.65it/s]


FAISS index and metadata saved.
Building BM25 index...
BM25 index saved.

Example query test:
                                    name test_type
23                         ADO.NET (New)   Unknown
22  Accounts Receivable Simulation (New)   Unknown
21             Accounts Receivable (New)   Unknown
20     Accounts Payable Simulation (New)   Unknown
19                Accounts Payable (New)   Unknown
Hybrid search test completed. Sample results saved.
