# Imports

In [122]:
import numpy as np
import json
import bm25s
import gensim.downloader as dl
from transformers import AutoModel, AutoTokenizer
import torch
from tqdm import tqdm

# Reading and parsing the files

In [123]:
def load_jsonl(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        filepath: Path of a UTF-8 encoded file holding one JSON value per line.

    Returns:
        list: The parsed value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines carry no record and would make
        # json.loads raise, so they are skipped instead of aborting the load.
        return [json.loads(line) for line in f if line.strip()]

# Corpus and evaluation queries, one JSON object per line in each file.
docs = load_jsonl("docs.jsonl")      # List of dicts with keys ["doc_id", "text"]
queries = load_jsonl("queries.jsonl")  # List of dicts; later cells read the keys "query" and "doc_id"



# Lexical Indexing with BM25 (sparse)

In [189]:
# Prepare the corpus for BM25: one raw text string per document, in the same
# order as `docs` (bm25_retrieve indexes `docs` with the positions returned
# by the index).
corpus = [d["text"] for d in docs]

# Initialize BM25 index
index = bm25s.BM25()

# 1) Tokenize the corpus and retrieve both tokens & dictionary.
#    `dictionary` is the token -> id vocabulary; bm25_retrieve uses it to map
#    query words onto the same ids.
corpus_tokens, dictionary = bm25s.tokenize(corpus)

# 2) Build the index using these tokens
index.index(corpus_tokens)


                                                                            

## Retrieve top-k documents for a query

In [205]:
def bm25_retrieve(query, k=5):
    """Return the doc_ids of the top-k BM25 matches for a query string.

    Relies on the notebook-level `dictionary` (token -> id vocabulary of the
    corpus), `index` (the bm25s.BM25 index) and `docs`.

    Args:
        query: Raw query text.
        k: Number of documents to retrieve.

    Returns:
        list: `doc_id` values of the retrieved documents in the order the
        index returns them; an empty list when no query token occurs in the
        corpus vocabulary (a message is printed in that case).
    """
    # Tokenize the query with the same bm25s pipeline that produced
    # `dictionary`. A plain str.split() keeps the original casing and any
    # attached punctuation, so such query words would never match an entry of
    # the vocabulary built by bm25s.tokenize.
    q_tokens = bm25s.tokenize([query], return_ids=False, show_progress=False)[0]

    # Map tokens to corpus ids, dropping empty and out-of-vocabulary tokens.
    ids = [dictionary[t] for t in q_tokens if t and t in dictionary]

    if not ids:
        print(f"No valid tokens found for query: {query}")
        return []

    # Retrieve top-k results using BM25 index
    results = index.retrieve(query_tokens=[ids], k=k)

    # Only one query was submitted, so its hits are in row 0.
    doc_indices = results.documents[0]

    return [docs[doc_idx]['doc_id'] for doc_idx in doc_indices]



# Smoke test: run BM25 retrieval on the first few queries. The returned
# doc_id lists are discarded; the only visible effect is the message that
# bm25_retrieve prints when a query has no usable token.
for query in queries[:5]:  # Check the first few queries
    bm25_retrieve(query["query"], k=5)



                                                     

In [70]:
# Reload the corpus and the queries through load_jsonl so this cell reads the
# files exactly like the initial load: explicit UTF-8 decoding instead of the
# platform default encoding.
docs = load_jsonl("docs.jsonl")
queries = load_jsonl("queries.jsonl")

# Creating Dense Indices

In [217]:
# Load the pre-trained word2vec model through gensim's downloader API.
# Later cells look tokens up with `model[token]` and handle the KeyError
# raised for tokens the model does not contain.
model = dl.load("word2vec-google-news-300")

In [218]:
import bm25s

# Example sentence used only to illustrate what bm25s.tokenize returns.
text = "This is an example sentence for GloVe encoding."

# Tokenizing with bm25s: the printed result is a Tokenized(ids=..., vocab=...)
# object; `vocab` maps each kept, lowercased token to its integer id.
tokenized = bm25s.tokenize([text])
print(tokenized)


                                                    

Tokenized(ids=[[0, 1, 2, 3]], vocab={'example': 0, 'sentence': 1, 'glove': 2, 'encoding': 3})




## Combine static word vectors

In [219]:


doc_ids_static = []
vecs_static_list = []  # will collect one averaged vector per document

# Dimensionality of the loaded word vectors. The all-zero fallback must use it
# so every row has the same length; a hard-coded size that differs from the
# model's would make the final array ragged.
static_dim = model.vector_size

# Process each document
for d in docs:
    text = d["text"]
    # Unique tokens of the document as produced by bm25s.tokenize.
    tokens = bm25s.tokenize(text).vocab.keys()

    # Keep only the tokens the word-vector model knows; unknown tokens are skipped.
    vecs = [model[token] for token in tokens if token in model]

    if vecs:
        # Average the vectors of the known tokens
        doc_vec = np.mean(vecs, axis=0)
    else:
        # Fallback to a zero vector if the document has no known token
        doc_vec = np.zeros(static_dim)

    vecs_static_list.append(doc_vec)
    doc_ids_static.append(d["doc_id"])

# Convert the list of vectors into a numpy array
vecs_static = np.array(vecs_static_list)  # shape = (len(docs), static_dim)

print("Embedding creation complete!")

                                                    

Embedding creation complete!




In [220]:
# Sanity check: (number of documents, embedding dimension).
vecs_static.shape

(5000, 300)

In [221]:
def retrieve_dense_static(query_text, k=10):
    """Return the doc_ids of the k documents closest to the query by cosine similarity.

    The query is embedded as the mean of the word vectors of its unique
    tokens (same recipe as the document matrix `vecs_static`). Relies on the
    notebook-level `model`, `vecs_static` and `doc_ids_static`.

    Args:
        query_text: Raw query text.
        k: Maximum number of documents to return.

    Returns:
        list: `doc_id` values ordered from most to least similar; an empty
        list when no query token has a word vector.
    """
    tokens = bm25s.tokenize(query_text).vocab.keys()

    # Keep only the tokens the word-vector model knows.
    vecs = [model[token] for token in tokens if token in model]
    if not vecs:
        # Nothing to compare against: an all-zero query vector would give every
        # document the same undefined similarity.
        return []

    query_vec = np.mean(vecs, axis=0)
    doc_norms = np.linalg.norm(vecs_static, axis=1)

    # Cosine similarity. The epsilon keeps documents stored as zero vectors at
    # score 0 instead of NaN; NaN entries are placed last by argsort, so a
    # reversed ascending sort would have ranked them first.
    scores = (vecs_static @ query_vec) / (doc_norms * np.linalg.norm(query_vec) + 1e-8)

    topk_indices = np.argsort(-scores)[:k]
    return [doc_ids_static[i] for i in topk_indices]




## Combine contextual word-vectors

In [222]:


bert_model_name = "roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)
bert_model.eval()

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

doc_ids_bert = []
bert_vecs_list = []

for d in tqdm(docs):
    text = d["text"]
    # One document per forward pass; inputs longer than 512 tokens are truncated.
    encoding = bert_tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
        # outputs.last_hidden_state shape = (batch_size, seq_len, hidden_dim)
        hidden_states = outputs.last_hidden_state[0]  # shape (seq_len, hidden_dim)
        # Mean over all token positions gives one vector per document.
        avg_vec = hidden_states.mean(dim=0)           # shape (hidden_dim,)
    avg_vec_np = avg_vec.cpu().numpy()
    doc_ids_bert.append(d["doc_id"])
    bert_vecs_list.append(avg_vec_np)

bert_vecs = np.array(bert_vecs_list)  # shape = (len(docs), hidden_dim)
print("BERT embedding creation complete!")
print(bert_vecs.shape)

# Pass the file names to np.save so it opens and closes the files itself;
# handing it an open() result left the handles unclosed.
np.save("bert_doc_ids.npy", np.array(doc_ids_bert))
np.save("bert_vecs.npy", bert_vecs)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 5000/5000 [02:06<00:00, 39.48it/s]

BERT embedding creation complete!
(5000, 768)





In [223]:
def retrieve_bert(query_text, k=10):
    """Return the doc_ids of the k documents closest to the query by cosine similarity.

    The query is embedded like the documents in `bert_vecs`: the mean of the
    encoder's last hidden states over all token positions. Relies on the
    notebook-level `bert_tokenizer`, `bert_model`, `bert_vecs` and
    `doc_ids_bert`.

    Args:
        query_text: Raw query text (truncated to 512 tokens).
        k: Maximum number of documents to return.

    Returns:
        list: `doc_id` values ordered from most to least similar.
    """
    encoding = bert_tokenizer(
        query_text, truncation=True, max_length=512, return_tensors="pt"
    )
    # Send the inputs to whichever device the model lives on instead of
    # assuming CUDA, so retrieval also works when the model is on the CPU.
    model_device = bert_model.device
    input_ids = encoding["input_ids"].to(model_device)
    attention_mask = encoding["attention_mask"].to(model_device)
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state[0]
        query_vec = hidden_states.mean(dim=0)
    query_vec_np = query_vec.cpu().numpy()

    # compute cos similarities with bert_vecs; the epsilon guards against a
    # zero denominator
    dot_scores = bert_vecs @ query_vec_np
    norm_docs = np.linalg.norm(bert_vecs, axis=1)
    norm_query = np.linalg.norm(query_vec_np)
    scores = dot_scores / (norm_docs * norm_query + 1e-8)

    # Negating the scores turns the ascending argsort into a descending ranking.
    topk_indices = np.argsort(-scores)[:k]
    topk_doc_ids = [doc_ids_bert[i] for i in topk_indices]
    return topk_doc_ids


## Use a pre-trained text embedder

In [40]:
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/multi-qa-MiniLM-L6-dot-v1"
st_model = SentenceTransformer(model_name)

doc_ids_st = [d["doc_id"] for d in docs]

# encode() handles tokenization internally and accepts a list of texts, which
# it processes in batches; this replaces one encode() call per document.
# Embeddings are left un-normalized.
st_vecs = np.asarray(
    st_model.encode([d["text"] for d in docs], normalize_embeddings=False)
)  # shape = (len(docs), embedding_dim)

# Pass the file names to np.save so it opens and closes the files itself;
# handing it an open() result left the handles unclosed.
np.save("st_doc_ids.npy", np.array(doc_ids_st))
np.save("st_vecs.npy", st_vecs)


In [213]:
def retrieve_st(query_text, k=10):
    """Return the doc_ids of the k documents with the highest dot-product score.

    Relies on the notebook-level `st_model`, `st_vecs` and `doc_ids_st`.

    Args:
        query_text: Raw query text.
        k: Maximum number of documents to return.

    Returns:
        list: `doc_id` values ordered from highest to lowest score.
    """
    q_emb = st_model.encode(query_text, normalize_embeddings=False)

    # Rank by the raw dot product of the un-normalized embeddings. The previous
    # code also computed a cosine score here but never used it for the ranking,
    # so that dead computation is dropped; the ranking itself is unchanged.
    # NOTE(review): the checkpoint loaded for st_model is a "-dot-" variant,
    # which suggests dot-product scoring is intended -- confirm on the model card.
    dot_scores = st_vecs @ q_emb

    topk_indices = np.argsort(-dot_scores)[:k]
    topk_doc_ids = [doc_ids_st[i] for i in topk_indices]
    return topk_doc_ids


# Evaluations

## Evaluation functions

In [106]:
def recall_at_k(index_retrieval_fn, queries, k=20):
    """Fraction of queries whose relevant document appears in the top-k results.

    Args:
        index_retrieval_fn: Callable invoked as `fn(query_text, k=k)` that
            returns a collection of doc_ids.
        queries: Sequence of dicts with the keys "query" (text) and "doc_id"
            (the single relevant document).
        k: Number of results requested from the retrieval function.

    Returns:
        float: hits / number of queries; 0.0 when `queries` is empty.
    """
    # An empty query set would otherwise divide by zero.
    if len(queries) == 0:
        return 0.0

    hits = sum(
        1 for q in queries if q["doc_id"] in index_retrieval_fn(q["query"], k=k)
    )
    return hits / len(queries)


In [109]:
def mrr(index_retrieval_fn, queries, k=100):
    """Mean reciprocal rank of the relevant document over all queries.

    Args:
        index_retrieval_fn: Callable invoked as `fn(query_text, k=k)` that
            returns doc_ids in ranked order.
        queries: Sequence of dicts with the keys "query" (text) and "doc_id"
            (the single relevant document).
        k: Number of results requested from the retrieval function.

    Returns:
        float: Average of 1 / rank (ranks start at 1) of the relevant
        document; a query whose relevant document is not among the returned
        results contributes 0. Returns 0.0 when `queries` is empty.
    """
    # An empty query set would otherwise divide by zero.
    if len(queries) == 0:
        return 0.0

    total = 0.0
    for q in queries:
        relevant_id = q["doc_id"]
        topk = index_retrieval_fn(q["query"], k=k)
        # Reciprocal rank of the first position holding the relevant document.
        for rank, doc_id in enumerate(topk, start=1):
            if doc_id == relevant_id:
                total += 1.0 / rank
                break
    return total / len(queries)


## Evaluation code

In [224]:
# Rank the first query with each of the four retrievers, in the order
# BM25, static word vectors, contextual (BERT-style) vectors, sentence embedder.
first_query = queries[0]["query"]
bm25_top, static_top, bert_top, st_top = (
    retrieve(first_query, k=10)
    for retrieve in (bm25_retrieve, retrieve_dense_static, retrieve_bert, retrieve_st)
)

# q1.txt gets one line per retriever: its doc_ids separated by single spaces.
with open("q1.txt", "w") as f:
    f.writelines(
        " ".join(top) + "\n" for top in (bm25_top, static_top, bert_top, st_top)
    )


                                                     

In [225]:
# Recall@20 and MRR (also computed on the top 20) for every retriever.
# Each pair is evaluated left to right: recall first, then MRR.

# BM25
r_bm25, m_bm25 = (
    recall_at_k(bm25_retrieve, queries, k=20),
    mrr(bm25_retrieve, queries, k=20),
)

# Static
r_static, m_static = (
    recall_at_k(retrieve_dense_static, queries, k=20),
    mrr(retrieve_dense_static, queries, k=20),
)

# BERT
r_bert, m_bert = (
    recall_at_k(retrieve_bert, queries, k=20),
    mrr(retrieve_bert, queries, k=20),
)

# Sentence-Transformers
r_st, m_st = (
    recall_at_k(retrieve_st, queries, k=20),
    mrr(retrieve_st, queries, k=20),
)

# scores.txt gets one "recall mrr" line per retriever, in the order above.
with open("scores.txt", "w") as f:
    for recall_value, mrr_value in (
        (r_bm25, m_bm25),
        (r_static, m_static),
        (r_bert, m_bert),
        (r_st, m_st),
    ):
        f.write(f"{recall_value} {mrr_value}\n")


                                                     