# Eval: Sentence Transformer with question-result set

This notebook outlines the steps undertook to evaluate various other models out of the box with the test set under a QA set-up.

With 8192 chunk size.

# Import libraries

In [2]:
#%pip install nltk -q
#%pip install sentence_transformers -q

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

from sentence_transformers import SentenceTransformer, util

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Global variable

In [3]:
EVALS = {}
MAX_WORDS = 8192

# Helper functions

In [4]:
def split_text_into_chunks(text, tag="", max_words=MAX_WORDS, overlap_sentences=3):
    if tag == "query":
      text = "search_query: " + text
    elif tag == "opinion":
      text = "search_document: " + text
    else:
      text = text

    sentences = sent_tokenize(text)  # Tokenize into sentences
    chunks = []
    start_idx = 0  # Start index of the current chunk

    while start_idx < len(sentences):
        current_chunk = []
        current_word_count = 0
        idx = start_idx

        # Build a chunk until max_words is reached
        while idx < len(sentences) and current_word_count + len(sentences[idx].split()) <= max_words:
            current_chunk.append(sentences[idx])
            current_word_count += len(sentences[idx].split())
            idx += 1  # Move to next sentence

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        # Stop if the last chunk reaches the end of the text
        if idx >= len(sentences):
            break

        # Move start index forward but keep overlap
        start_idx = max(idx - overlap_sentences, start_idx + 1)

    return chunks


def create_embeddings(df, model, text_column, embedding_column, tag):
    df[embedding_column] = None

    for idx, text in enumerate(df[text_column]):
        chunks = split_text_into_chunks(text, tag)  # Chunk the text
        chunk_embeddings = np.array(model.encode(chunks))  # Encode all chunks (shape: [num_chunks, embedding_dim])
        df.at[idx, embedding_column] = chunk_embeddings

    return df


def evaluate_retrieval(query_emb, doc_embs, relevant_idx, top_k=5):
    # Flatten the document chunks and keep track of the document they belong to
    all_chunk_embeddings = []
    doc_chunk_mapping = []  # Mapping from chunk index to document index
    for doc_idx, doc_chunks in enumerate(doc_embs):
        for chunk_emb in doc_chunks:
            all_chunk_embeddings.append(chunk_emb)
            doc_chunk_mapping.append(doc_idx)  # Store the document index for each chunk

    # Convert to numpy array for efficient computation
    all_chunk_embeddings = np.array(all_chunk_embeddings)  # Shape: [total_chunks, 768]

    # Compute cosine similarity between query and all document chunks
    similarities = np.dot(all_chunk_embeddings, query_emb.T)  # Shape: [total_chunks,]

    # Get indices of top-k most similar chunks
    top_k_chunk_indices = np.argsort(similarities.flatten())[::-1][:top_k]

    # Find the documents corresponding to the top-k chunks
    retrieved_docs = [doc_chunk_mapping[idx] for idx in top_k_chunk_indices]

    # Count the number of times the relevant document appears in the top-k retrieved documents
    relevant_document_hits = sum(1 for doc_idx in retrieved_docs if doc_idx == relevant_idx)

    # Compute rank (based on first appearance of the relevant document)
    rank = None
    for idx, doc_idx in enumerate(retrieved_docs):
        if doc_idx == relevant_idx:
            rank = idx + 1
            break

    # Compute metrics
    is_hit = relevant_document_hits > 0  # Hit if any of the top-k chunks belong to the relevant document
    mrr = 0 if rank is None else 1 / rank

    return {
        "is_hit": is_hit,
        "mrr": mrr,
        "retrieved": retrieved_docs,
        "expected": relevant_idx
    }


def evaluate_model(df, model, query_column, query_emb_column, opinion_column, opinion_emb_column, tags=["",""], top_k=5):
    # Step 1: Create embeddings for queries and opinions
    df = create_embeddings(df, model, query_column, query_emb_column, tag=tags[0])
    df = create_embeddings(df, model, opinion_column, opinion_emb_column, tag=tags[1])

    # Step 2: Evaluate retrieval for each query
    results_dict = {}
    for idx, query_row in df.iterrows():
        opinion_id = query_row["opinion_id"]
        query_emb = query_row[query_emb_column]
        doc_embs = df[opinion_emb_column]
        results_dict[opinion_id] = evaluate_retrieval(query_emb, doc_embs, idx, top_k=top_k)

    # Step 3: Convert results to DataFrame
    results_df = pd.DataFrame.from_dict(results_dict, orient='index').reset_index().rename(columns={'index': 'opinion_id'})

    # Step 4: Merge results with the original DataFrame
    final_df = df.merge(results_df, how="left", on="opinion_id")

    # Step 5: Calculate evaluation metrics (Hit Rate and MRR)
    model_name = model.name
    EVALS[model_name] = {}
    EVALS[model_name]["hit_rate"] = final_df["is_hit"].mean()
    EVALS[model_name]["mrr"] = final_df["mrr"].mean()

    return final_df

# Load the data

In [5]:
df = pd.read_csv("outputs/3.test.csv")
df = df[["opinion_id", "opinion_word_count", "opinion", "relevant_query_qstn"]]
df.head()

Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...


# Evaluate model: Alibaba-NLP/gte-large-en-v1.5

In [6]:
%%time

model_name = "Alibaba-NLP/gte-large-en-v1.5"
model = SentenceTransformer(model_name, trust_remote_code=True)
model.name = model_name
result1 = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings")

result1.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

CPU times: user 6min 55s, sys: 4.97 s, total: 7min
Wall time: 7min 58s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.40575194, 0.38242242, -1.2533504, -0.0408...","[[-0.3680863, 0.31216347, -0.02005817, 0.27055...",True,1.0,"[0, 381, 55, 303, 266]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[-0.0030804283, -1.2400904, 0.0840652, -0.061...","[[0.5016187, 0.17687476, -1.1540267, -0.510508...",True,1.0,"[1, 397, 349, 109, 398]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.1717992, 0.04643109, -0.43701205, 0.38527...","[[0.31887192, 0.059133492, 0.92121917, 0.38830...",True,0.2,"[425, 382, 32, 367, 2]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[-0.5754458, 0.287126, -0.71448034, -0.323354...","[[-0.22347826, -0.09282645, 0.70723313, 0.3846...",True,1.0,"[3, 16, 174, 382, 124]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.58821315, -0.092121914, -0.43259895, 0.487...","[[0.6029626, 0.3728198, -0.27950096, 0.2011179...",True,1.0,"[4, 90, 147, 124, 57]",4


In [7]:
result1.to_csv("4b.eval/QA-8192/3.result1.csv", index=False)
len(result1)

450

In [8]:
EVALS

{'Alibaba-NLP/gte-large-en-v1.5': {'hit_rate': 0.8555555555555555,
  'mrr': 0.7822592592592592}}

# Evaluate model: thenlper/gte-large

In [9]:
#model_name = "thenlper/gte-large"
#model = SentenceTransformer(model_name)
#model.name = model_name
#result2 = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings")

#result2.head()

In [10]:
#result2.to_csv("4b.eval/QA-8192/3.result2.csv", index=False)
#len(result2)

In [11]:
#EVALS

# Evaluate model: nomic-ai/nomic-embed-text-v1

In [12]:
%%time

model_name = "nomic-ai/nomic-embed-text-v1"
model = SentenceTransformer(model_name, trust_remote_code=True)
model.name = model_name
result3 = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", ["query", "opinion"])

result3.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/70.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/95.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/547M [00:00<?, ?B/s]

  state_dict = loader(resolved_archive_file)


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

CPU times: user 2min 34s, sys: 2.6 s, total: 2min 36s
Wall time: 2min 56s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[0.005788066, 0.013440334, -0.030131439, -0.0...","[[0.038095787, -0.0008105945, -0.017515069, -0...",True,1.0,"[0, 381, 51, 303, 410]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.03487912, -0.004870002, -0.02590829, -0.01...","[[0.05366274, 0.027377544, 0.0049902108, -0.01...",True,1.0,"[1, 209, 257, 287, 83]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[0.015698612, 0.04483484, -0.026227893, -0.08...","[[0.022723714, 0.007824049, -0.006382495, -0.0...",False,0.0,"[334, 380, 382, 283, 196]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.03051981, 0.05726627, -0.023630742, -0.039...","[[0.0032444783, 0.0002700611, -0.01697724, -0....",True,1.0,"[3, 142, 257, 63, 287]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.018545283, 0.074051715, -0.012522544, -0.0...","[[0.03310299, 0.03721276, 0.0036807582, -0.031...",True,1.0,"[4, 201, 122, 62, 13]",4


In [13]:
result3.to_csv("4b.eval/QA-8192/3.result3.csv", index=False)
len(result3)

450

In [14]:
EVALS

{'Alibaba-NLP/gte-large-en-v1.5': {'hit_rate': 0.8555555555555555,
  'mrr': 0.7822592592592592},
 'nomic-ai/nomic-embed-text-v1': {'hit_rate': 0.8288888888888889,
  'mrr': 0.7622962962962962}}

# Evaluate model: BAAI/bge-m3

In [15]:
%%time

model_name = 'BAAI/bge-m3'
model = SentenceTransformer('BAAI/bge-m3', trust_remote_code=True)
model.name = model_name
result4 = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings")

result4.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

CPU times: user 6min 21s, sys: 7.63 s, total: 6min 28s
Wall time: 7min 27s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.059338793, 0.014934848, -0.00617138, -0.0...","[[-0.038305905, 0.032566477, -0.026009643, -0....",True,1.0,"[0, 381, 214, 205, 448]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[-0.037957225, -0.019329162, -0.011700455, 0....","[[-0.05405041, 0.023899905, -0.04524074, -0.00...",True,1.0,"[1, 301, 226, 15, 195]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.0322296, 0.026047228, -0.010783682, -0.02...","[[-0.048325725, 0.011996219, -0.028630974, -0....",False,0.0,"[301, 245, 243, 387, 165]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[-0.024493296, -0.042742852, -0.0035741676, 0...","[[-0.05142785, -0.0016678479, -0.016272588, -0...",True,1.0,"[3, 85, 82, 216, 330]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-0.036776025, -0.036311258, 0.006017427, 0.0...","[[-0.06273084, 0.0104499925, -0.042842526, 0.0...",True,1.0,"[4, 268, 205, 220, 195]",4


In [16]:
result4.to_csv("4b.eval/QA-8192/3.result4.csv", index=False)
len(result4)

450

In [17]:
EVALS

{'Alibaba-NLP/gte-large-en-v1.5': {'hit_rate': 0.8555555555555555,
  'mrr': 0.7822592592592592},
 'nomic-ai/nomic-embed-text-v1': {'hit_rate': 0.8288888888888889,
  'mrr': 0.7622962962962962},
 'BAAI/bge-m3': {'hit_rate': 0.7911111111111111, 'mrr': 0.7111111111111111}}