# Eval: ModernBert with question-result set

This notebook outlines the steps undertook to evaluate pretrained ModernBert out of the box with the test set under a QA set-up.

With 512 chunk size.

# Import libraries

In [2]:
#%pip install nltk -q
#%pip install sentence_transformers -q
%pip install git+https://github.com/huggingface/transformers.git -q

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

from sentence_transformers import SentenceTransformer

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Global variable

In [3]:
EVALS = {}
MAX_WORDS = 512

# Helper functions

In [4]:
def split_text_into_chunks(text, tag="", max_words=MAX_WORDS, overlap_sentences=3):
    if tag == "query":
      text = "search_query: " + text
    elif tag == "opinion":
      text = "search_document: " + text
    else:
      text = text

    sentences = sent_tokenize(text)  # Tokenize into sentences
    chunks = []
    start_idx = 0  # Start index of the current chunk

    while start_idx < len(sentences):
        current_chunk = []
        current_word_count = 0
        idx = start_idx

        # Build a chunk until max_words is reached
        while idx < len(sentences) and current_word_count + len(sentences[idx].split()) <= max_words:
            current_chunk.append(sentences[idx])
            current_word_count += len(sentences[idx].split())
            idx += 1  # Move to next sentence

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        # Stop if the last chunk reaches the end of the text
        if idx >= len(sentences):
            break

        # Move start index forward but keep overlap
        start_idx = max(idx - overlap_sentences, start_idx + 1)

    return chunks


def create_embeddings(df, model, text_column, embedding_column, tag):
    df[embedding_column] = None

    for idx, text in enumerate(df[text_column]):
        chunks = split_text_into_chunks(text, tag)  # Chunk the text
        chunk_embeddings = np.array(model.encode(chunks))  # Encode all chunks (shape: [num_chunks, embedding_dim])
        df.at[idx, embedding_column] = chunk_embeddings

    return df


def evaluate_retrieval(query_emb, doc_embs, relevant_idx, top_k=5):
    # Flatten the document chunks and keep track of the document they belong to
    all_chunk_embeddings = []
    doc_chunk_mapping = []  # Mapping from chunk index to document index
    for doc_idx, doc_chunks in enumerate(doc_embs):
        for chunk_emb in doc_chunks:
            all_chunk_embeddings.append(chunk_emb)
            doc_chunk_mapping.append(doc_idx)  # Store the document index for each chunk

    # Convert to numpy array for efficient computation
    all_chunk_embeddings = np.array(all_chunk_embeddings)  # Shape: [total_chunks, 768]

    # Compute cosine similarity between query and all document chunks
    similarities = np.dot(all_chunk_embeddings, query_emb.T)  # Shape: [total_chunks,]

    # Get indices of top-k most similar chunks
    top_k_chunk_indices = np.argsort(similarities.flatten())[::-1][:top_k]

    # Find the documents corresponding to the top-k chunks
    retrieved_docs = [doc_chunk_mapping[idx] for idx in top_k_chunk_indices]

    # Count the number of times the relevant document appears in the top-k retrieved documents
    relevant_document_hits = sum(1 for doc_idx in retrieved_docs if doc_idx == relevant_idx)

    # Compute rank (based on first appearance of the relevant document)
    rank = None
    for idx, doc_idx in enumerate(retrieved_docs):
        if doc_idx == relevant_idx:
            rank = idx + 1
            break

    # Compute metrics
    is_hit = relevant_document_hits > 0  # Hit if any of the top-k chunks belong to the relevant document
    mrr = 0 if rank is None else 1 / rank

    return {
        "is_hit": is_hit,
        "mrr": mrr,
        "retrieved": retrieved_docs,
        "expected": relevant_idx
    }


def evaluate_model(df, model, query_column, query_emb_column, opinion_column, opinion_emb_column, tags=["",""], top_k=5):
    # Step 1: Create embeddings for queries and opinions
    df = create_embeddings(df, model, query_column, query_emb_column, tag=tags[0])
    df = create_embeddings(df, model, opinion_column, opinion_emb_column, tag=tags[1])

    # Step 2: Evaluate retrieval for each query
    results_dict = {}
    for idx, query_row in df.iterrows():
        opinion_id = query_row["opinion_id"]
        query_emb = query_row[query_emb_column]
        doc_embs = df[opinion_emb_column]
        results_dict[opinion_id] = evaluate_retrieval(query_emb, doc_embs, idx, top_k=top_k)

    # Step 3: Convert results to DataFrame
    results_df = pd.DataFrame.from_dict(results_dict, orient='index').reset_index().rename(columns={'index': 'opinion_id'})

    # Step 4: Merge results with the original DataFrame
    final_df = df.merge(results_df, how="left", on="opinion_id")

    # Step 5: Calculate evaluation metrics (Hit Rate and MRR)
    model_name = model.name
    EVALS[model_name] = {}
    EVALS[model_name]["hit_rate"] = final_df["is_hit"].mean()
    EVALS[model_name]["mrr"] = final_df["mrr"].mean()

    return final_df

# Load the data

In [5]:
df = pd.read_csv("outputs/3.test.csv")
df = df[["opinion_id", "opinion_word_count", "opinion", "relevant_query_stmt", "relevant_query_qstn"]]
df.head()

Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_stmt,relevant_query_qstn
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",contractual obligations in maritime law,What are the contract terms determined by the ...
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,negligent retention employee sexual harassment,What are the legal requirements for a claim of...
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,ineffective assistance of appellate counsel in...,What factors are considered in determining ine...
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,Roger Fincher's appeal of order denying rule 3...,What were the grounds for Roger Fincher's appe...
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,Appeal from a conviction for aggravated sexual...,What criteria did the Idaho Supreme Court use ...


# Evaluate model: Alibaba-NLP/gte-modernbert-base

In [6]:
%%time

model_name = "Alibaba-NLP/gte-modernbert-base"
model = SentenceTransformer(model_name)
model.name = model_name
result1 = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings")

result1.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

CPU times: user 2min 27s, sys: 3.89 s, total: 2min 31s
Wall time: 2min 55s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_stmt,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",contractual obligations in maritime law,What are the contract terms determined by the ...,"[[-0.5282118, 1.1907675, -2.2142599, -0.751800...","[[1.5250404, 0.8189017, -2.280198, -0.78443587...",True,1.0,"[0, 320, 233, 381, 231]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,negligent retention employee sexual harassment,What are the legal requirements for a claim of...,"[[1.1376549, -0.9511572, -1.1953472, 0.2055568...","[[0.4528701, -0.09215038, -2.8658695, -1.08390...",True,1.0,"[1, 1, 15, 1, 195]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,ineffective assistance of appellate counsel in...,What factors are considered in determining ine...,"[[1.0110979, 0.48207346, -1.5229329, -2.011673...","[[0.45347676, 0.7220209, 0.84967667, -1.475598...",False,0.0,"[165, 165, 124, 425, 427]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,Roger Fincher's appeal of order denying rule 3...,What were the grounds for Roger Fincher's appe...,"[[0.5669617, 0.791284, -1.6313955, -0.41851392...","[[2.082367, 1.8020911, -0.21239999, 0.03750479...",True,1.0,"[3, 296, 324, 315, 259]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,Appeal from a conviction for aggravated sexual...,What criteria did the Idaho Supreme Court use ...,"[[-0.0321525, 0.46916544, -0.8150225, -0.96585...","[[-0.31592846, 1.4662502, -0.5606873, 0.115878...",True,1.0,"[4, 4, 4, 4, 4]",4


In [7]:
result1.to_csv("4c.eval/QA-512/3.result1.csv", index=False)
len(result1)

450

In [8]:
EVALS

{'Alibaba-NLP/gte-modernbert-base': {'hit_rate': 0.8222222222222222,
  'mrr': 0.7496666666666667}}

# Evaluate model: nomic-ai/modernbert-embed-base

In [9]:
%%time

model_name = "nomic-ai/modernbert-embed-base"
model = SentenceTransformer(model_name)
model.name = model_name
result2 = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", ["query", "opinion"])

result2.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/445k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

CPU times: user 2min 24s, sys: 2.72 s, total: 2min 27s
Wall time: 2min 47s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_stmt,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",contractual obligations in maritime law,What are the contract terms determined by the ...,"[[-0.013583629, 0.012640299, -0.05796513, -0.0...","[[0.019464662, 0.028625114, -0.04140967, -0.00...",True,1.0,"[0, 381, 381, 51, 381]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,negligent retention employee sexual harassment,What are the legal requirements for a claim of...,"[[0.020469768, -0.053523075, -0.012231524, -0....","[[0.046831492, -0.0059427447, -0.0058012283, -...",True,1.0,"[1, 1, 1, 331, 331]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,ineffective assistance of appellate counsel in...,What factors are considered in determining ine...,"[[0.0061971527, -0.031037176, -0.0077903317, -...","[[0.05111113, -0.0065574097, 0.037518095, -0.0...",False,0.0,"[165, 382, 165, 165, 425]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,Roger Fincher's appeal of order denying rule 3...,What were the grounds for Roger Fincher's appe...,"[[0.06419095, 0.042688653, -0.0039515845, -0.0...","[[0.0555451, 0.016345548, 0.020141857, -0.0514...",True,1.0,"[3, 428, 55, 124, 195]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,Appeal from a conviction for aggravated sexual...,What criteria did the Idaho Supreme Court use ...,"[[0.0064177313, -0.028184146, -0.0533674, -0.0...","[[0.037381403, 0.023342576, 0.005147301, -0.00...",True,1.0,"[4, 4, 4, 4, 4]",4


In [10]:
result2.to_csv("4c.eval/QA-512/3.result2.csv", index=False)
len(result2)

450

In [11]:
EVALS

{'Alibaba-NLP/gte-modernbert-base': {'hit_rate': 0.8222222222222222,
  'mrr': 0.7496666666666667},
 'nomic-ai/modernbert-embed-base': {'hit_rate': 0.82, 'mrr': 0.764}}

# Evaluate model: lightonai/modernbert-embed-large

In [12]:
%%time

model_name = "lightonai/modernbert-embed-large"
model = SentenceTransformer(model_name)
model.name = model_name
result3 = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", ["query", "opinion"])

result3.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/227 [00:00<?, ?B/s]






README.md:   0%|          | 0.00/445k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

CPU times: user 5min 45s, sys: 4.94 s, total: 5min 50s
Wall time: 6min 29s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_stmt,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",contractual obligations in maritime law,What are the contract terms determined by the ...,"[[-0.011025174, 0.025882374, -0.017594809, -0....","[[0.019103566, 0.020169526, 0.0037484332, 0.00...",True,1.0,"[0, 389, 381, 381, 90]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,negligent retention employee sexual harassment,What are the legal requirements for a claim of...,"[[0.0696458, 0.010776641, -0.016950065, 0.0169...","[[0.07351214, -0.0015447837, 0.0025395944, -0....",True,1.0,"[1, 1, 1, 331, 331]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,ineffective assistance of appellate counsel in...,What factors are considered in determining ine...,"[[0.07218144, 0.007468114, -0.05248695, 0.0239...","[[0.057902057, 0.045159813, -0.03400408, -0.00...",False,0.0,"[165, 165, 425, 124, 382]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,Roger Fincher's appeal of order denying rule 3...,What were the grounds for Roger Fincher's appe...,"[[0.054012585, 0.024918031, 0.033004966, -0.01...","[[0.029235296, 0.042795237, 0.012074846, 0.004...",True,1.0,"[3, 428, 428, 428, 26]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,Appeal from a conviction for aggravated sexual...,What criteria did the Idaho Supreme Court use ...,"[[0.033399478, -0.013795876, 0.01056317, -0.02...","[[0.03189442, -0.014579395, 0.009217244, -0.02...",True,1.0,"[4, 4, 4, 4, 4]",4


In [13]:
result3.to_csv("4c.eval/QA-512/3.result3.csv", index=False)
len(result3)

450

In [14]:
EVALS

{'Alibaba-NLP/gte-modernbert-base': {'hit_rate': 0.8222222222222222,
  'mrr': 0.7496666666666667},
 'nomic-ai/modernbert-embed-base': {'hit_rate': 0.82, 'mrr': 0.764},
 'lightonai/modernbert-embed-large': {'hit_rate': 0.8488888888888889,
  'mrr': 0.7847777777777777}}