# Eval: Finetuned models with query-result set

This notebook outlines the steps taken to evaluate fine-tuned models on the test set under the query-result setup.

# Import libraries

In [1]:
#%pip install nltk -q
#%pip install sentence_transformers -q
#%pip install git+https://github.com/huggingface/transformers.git -q

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
import torch

# Set PyTorch's float32 matrix-multiplication precision mode to 'high' for this session.
# NOTE(review): presumably chosen to speed up encoding at slightly reduced precision —
# confirm the exact trade-off against the PyTorch documentation for this setting.
torch.set_float32_matmul_precision('high')

In [3]:
# Prefix prepended to the relative input/output paths used below
# (f"{path}outputs/..." and f"{path}evals/..."); empty means the paths are used as written.
path = ""

# Global variables

In [4]:
# Metrics registry filled by evaluate_model: {model name: {"hit_rate": ..., "mrr": ...}}.
EVALS = {}
# Default per-chunk token budget used by split_text_into_chunks.
MAX_TOKENS = 8192

In [5]:
# Model identifiers passed to SentenceTransformer(...) and AutoTokenizer.from_pretrained(...)
# in the evaluation cells below; the list position is the model number used in the section titles.
MODELS = ["rachelFLP/ModernBERT-base_finetune_8192",
          "nomic-ai/modernbert-embed-base",
          "rachelFLP/modernbert-embed-base_finetune_8192",
          ]

# Helper functions

In [6]:
def split_text_into_chunks(
    text,
    tokenizer,
    tag,
    max_tokens=MAX_TOKENS,
    num_overlap_sentences=0,
):
    """Split ``text`` into sentence-aligned chunks that stay within a token budget.

    The text is split into sentences with ``sent_tokenize``; sentences are then
    packed greedily into chunks whose token count (lead text included) does not
    exceed ``max_tokens``. A single sentence that is too long on its own is
    truncated and emitted as its own chunk.

    Args:
        text: The text to split.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)`` and
            ``decode(token_ids)``.
        tag: ``"opinion"`` prefixes every chunk with ``"search_document: "``,
            ``"query"`` with ``"search_query: "``; any other value adds no prefix.
        max_tokens: Maximum number of tokens per chunk, lead text included.
        num_overlap_sentences: Number of trailing sentences of a finished chunk
            to repeat at the start of the next one. ``0`` (or a negative value)
            means no overlap.

    Returns:
        A list of chunk strings, each starting with the lead text. Sentence text
        is rebuilt with ``tokenizer.decode``, i.e. it is the tokenizer's
        round-trip of each sentence. The list is empty when no sentence is found.
    """
    # Split the text into sentences and tokenize each one without special tokens.
    sentences = sent_tokenize(text)
    encoded_sentences = [
        tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences
    ]

    if tag == "opinion":
        lead_text = "search_document: "
    elif tag == "query":
        lead_text = "search_query: "
    else:
        lead_text = ""

    # The lead text is encoded with the tokenizer's default settings, so its length
    # also counts any special tokens the tokenizer adds by default.
    lead_len = len(tokenizer.encode(lead_text))
    overlap_count = max(0, num_overlap_sentences)

    chunks = []
    current_chunks: list[str] = []
    current_token_counts = lead_len

    for sentence_tokens in encoded_sentences:
        sentence_len = len(sentence_tokens)

        # Case 1: the sentence alone does not fit in a chunk.
        if lead_len + sentence_len > max_tokens:
            # Store whatever has been accumulated so far.
            if current_chunks:
                chunks.append(lead_text + " ".join(current_chunks))
            # Truncate the sentence and store it as its own chunk.
            truncated_sentence = tokenizer.decode(sentence_tokens[: max_tokens - lead_len])
            chunks.append(lead_text + truncated_sentence)

            # Start a fresh chunk with no overlap.
            current_chunks = []
            current_token_counts = lead_len
            continue

        # Case 2: adding the sentence would overflow the current chunk.
        if current_token_counts + sentence_len > max_tokens:
            # `current_chunks[-0:]` is the WHOLE list, so zero overlap must be
            # handled explicitly instead of relying on a negative slice.
            overlap_sentences = current_chunks[-overlap_count:] if overlap_count else []

            # Store the finished chunk.
            if current_chunks:
                chunks.append(lead_text + " ".join(current_chunks))

            if overlap_sentences:
                overlap_len = len(
                    tokenizer.encode(" ".join(overlap_sentences), add_special_tokens=False)
                )
            else:
                overlap_len = 0

            sentence_text = tokenizer.decode(sentence_tokens)
            if lead_len + overlap_len + sentence_len > max_tokens:
                # The overlap plus the new sentence would not fit: drop the overlap.
                current_chunks = [sentence_text]
                current_token_counts = lead_len + sentence_len
            else:
                current_chunks = overlap_sentences + [sentence_text]
                current_token_counts = lead_len + overlap_len + sentence_len
            continue

        # Case 3: the sentence fits; append it to the current chunk.
        current_chunks.append(tokenizer.decode(sentence_tokens))
        current_token_counts += sentence_len

    # Store the last chunk if it has any content.
    if current_chunks:
        chunks.append(lead_text + " ".join(current_chunks))
    return chunks


def create_embeddings(df, model, text_column, embedding_column, tokenizer, tag):
    """Chunk and embed every text in ``df[text_column]``.

    Args:
        df: DataFrame holding the texts. It is modified in place:
            ``embedding_column`` is (re)created on it.
        model: Object exposing ``encode(list_of_strings)``.
        text_column: Name of the column containing the texts to embed.
        embedding_column: Name of the column that receives the embeddings.
        tokenizer: Tokenizer forwarded to ``split_text_into_chunks``.
        tag: Chunk tag forwarded to ``split_text_into_chunks``.

    Returns:
        The same ``df`` object, where each cell of ``embedding_column`` holds the
        array returned by encoding that row's chunks (one row per chunk).
    """
    # Reset the target column so stale embeddings from a previous model never survive.
    df[embedding_column] = None

    # Iterate by index *label*: `df.at` is label-based, so a positional counter would
    # write to the wrong rows (or append new ones) whenever the index is not 0..n-1.
    for label, text in df[text_column].items():
        chunks = split_text_into_chunks(text, tokenizer, tag)  # Chunk the text
        chunk_embeddings = np.array(model.encode(chunks))  # One embedding per chunk
        df.at[label, embedding_column] = chunk_embeddings

    return df


def evaluate_retrieval(query_emb, doc_embs, relevant_idx, top_k=5):
    """Rank document chunks against a query and score the retrieval.

    Every chunk of every document is scored with the dot product between the
    chunk embedding and the query embedding. (This equals cosine similarity only
    when the embeddings are unit-normalized.) When the query itself has several
    chunks, a document chunk's score is its best score over all query chunks.

    Args:
        query_emb: Query embedding, either one vector ``[dim]`` or one row per
            query chunk ``[num_query_chunks, dim]``.
        doc_embs: Iterable of per-document arrays, each ``[num_chunks, dim]``.
        relevant_idx: Position in ``doc_embs`` of the document that should be
            retrieved for this query.
        top_k: Number of top-scoring *chunks* to keep.

    Returns:
        dict with:
            ``is_hit``: True if any of the top-k chunks belongs to the relevant document.
            ``mrr``: Reciprocal rank of the first relevant chunk within the top-k, else 0.
            ``retrieved``: Document position of each top-k chunk, best first
                (a document can appear more than once).
            ``expected``: ``relevant_idx``, echoed back.
    """
    # Flatten the document chunks and remember which document each one came from.
    all_chunk_embeddings = []
    doc_chunk_mapping = []  # chunk position -> document position
    for doc_idx, doc_chunks in enumerate(doc_embs):
        for chunk_emb in doc_chunks:
            all_chunk_embeddings.append(chunk_emb)
            doc_chunk_mapping.append(doc_idx)

    all_chunk_embeddings = np.array(all_chunk_embeddings)  # Shape: [total_chunks, dim]

    # Dot-product similarity between every document chunk and every query chunk.
    query_matrix = np.atleast_2d(query_emb)  # Shape: [num_query_chunks, dim]
    similarities = np.dot(all_chunk_embeddings, query_matrix.T)  # [total_chunks, num_query_chunks]

    # Collapse the query-chunk axis so there is exactly one score per document chunk.
    # Flattening the 2-D matrix instead would yield indices that no longer line up
    # with `doc_chunk_mapping` as soon as the query has more than one chunk.
    chunk_scores = similarities.max(axis=1)  # Shape: [total_chunks]

    # Indices of the top-k highest-scoring chunks, best first.
    top_k_chunk_indices = np.argsort(chunk_scores)[::-1][:top_k]

    # Documents that own the top-k chunks.
    retrieved_docs = [doc_chunk_mapping[idx] for idx in top_k_chunk_indices]

    # 1-based rank of the first chunk that belongs to the relevant document.
    rank = next(
        (
            position
            for position, doc_idx in enumerate(retrieved_docs, start=1)
            if doc_idx == relevant_idx
        ),
        None,
    )

    is_hit = rank is not None
    mrr = 0.0 if rank is None else 1.0 / rank

    return {
        "is_hit": is_hit,
        "mrr": mrr,
        "retrieved": retrieved_docs,
        "expected": relevant_idx
    }


def evaluate_model(
    df,
    model,
    query_column,
    query_emb_column,
    opinion_column,
    opinion_emb_column,
    tokenizer,
    tag=("", ""),
    top_k=5,
):
    """Embed queries and opinions with ``model`` and score query -> opinion retrieval.

    Row ``i``'s query is expected to retrieve row ``i``'s opinion; every opinion
    in ``df`` is part of the candidate pool.

    Args:
        df: DataFrame with the query and opinion texts. The two embedding
            columns are added to it in place by ``create_embeddings``.
        model: Embedding model; must expose ``encode`` and a ``name`` attribute,
            which is used as the key in the global ``EVALS`` registry.
        query_column: Column holding the query text.
        query_emb_column: Column that receives the query embeddings.
        opinion_column: Column holding the opinion text.
        opinion_emb_column: Column that receives the opinion embeddings.
        tokenizer: Tokenizer used for chunking.
        tag: Two-item sequence ``(query_tag, opinion_tag)`` forwarded to
            ``create_embeddings``.
        top_k: Number of top-scoring chunks considered per query.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with the columns of ``df`` followed
        by ``is_hit``, ``mrr``, ``retrieved`` and ``expected``.

    Side effects:
        Stores the mean ``hit_rate`` and ``mrr`` under ``EVALS[model.name]``.
    """
    # Step 1: Create embeddings for queries and opinions
    df = create_embeddings(df, model, query_column, query_emb_column, tokenizer, tag[0])
    df = create_embeddings(df, model, opinion_column, opinion_emb_column, tokenizer, tag[1])

    # Step 2: Evaluate retrieval for each query.
    # evaluate_retrieval numbers documents by position, so the relevant document must be
    # the row's *position*, not its index label (the two differ on a non-default index).
    doc_embs = df[opinion_emb_column]
    results = [
        evaluate_retrieval(query_emb, doc_embs, position, top_k=top_k)
        for position, query_emb in enumerate(df[query_emb_column])
    ]

    # Step 3: Convert results to a DataFrame, one row per query in the same order as df
    results_df = pd.DataFrame(results, columns=["is_hit", "mrr", "retrieved", "expected"])

    # Step 4: Attach results row by row. Keying them by opinion_id would collapse rows
    # that share an id onto a single result.
    final_df = pd.concat([df.reset_index(drop=True), results_df], axis=1)

    # Step 5: Calculate evaluation metrics (Hit Rate and MRR)
    model_name = model.name
    EVALS[model_name] = {
        "hit_rate": final_df["is_hit"].mean(),
        "mrr": final_df["mrr"].mean(),
    }

    return final_df

# Load the data

In [7]:
# Load the test split and keep only the columns referenced in this notebook.
df = pd.read_csv(f"{path}outputs/3.test.csv")
df = df[["opinion_id", "opinion_word_count", "opinion", "relevant_query_qstn"]]
df.head()

Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...


# Evaluate model: 0: ModernBERT-base model fine-tuned with opinion-relevant-irrelevant triples

In [8]:
%%time

# Load model 0 and its tokenizer.
model_name = MODELS[0]
model = SentenceTransformer(model_name)
# evaluate_model reads `model.name` to key the EVALS entry; use the repo name without the owner.
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# No tags are passed here, so the default empty tags apply and chunks get no lead-text prefix.
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

rachelFLP/ModernBERT-base_finetune_8192
CPU times: user 4min 20s, sys: 39.8 s, total: 4min 59s
Wall time: 2min 25s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[2.5638871, 0.6627561, -1.3971672, -0.2838986...","[[1.3819762, -1.504059, -0.544046, 0.39151987,...",True,0.333333,"[381, 303, 0, 102, 180]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[-0.033046007, -1.4266969, 0.20892495, 1.5320...","[[-0.25372165, -4.8524227, -1.4646095, -0.1410...",True,0.5,"[91, 1, 409, 424, 340]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.40223733, -2.8828785, 1.0157149, -0.43669...","[[0.5774427, -2.8944128, 1.4891303, 0.7703127,...",True,0.5,"[382, 2, 272, 86, 404]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.7203469, -1.1517618, 1.1245394, 0.04372296...","[[0.5601711, -0.20293824, 0.02383344, 0.346193...",False,0.0,"[432, 382, 317, 66, 365]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.3845926, -1.6225652, -1.4651481, 0.3367595...","[[0.65774643, -1.156528, -1.9094923, -0.151209...",False,0.0,"[139, 432, 365, 393, 317]",4


In [9]:
# Save the per-query results for this model, then show the number of evaluated rows.
# NOTE(review): the embedding columns hold NumPy arrays, which presumably end up as
# their string representation in the CSV — confirm whether they should be exported.
result.to_csv(f"{path}evals/{model.name}_8192.csv", index=False)
len(result)

450

In [10]:
# Display the metrics recorded so far.
EVALS

{'ModernBERT-base_finetune_8192': {'hit_rate': np.float64(0.43777777777777777),
  'mrr': np.float64(0.32307407407407407)}}

# Evaluate model: 1: modernbert-embed-base model

In [11]:
%%time

# Load model 1 and its tokenizer.
model_name = MODELS[1]
model = SentenceTransformer(model_name)
# evaluate_model reads `model.name` to key the EVALS entry; use the repo name without the owner.
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tags make split_text_into_chunks prefix query chunks with "search_query: "
# and opinion chunks with "search_document: ".
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer, ["query", "opinion"])

result.head()

nomic-ai/modernbert-embed-base
CPU times: user 4min 15s, sys: 37.8 s, total: 4min 52s
Wall time: 2min 18s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.013560965, 0.012559527, -0.057989158, -0....","[[0.019503355, 0.028625635, -0.04142782, -0.00...",True,1.0,"[0, 381, 51, 448, 379]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.02048242, -0.053533725, -0.012215352, -0.0...","[[0.03369947, -0.025986608, -0.0010567623, -0....",True,1.0,"[1, 331, 226, 83, 195]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[0.0061701285, -0.031028835, -0.00782321, -0....","[[0.05021046, -0.0071698213, 0.03739213, -0.01...",False,0.0,"[382, 165, 32, 195, 79]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.06422608, 0.042672083, -0.004002746, -0.02...","[[0.0555354, 0.016302917, 0.020129483, -0.0514...",True,1.0,"[3, 428, 257, 432, 343]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.0064094584, -0.028253581, -0.05337392, -0....","[[0.015205741, -0.013018959, -0.0137801925, 0....",True,1.0,"[4, 62, 262, 268, 90]",4


In [12]:
# Save the per-query results for this model, then show the number of evaluated rows.
# NOTE(review): the embedding columns hold NumPy arrays, which presumably end up as
# their string representation in the CSV — confirm whether they should be exported.
result.to_csv(f"{path}evals/{model.name}_8192.csv", index=False)
len(result)

450

In [13]:
# Display the metrics recorded so far.
EVALS

{'ModernBERT-base_finetune_8192': {'hit_rate': np.float64(0.43777777777777777),
  'mrr': np.float64(0.32307407407407407)},
 'modernbert-embed-base': {'hit_rate': np.float64(0.8555555555555555),
  'mrr': np.float64(0.7855925925925925)}}

# Evaluate model: 2: modernbert-embed-base model finetuned with opinion-relevant-irrelevant triples

In [14]:
%%time

# Load model 2 and its tokenizer.
model_name = MODELS[2]
model = SentenceTransformer(model_name)
# evaluate_model reads `model.name` to key the EVALS entry; use the repo name without the owner.
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tags make split_text_into_chunks prefix query chunks with "search_query: "
# and opinion chunks with "search_document: ".
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer, ["query", "opinion"])

result.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/82.0k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

rachelFLP/modernbert-embed-base_finetune_8192
CPU times: user 4min 14s, sys: 39.2 s, total: 4min 54s
Wall time: 2min 34s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.038522094, 0.026001949, -0.063386984, 0.0...","[[-0.006620476, 0.031802226, -0.03996083, 0.03...",True,1.0,"[0, 266, 51, 90, 381]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.0011403061, -0.036904346, -0.011095194, -0...","[[-0.017700361, -0.045799013, -0.0016478369, 0...",True,1.0,"[1, 331, 83, 424, 226]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.00057109166, -0.03841867, -0.021977901, -...","[[-0.0069042654, -0.024969574, 0.036690928, -0...",False,0.0,"[382, 165, 425, 124, 367]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.055043858, 0.05408281, -0.010780606, -0.03...","[[0.038395446, 0.018035743, 0.026552506, -0.04...",True,1.0,"[3, 432, 167, 404, 280]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-0.009929182, -0.013301407, -0.080606245, -0...","[[-0.019526195, 0.022043526, -0.043910883, -0....",True,1.0,"[4, 201, 365, 151, 139]",4


In [15]:
# Save the per-query results for this model, then show the number of evaluated rows.
# NOTE(review): the embedding columns hold NumPy arrays, which presumably end up as
# their string representation in the CSV — confirm whether they should be exported.
result.to_csv(f"{path}evals/{model.name}_8192.csv", index=False)
len(result)

450

In [16]:
# Display the metrics recorded for all three models.
EVALS

{'ModernBERT-base_finetune_8192': {'hit_rate': np.float64(0.43777777777777777),
  'mrr': np.float64(0.32307407407407407)},
 'modernbert-embed-base': {'hit_rate': np.float64(0.8555555555555555),
  'mrr': np.float64(0.7855925925925925)},
 'modernbert-embed-base_finetune_8192': {'hit_rate': np.float64(0.8644444444444445),
  'mrr': np.float64(0.7885185185185185)}}