In [2]:
# Mount Google Drive so inputs and outputs under `path` can be read and written from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Project root on the mounted drive. The trailing slash matters: later cells
# build file names with f"{path}outputs/..." and f"{path}evals/...".
path = '/content/drive/MyDrive/semantic_search/'

Mounted at /content/drive


In [2]:
# Interactive Hugging Face login; the token is typed at the prompt and not stored in the notebook.
# NOTE(review): presumably needed because some checkpoints in MODELS are not public — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `FLP` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `FLP`


# Eval: Finetuned models with query-result set

This notebook outlines the steps taken to evaluate the finetuned models on the test set under the query-result setup.

# Import libraries

In [3]:
#%pip install nltk -q
#%pip install sentence_transformers -q
# NOTE(review): this installs transformers from the tip of the main branch with no
# version pin, so a re-run may pick up different code. Consider pinning to a
# released version — TODO confirm the minimum release the checkpoints in MODELS need.
%pip install git+https://github.com/huggingface/transformers.git -q

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
# Tokenizer data for sent_tokenize, which split_text_into_chunks relies on.
nltk.download('punkt_tab')

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

from sklearn.metrics.pairwise import cosine_similarity

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Global variables

In [4]:
# Metrics registry filled in by evaluate_model: {model name: {"hit_rate": ..., "mrr": ...}}.
# Re-running this cell clears every result collected so far.
EVALS = {}
# Default per-chunk token budget used by split_text_into_chunks.
MAX_TOKENS = 8192

In [5]:
# Hugging Face model ids to evaluate. The evaluation cells below select a
# checkpoint by position (MODELS[0] .. MODELS[3]), so the order matters.
MODELS = ["rachelFLP/ModernBERT-base_finetune_8192",
          "nomic-ai/modernbert-embed-base",
          "Free-Law-Project/modernbert-embed-base_finetune_8192",
          "Free-Law-Project/modernbert-embed-base_finetune_512",
          ]

# Helper functions

In [6]:
def split_text_into_chunks(
    text,
    tokenizer,
    tag,
    max_tokens=MAX_TOKENS,
    num_overlap_sentences=0,
):
    """Split ``text`` into sentence-aligned chunks that fit a token budget.

    The text is split with ``sent_tokenize`` and the sentences are packed
    greedily, in order, into chunks. Every returned chunk string starts with a
    prefix selected by ``tag``.

    Token accounting is an approximation: a chunk's size is the length of the
    encoded prefix plus the sum of the per-sentence token counts. The joined
    chunk string itself is not re-tokenized.

    Args:
        text: Raw text to split.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)`` and
            ``decode(token_ids)``.
        tag: ``"opinion"`` prepends ``"search_document: "``, ``"query"``
            prepends ``"search_query: "``; any other value adds no prefix.
        max_tokens: Token budget per chunk, including the encoded prefix.
        num_overlap_sentences: Number of trailing sentences of a finished chunk
            that are repeated at the start of the next one. Values ``<= 0``
            disable overlap.

    Returns:
        list[str]: The chunks in document order; empty when ``sent_tokenize``
        yields no sentences.
    """
    # Split the text to sentences & encode sentences with tokenizer
    sentences = sent_tokenize(text)
    encoded_sentences = [
        tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences
    ]

    if tag == "opinion":
        lead_text = "search_document: "
    elif tag == "query":
        lead_text = "search_query: "
    else:
        lead_text = ""
    # The prefix is encoded with the tokenizer's default settings, so whatever
    # extra tokens that call returns are charged against the budget as well.
    lead_len = len(tokenizer.encode(lead_text))

    chunks = []
    current_chunks: list[str] = []
    current_token_counts = lead_len

    for sentence_tokens in encoded_sentences:
        sentence_len = len(sentence_tokens)

        # The sentence alone does not fit: flush what we have, then emit the
        # sentence truncated to the remaining budget as its own chunk.
        if lead_len + sentence_len > max_tokens:
            if current_chunks:
                chunks.append(lead_text + " ".join(current_chunks))
            truncated_sentence = tokenizer.decode(
                sentence_tokens[: (max_tokens - lead_len)]
            )
            chunks.append(lead_text + truncated_sentence)

            # start a new chunk with no overlap
            current_chunks = []
            current_token_counts = lead_len
            continue

        # Adding the sentence would overflow the current chunk: close it and
        # start a new one, optionally seeded with overlap sentences.
        if current_token_counts + sentence_len > max_tokens:
            # A slice starting at -0 is the same as a slice starting at 0 and
            # would select the ENTIRE chunk, so "no overlap" must be handled
            # explicitly instead of slicing with a zero count.
            if num_overlap_sentences > 0:
                overlap_sentences = current_chunks[-num_overlap_sentences:]
            else:
                overlap_sentences = []

            if current_chunks:
                chunks.append(lead_text + " ".join(current_chunks))

            if overlap_sentences:
                overlap_len = len(
                    tokenizer.encode(
                        " ".join(overlap_sentences), add_special_tokens=False
                    )
                )
            else:
                overlap_len = 0

            # If the sentence with the overlap exceeds the limit, start a new chunk without overlap.
            if lead_len + overlap_len + sentence_len > max_tokens:
                current_chunks = [tokenizer.decode(sentence_tokens)]
                current_token_counts = lead_len + sentence_len
            else:
                current_chunks = overlap_sentences + [tokenizer.decode(sentence_tokens)]
                current_token_counts = lead_len + overlap_len + sentence_len
            continue

        # Within budget: keep growing the current chunk.
        current_chunks.append(tokenizer.decode(sentence_tokens))
        current_token_counts += sentence_len

    # store the last chunk if it has any content
    if current_chunks:
        chunks.append(lead_text + " ".join(current_chunks))
    return chunks


def create_embeddings(df, model, text_column, embedding_column, tokenizer, tag):
    """Chunk and embed every text in ``df[text_column]``.

    Each text is split with ``split_text_into_chunks`` and all of its chunks
    are encoded with ``model.encode``. The resulting array (one row per chunk)
    is stored in ``df[embedding_column]`` for that row.

    Args:
        df: DataFrame to read from and write to. It is modified in place.
        model: Object exposing ``encode(list_of_str)``.
        text_column: Name of the column holding the raw text.
        embedding_column: Name of the column to create or overwrite.
        tokenizer: Tokenizer forwarded to ``split_text_into_chunks``.
        tag: Prefix selector forwarded to ``split_text_into_chunks``.

    Returns:
        The same ``df`` object, with ``embedding_column`` filled.
    """
    # Build the column positionally in an object array. Writing through
    # ``df.at[i, ...]`` with the enumerate counter would treat ``i`` as an index
    # LABEL and misplace (or append) rows whenever the index is not 0..n-1.
    embeddings = np.empty(len(df), dtype=object)

    for pos, text in enumerate(df[text_column]):
        chunks = split_text_into_chunks(text, tokenizer, tag)  # Chunk the text
        embeddings[pos] = np.array(model.encode(chunks))  # one row per chunk

    df[embedding_column] = embeddings
    return df


def evaluate_retrieval(query_emb, doc_embs, relevant_idx, top_k=5):
    """Score one query against every document chunk and report hit / MRR.

    All chunks of all documents are pooled and ranked by cosine similarity to
    the query. Ranking is done at the chunk level, so the same document
    position can occur more than once in ``retrieved``.

    Args:
        query_emb: Query embedding; it is reshaped to a single row vector.
        doc_embs: Iterable of per-document chunk embeddings (each item is
            iterated to obtain its chunk vectors).
        relevant_idx: Position, within ``doc_embs``, of the relevant document.
        top_k: Number of top-ranked chunks to inspect.

    Returns:
        dict with ``is_hit`` (relevant document among the top-k chunks),
        ``mrr`` (1 / rank of its first chunk, or 0 when absent),
        ``retrieved`` (document positions of the top-k chunks) and
        ``expected`` (``relevant_idx``).
    """
    # Pool every chunk together with the position of the document it came from.
    owner_and_chunk = [
        (doc_pos, chunk_vec)
        for doc_pos, doc_chunks in enumerate(doc_embs)
        for chunk_vec in doc_chunks
    ]
    chunk_owner = [doc_pos for doc_pos, _ in owner_and_chunk]
    chunk_matrix = np.array([chunk_vec for _, chunk_vec in owner_and_chunk])

    # Cosine similarity of each chunk to the query, as a flat score vector.
    scores = cosine_similarity(chunk_matrix, query_emb.reshape(1, -1)).flatten()

    # Highest-scoring chunks first, mapped back to their documents.
    best_chunks = np.argsort(scores)[::-1][:top_k]
    retrieved_docs = [chunk_owner[chunk_pos] for chunk_pos in best_chunks]

    # 1-based rank of the first chunk that belongs to the relevant document.
    first_rank = next(
        (
            position
            for position, doc_pos in enumerate(retrieved_docs, start=1)
            if doc_pos == relevant_idx
        ),
        None,
    )

    if first_rank is None:
        reciprocal_rank = 0
    else:
        reciprocal_rank = 1 / first_rank

    return {
        "is_hit": first_rank is not None,
        "mrr": reciprocal_rank,
        "retrieved": retrieved_docs,
        "expected": relevant_idx,
    }


def evaluate_model(df, model, query_column, query_emb_column, opinion_column, opinion_emb_column, tokenizer, tag=("", ""), top_k=5):
    """Embed queries and opinions, run retrieval per query, and record metrics.

    Every row's query is ranked against the opinions of all rows; the opinion
    in the same row is the relevant one. Hit rate and MRR are averaged over all
    rows and stored in the module-level ``EVALS`` under ``model.name``.

    Args:
        df: DataFrame with an ``opinion_id`` column plus the query and opinion
            text columns. ``create_embeddings`` adds the embedding columns to
            this object in place.
        model: Embedding model; must have a ``name`` attribute, used as the
            ``EVALS`` key.
        query_column: Column holding the query text.
        query_emb_column: Column to store query embeddings in.
        opinion_column: Column holding the opinion text.
        opinion_emb_column: Column to store opinion embeddings in.
        tokenizer: Tokenizer forwarded to ``create_embeddings``.
        tag: Two-item sequence ``(query_tag, opinion_tag)`` forwarded to
            ``create_embeddings``.
        top_k: Number of top-ranked chunks considered per query.

    Returns:
        ``df`` left-merged on ``opinion_id`` with the per-query results
        (``is_hit``, ``mrr``, ``retrieved``, ``expected``).
    """
    # Step 1: Create embeddings for queries and opinions
    df = create_embeddings(df, model, query_column, query_emb_column, tokenizer, tag[0])
    df = create_embeddings(df, model, opinion_column, opinion_emb_column, tokenizer, tag[1])

    # Step 2: Evaluate retrieval for each query
    doc_embs = df[opinion_emb_column]  # identical for every query, so read it once
    results_dict = {}
    # evaluate_retrieval identifies documents by their POSITION in doc_embs, so
    # pass the row position rather than the index label from iterrows().
    # Results are keyed by opinion_id: a repeated id overwrites the earlier entry.
    for row_pos, (_, query_row) in enumerate(df.iterrows()):
        opinion_id = query_row["opinion_id"]
        query_emb = query_row[query_emb_column]
        results_dict[opinion_id] = evaluate_retrieval(query_emb, doc_embs, row_pos, top_k=top_k)

    # Step 3: Convert results to DataFrame
    results_df = pd.DataFrame.from_dict(results_dict, orient='index').reset_index().rename(columns={'index': 'opinion_id'})

    # Step 4: Merge results with the original DataFrame
    final_df = df.merge(results_df, how="left", on="opinion_id")

    # Step 5: Calculate evaluation metrics (Hit Rate and MRR)
    model_name = model.name
    EVALS[model_name] = {
        "hit_rate": final_df["is_hit"].mean(),
        "mrr": final_df["mrr"].mean(),
    }

    return final_df

# Load the data

In [14]:
# Read the test split and keep only the columns the evaluation uses.
test_csv = f"{path}outputs/3.test.csv"
keep_columns = ["opinion_id", "opinion_word_count", "opinion", "relevant_query_qstn"]
df = pd.read_csv(test_csv)[keep_columns]
df.head()

Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...


# Evaluate model: 0: ModernBERT-base model finetuned with opinion-relevant-irrelevant triples

In [11]:
%%time

model_name = MODELS[0]
model = SentenceTransformer(model_name)
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/82.0k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

rachelFLP/ModernBERT-base_finetune_8192
CPU times: user 5min 26s, sys: 35.6 s, total: 6min 2s
Wall time: 6min 19s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[2.5643613, 0.6627097, -1.3988396, -0.2837571...","[[1.3815454, -1.5031717, -0.5443015, 0.3904037...",True,0.333333,"[381, 303, 0, 102, 266]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[-0.033446223, -1.4263237, 0.2083233, 1.53275...","[[-0.2550205, -4.854065, -1.4656973, -0.140800...",True,0.25,"[424, 91, 340, 1, 340]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.40148145, -2.8842595, 1.0155481, -0.43562...","[[0.577203, -2.89531, 1.4912584, 0.77268374, -...",True,0.5,"[382, 2, 257, 404, 272]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.72119296, -1.1531126, 1.1255331, 0.0450535...","[[0.5596888, -0.20450169, 0.023396859, 0.34782...",False,0.0,"[432, 317, 382, 48, 184]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.3844522, -1.6228797, -1.4664176, 0.3376447...","[[0.6587521, -1.1582263, -1.911036, -0.1500582...",False,0.0,"[317, 201, 354, 64, 432]",4


In [12]:
# The file name suffix is the chunk size used for evaluation; take it from
# MAX_TOKENS instead of repeating the number.
# NOTE(review): the embedding columns hold arrays and are presumably written as
# their text representation — confirm if they need to be read back.
result.to_csv(f"{path}evals/{model.name}_{MAX_TOKENS}.csv", index=False)
len(result)

450

In [13]:
# Metrics recorded so far by evaluate_model, keyed by model name.
EVALS

{'ModernBERT-base_finetune_8192': {'hit_rate': 0.4666666666666667,
  'mrr': 0.34937037037037033}}

# Evaluate model: 1: modernbert-embed-base model

In [14]:
%%time

model_name = MODELS[1]
model = SentenceTransformer(model_name)
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer, ["query", "opinion"])

result.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/445k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

nomic-ai/modernbert-embed-base
CPU times: user 5min 17s, sys: 33.6 s, total: 5min 51s
Wall time: 5min 58s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.013583629, 0.012640299, -0.05796513, -0.0...","[[0.019464662, 0.028625114, -0.04140967, -0.00...",True,1.0,"[0, 381, 51, 448, 379]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.020469768, -0.053523075, -0.012231524, -0....","[[0.033707134, -0.02598026, -0.0010506205, -0....",True,1.0,"[1, 331, 226, 83, 195]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[0.0061971527, -0.031037176, -0.0077903317, -...","[[0.05020914, -0.0071834656, 0.037420075, -0.0...",False,0.0,"[382, 165, 32, 195, 79]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.06419095, 0.042688653, -0.0039515845, -0.0...","[[0.0555451, 0.016345548, 0.020141857, -0.0514...",True,1.0,"[3, 428, 257, 432, 343]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.0064177313, -0.028184146, -0.0533674, -0.0...","[[0.01522027, -0.012950331, -0.013770975, 0.00...",True,1.0,"[4, 62, 262, 268, 90]",4


In [15]:
# The file name suffix is the chunk size used for evaluation; take it from
# MAX_TOKENS instead of repeating the number.
result.to_csv(f"{path}evals/{model.name}_{MAX_TOKENS}.csv", index=False)
len(result)

450

In [16]:
# Metrics recorded so far by evaluate_model, keyed by model name.
EVALS

{'ModernBERT-base_finetune_8192': {'hit_rate': 0.4666666666666667,
  'mrr': 0.34937037037037033},
 'modernbert-embed-base': {'hit_rate': 0.8555555555555555,
  'mrr': 0.7876296296296297}}

# Evaluate model: 2: modernbert-embed-base model finetuned with opinion-relevant-irrelevant triples with 8192 chunk size

In [17]:
%%time

# Load the checkpoint and attach a short name; evaluate_model uses model.name as the EVALS key.
model_name = MODELS[2]
model = SentenceTransformer(model_name)
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Evaluate on a copy: evaluate_model adds embedding columns to the frame it is
# given, and the shared `df` should stay unchanged so this cell is re-runnable.
# The tags make queries start with "search_query: " and opinions with "search_document: ".
result = evaluate_model(df.copy(), model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer, ["query", "opinion"])

result.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/82.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Free-Law-Project/modernbert-embed-base_finetune_8192
CPU times: user 5min 15s, sys: 31.1 s, total: 5min 47s
Wall time: 6min 5s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.038488694, 0.025994353, -0.06339536, 0.00...","[[-0.006580794, 0.03178006, -0.039918218, 0.03...",True,1.0,"[0, 266, 51, 90, 381]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.0011685638, -0.036934253, -0.011052243, -0...","[[-0.017674824, -0.045836374, -0.0016012659, 0...",True,1.0,"[1, 331, 83, 424, 226]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.0005217498, -0.03847461, -0.022002095, -0...","[[-0.006874681, -0.025027705, 0.03674899, -0.0...",False,0.0,"[382, 165, 425, 124, 367]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.05497853, 0.054073066, -0.010794572, -0.03...","[[0.038401626, 0.01796525, 0.026556812, -0.040...",True,1.0,"[3, 432, 167, 404, 280]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-0.009887146, -0.013280726, -0.0806074, -0.0...","[[-0.019492554, 0.022011528, -0.04384884, -0.0...",True,1.0,"[4, 201, 365, 151, 139]",4


In [18]:
# The file name suffix is the chunk size used for evaluation; take it from
# MAX_TOKENS instead of repeating the number.
result.to_csv(f"{path}evals/{model.name}_{MAX_TOKENS}.csv", index=False)
len(result)

450

In [19]:
# Metrics recorded so far by evaluate_model, keyed by model name.
EVALS

{'ModernBERT-base_finetune_8192': {'hit_rate': 0.4666666666666667,
  'mrr': 0.34937037037037033},
 'modernbert-embed-base': {'hit_rate': 0.8555555555555555,
  'mrr': 0.7876296296296297},
 'modernbert-embed-base_finetune_8192': {'hit_rate': 0.8644444444444445,
  'mrr': 0.79}}

# Evaluate model: 3: modernbert-embed-base model finetuned with opinion-relevant-irrelevant triples with 512 chunk size

In [15]:
%%time

# Load the checkpoint and attach a short name; evaluate_model uses model.name as the EVALS key.
model_name = MODELS[3]
model = SentenceTransformer(model_name)
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Evaluate on a copy: evaluate_model adds embedding columns to the frame it is
# given, and the shared `df` should stay unchanged so this cell is re-runnable.
# The tags make queries start with "search_query: " and opinions with "search_document: ".
result = evaluate_model(df.copy(), model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer, ["query", "opinion"])

result.head()

Free-Law-Project/modernbert-embed-base_finetune_512
CPU times: user 5min 1s, sys: 22.4 s, total: 5min 24s
Wall time: 5min 32s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.014296135, 0.023057982, -0.048878007, 0.0...","[[0.015871974, 0.018732293, -0.024121562, 0.06...",True,1.0,"[0, 51, 266, 90, 381]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.010450049, -0.067059316, 0.002360706, 0.00...","[[0.002876464, -0.036063917, 0.015987372, 0.01...",True,1.0,"[1, 226, 83, 331, 219]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.001758455, -0.04744919, -0.021128822, -0....","[[0.0073035755, -0.02433301, 0.06404363, -2.13...",False,0.0,"[382, 165, 425, 367, 124]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.06206147, 0.039539136, -0.0043731686, -0.0...","[[0.026955968, -0.012011581, 0.02601028, -0.00...",True,1.0,"[3, 428, 300, 139, 432]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-0.0027054045, -0.031128146, -0.06615934, -0...","[[-0.0050694593, 0.02011039, -0.03187381, 0.01...",True,1.0,"[4, 151, 365, 201, 192]",4


In [16]:
# The file name suffix is the chunk size used for evaluation (MAX_TOKENS), not
# the chunk size in the model's own name; take it from the constant instead of
# repeating the number.
result.to_csv(f"{path}evals/{model.name}_{MAX_TOKENS}.csv", index=False)
len(result)

450

In [17]:
# Metrics recorded by evaluate_model, keyed by model name.
# NOTE(review): EVALS is an in-memory dict, so it only lists models evaluated
# since the cell defining it was last run in the current kernel session.
EVALS

{'modernbert-embed-base_finetune_512': {'hit_rate': 0.8711111111111111,
  'mrr': 0.789037037037037}}