In [1]:
# Mount Google Drive so the notebook can read its inputs and write its outputs there.
from google.colab import drive
drive.mount('/content/drive')

# Base folder (with trailing slash); later cells prefix it to the data and eval file paths.
path = '/content/drive/MyDrive/semantic_search/'

Mounted at /content/drive


In [2]:
# Log in to the Hugging Face Hub; the CLI prompts interactively for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `FLP` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `FLP`


# Eval: Finetuned models with query-result set

This notebook outlines the steps taken to evaluate fine-tuned models on the test set under the query-result setup.

# Import libraries

In [3]:
#%pip install nltk -q
#%pip install sentence_transformers -q
%pip install git+https://github.com/huggingface/transformers.git -q

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

from sklearn.metrics.pairwise import cosine_similarity

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Global variables

In [4]:
# Per-model metrics, filled in by evaluate_model as {model name: {"hit_rate": ..., "mrr": ...}}.
EVALS = {}
# Default per-chunk token budget used by split_text_into_chunks.
MAX_TOKENS = 512

In [5]:
# Hugging Face model IDs to evaluate; the evaluation cells select one by index (MODELS[i]).
MODELS = ["rachelFLP/bert-base-cased_triples",
          "rachelFLP/roberta-base_triples",
          "rachelFLP/mpnet-base_triples",
          "rachelFLP/ModernBERT-base_triples",
          "rachelFLP/kl3m-doc-pico-001_triples",
          "rachelFLP/kl3m-doc-nano-001_triples",
          "rachelFLP/kl3m-doc-micro-uncased-001_triples",
          "rachelFLP/kl3m-doc-small-uncased-001_triples",
          "nomic-ai/modernbert-embed-base",
          "Free-Law-Project/modernbert-embed-base_finetune_512",
          ]

# Helper functions

In [10]:
def split_text_into_chunks(
    text,
    tokenizer,
    tag,
    max_tokens=MAX_TOKENS,
    num_overlap_sentences=0,
):
    """Split ``text`` into sentence-aligned chunks that fit within a token budget.

    The text is split into sentences with ``sent_tokenize`` and the sentences
    are packed greedily, in order, into chunks. A chunk is closed as soon as
    adding the next sentence would push its token count (lead text included)
    above ``max_tokens``. A sentence that cannot fit even on its own is
    truncated to the budget and emitted as a standalone chunk.

    Args:
        text: Raw text to split.
        tokenizer: Object exposing ``encode`` and ``decode``
            (e.g. a Hugging Face tokenizer).
        tag: ``"opinion"`` prefixes every chunk with ``"search_document: "``,
            ``"query"`` with ``"search_query: "``; any other value adds no
            prefix.
        max_tokens: Maximum number of tokens allowed per chunk.
        num_overlap_sentences: Number of trailing sentences of the previous
            chunk repeated at the start of the next one. Values ``<= 0``
            disable overlap.

    Returns:
        list[str]: The chunks, each starting with the lead text. Sentences are
        rebuilt with ``tokenizer.decode`` and joined with single spaces, so a
        chunk is not guaranteed to be character-identical to the input text.
    """
    # Split the text into sentences and tokenize each one without special tokens
    sentences = sent_tokenize(text)
    encoded_sentences = [
        tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences
    ]

    if tag == "opinion":
        lead_text = "search_document: "
    elif tag == "query":
        lead_text = "search_query: "
    else:
        lead_text = ""

    # encode() runs with its default arguments here, so any special tokens the
    # tokenizer adds by default are counted in lead_len as well.
    # NOTE(review): presumably this reserves room for the special tokens added
    # when the model later encodes the chunk -- confirm.
    lead_tokens = tokenizer.encode(lead_text)
    lead_len = len(lead_tokens)

    chunks = []
    current_chunks: list[str] = []
    current_token_counts = lead_len

    for sentence_tokens in encoded_sentences:
        sentence_len = len(sentence_tokens)

        # The sentence on its own (plus the lead text) is already over budget
        if lead_len + sentence_len > max_tokens:
            # store the chunk built so far
            if current_chunks:
                chunks.append(lead_text + " ".join(current_chunks))
            # truncate the sentence and store it as its own chunk
            truncated_sentence = tokenizer.decode(
                sentence_tokens[: (max_tokens - lead_len)]
            )
            chunks.append(lead_text + truncated_sentence)

            # start a fresh chunk with no overlap
            current_chunks = []
            current_token_counts = lead_len
            continue

        # Adding the sentence would push the current chunk over budget
        if current_token_counts + sentence_len > max_tokens:
            # Sentences to carry over into the next chunk. The explicit guard
            # matters: a bare ``current_chunks[-0:]`` slice returns the WHOLE
            # list, which would treat the entire previous chunk as overlap.
            if num_overlap_sentences > 0:
                overlap_sentences = current_chunks[-num_overlap_sentences:]
            else:
                overlap_sentences = []

            # store the chunk built so far
            if current_chunks:
                chunks.append(lead_text + " ".join(current_chunks))

            # Token cost of the carried-over sentences (nothing to encode when
            # overlap is disabled)
            if overlap_sentences:
                overlap_len = len(
                    tokenizer.encode(" ".join(overlap_sentences), add_special_tokens=False)
                )
            else:
                overlap_len = 0

            # If overlap + sentence exceeds the budget, start the new chunk
            # without overlap.
            if lead_len + overlap_len + sentence_len > max_tokens:
                current_chunks = [tokenizer.decode(sentence_tokens)]
                current_token_counts = lead_len + sentence_len
            else:
                current_chunks = overlap_sentences + [tokenizer.decode(sentence_tokens)]
                current_token_counts = lead_len + overlap_len + sentence_len
            continue

        # Within budget: keep growing the current chunk
        current_chunks.append(tokenizer.decode(sentence_tokens))
        current_token_counts += sentence_len

    # store the last chunk if it has any content
    if current_chunks:
        chunks.append(lead_text + " ".join(current_chunks))
    return chunks


def create_embeddings(df, model, text_column, embedding_column, tokenizer, tag):
    """Chunk and embed every text in ``df[text_column]``, storing the result in ``df``.

    Each text is split with ``split_text_into_chunks`` and all of its chunks
    are passed to ``model.encode`` in one call. The resulting array (one row
    per chunk) is stored in the row's ``embedding_column`` cell.

    Args:
        df: DataFrame holding the texts. It is modified in place: the
            ``embedding_column`` is created or overwritten.
        model: Object exposing ``encode(list_of_str)``.
        text_column: Name of the column containing the raw texts.
        embedding_column: Name of the column that receives the embeddings.
        tokenizer: Tokenizer forwarded to ``split_text_into_chunks``.
        tag: Tag forwarded to ``split_text_into_chunks`` (selects the prefix
            prepended to each chunk).

    Returns:
        The same ``df`` object, with ``embedding_column`` filled in.
    """
    # Initialise with None so the column has object dtype and each cell can
    # hold a whole array.
    df[embedding_column] = None

    # Iterate with the frame's own index labels: ``df.at`` is label-based, so a
    # positional counter would write to the wrong rows (or silently append new
    # ones) whenever the index is not the default 0..n-1 range.
    for label, text in df[text_column].items():
        chunks = split_text_into_chunks(text, tokenizer, tag)  # Chunk the text
        chunk_embeddings = np.array(model.encode(chunks))  # One row per chunk
        df.at[label, embedding_column] = chunk_embeddings

    return df


def evaluate_retrieval(query_emb, doc_embs, relevant_idx, top_k=5):
    """Rank all document chunks against one query and score the retrieval.

    Every chunk of every document is compared with the query by cosine
    similarity, and the ``top_k`` most similar chunks are kept. Ranking is done
    at chunk level, so the same document can appear several times in the
    result and the reciprocal rank is based on chunk position.

    Args:
        query_emb: Query embedding, either a 1-D vector or a 2-D array with one
            row per query chunk. With several query chunks, a document chunk is
            scored by its best match across them.
        doc_embs: Iterable of per-document chunk-embedding arrays; a document's
            index is its position in this iterable.
        relevant_idx: Position in ``doc_embs`` of the document that should be
            retrieved for this query.
        top_k: Number of top-ranked chunks to keep.

    Returns:
        dict with keys:
            ``is_hit``: True if any of the top-k chunks belongs to the
                relevant document.
            ``mrr``: ``1 / rank`` of the first top-k chunk from the relevant
                document, or 0 if there is none.
            ``retrieved``: Document index of each top-k chunk, best first.
            ``expected``: ``relevant_idx``, echoed back.
    """
    # Flatten the document chunks and remember which document each came from
    all_chunk_embeddings = []
    doc_chunk_mapping = []  # chunk position -> document position
    for doc_idx, doc_chunks in enumerate(doc_embs):
        for chunk_emb in doc_chunks:
            all_chunk_embeddings.append(chunk_emb)
            doc_chunk_mapping.append(doc_idx)

    # Shape: [total_chunks, embedding_dim]
    all_chunk_embeddings = np.array(all_chunk_embeddings)

    # Keep one row per query chunk. Flattening a multi-chunk query into a
    # single row would produce a vector whose length no longer matches the
    # document embeddings.
    query_matrix = np.atleast_2d(query_emb)  # Shape: [query_chunks, embedding_dim]

    # Similarity of every document chunk to every query chunk, reduced to the
    # best match per document chunk. Shape after max: [total_chunks]
    similarities = cosine_similarity(all_chunk_embeddings, query_matrix).max(axis=1)

    # Indices of the top-k most similar chunks, best first
    top_k_chunk_indices = np.argsort(similarities)[::-1][:top_k]

    # Documents the top-k chunks belong to (may contain repeats)
    retrieved_docs = [doc_chunk_mapping[idx] for idx in top_k_chunk_indices]

    # 1-based rank of the first chunk belonging to the relevant document
    rank = next(
        (pos + 1 for pos, doc_idx in enumerate(retrieved_docs) if doc_idx == relevant_idx),
        None,
    )

    # Compute metrics
    is_hit = rank is not None
    mrr = 0 if rank is None else 1 / rank

    return {
        "is_hit": is_hit,
        "mrr": mrr,
        "retrieved": retrieved_docs,
        "expected": relevant_idx
    }


def evaluate_model(df, model, query_column, query_emb_column, opinion_column, opinion_emb_column, tokenizer, tag=("", ""), top_k=5):
    """Embed queries and opinions with ``model`` and score query-to-opinion retrieval.

    Row ``i``'s query is expected to retrieve row ``i``'s opinion. Embeddings
    are created for both columns, each query is evaluated against the chunks of
    all opinions, and the mean hit rate and MRR are recorded in the global
    ``EVALS`` dict under ``model.name``.

    Args:
        df: DataFrame with one query/opinion pair per row. It is modified in
            place: the two embedding columns are created or overwritten.
        model: Embedding model exposing ``encode``; it must also carry a
            ``name`` attribute, which is used as the key in ``EVALS``.
        query_column: Column holding the query texts.
        query_emb_column: Column that receives the query embeddings.
        opinion_column: Column holding the opinion texts.
        opinion_emb_column: Column that receives the opinion embeddings.
        tokenizer: Tokenizer used for chunking.
        tag: Pair ``(query_tag, opinion_tag)`` forwarded to the chunker to
            select the prefix added to each chunk.
        top_k: Number of top-ranked chunks considered per query.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with the columns of ``df``
        followed by ``is_hit``, ``mrr``, ``retrieved`` and ``expected``.
    """
    # Read the name up front so a model without a ``name`` attribute fails
    # before the expensive embedding work rather than after it.
    model_name = model.name

    # Step 1: Create embeddings for queries and opinions
    df = create_embeddings(df, model, query_column, query_emb_column, tokenizer, tag[0])
    df = create_embeddings(df, model, opinion_column, opinion_emb_column, tokenizer, tag[1])

    # Step 2: Evaluate retrieval for each query.
    # evaluate_retrieval identifies documents by their position in doc_embs, so
    # the expected document is the row's position, not its index label.
    doc_embs = df[opinion_emb_column]
    records = [
        evaluate_retrieval(query_emb, doc_embs, position, top_k=top_k)
        for position, query_emb in enumerate(df[query_emb_column])
    ]

    # Step 3: Attach the results row by row. Aligning by position (rather than
    # merging on an id column) keeps every row's own result even if ids repeat.
    results_df = pd.DataFrame(records)
    final_df = pd.concat([df.reset_index(drop=True), results_df], axis=1)

    # Step 4: Record the evaluation metrics (Hit Rate and MRR)
    EVALS[model_name] = {}
    EVALS[model_name]["hit_rate"] = final_df["is_hit"].mean()
    EVALS[model_name]["mrr"] = final_df["mrr"].mean()

    return final_df

# Load the data

In [7]:
# Read the test set from Drive.
df = pd.read_csv(f"{path}outputs/3.test.csv")
# Keep only the id, word count, opinion text and its paired query.
df = df[["opinion_id", "opinion_word_count", "opinion", "relevant_query_qstn"]]
df.head()

Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...


# Evaluate model: 0: bert-base-cased model finetuned with opinion-relevant-irrelevant triples

In [11]:
%%time

# Load the bert-base-cased checkpoint (MODELS[0]).
model_name = MODELS[0]
model = SentenceTransformer(model_name)
# evaluate_model reads model.name to key its metrics in EVALS; use the repo name without the owner prefix.
model.name = model_name.split("/")[-1]
print(model_name)

# Tokenizer from the same checkpoint, used to count tokens while chunking.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# `tag` is left at its default, so no "search_query: " / "search_document: " prefix is added.
# Note: the embedding columns are written into the shared `df` in place.
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

rachelFLP/bert-base-cased_triples


Token indices sequence length is longer than the specified maximum sequence length for this model (925 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 1min 37s, sys: 3.96 s, total: 1min 41s
Wall time: 1min 47s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.5604951, -0.5682717, -0.42372972, -0.1005...","[[-0.2815812, -0.5426244, -0.8428008, 0.101225...",True,1.0,"[0, 268, 268, 320, 51]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[-0.066489436, 0.12344476, -0.030956944, 0.40...","[[-0.6926346, -0.109642476, -0.38645566, 0.463...",True,1.0,"[1, 1, 1, 150, 1]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[0.17396116, 0.09632499, 0.30900046, 0.424562...","[[-0.57039654, 0.03223448, -0.16127081, -0.114...",False,0.0,"[124, 165, 425, 95, 124]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.26786512, 0.37381008, 0.41785666, 0.524606...","[[-0.37491792, -0.00761722, 0.30044338, 0.4604...",False,0.0,"[428, 167, 55, 48, 97]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-0.52504593, -0.19737332, -0.44039422, 0.008...","[[-0.58656806, -0.17022498, -0.8783647, 0.2986...",True,1.0,"[4, 439, 4, 398, 398]",4


In [12]:
# Save the per-query results under evals/, in a file named after the model.
# NOTE(review): the embedding columns hold NumPy arrays, which to_csv writes as text -- confirm they are not meant to be reloaded from this file.
result.to_csv(f"{path}evals/{model.name}.csv", index=False)
# Number of evaluated rows, shown as a sanity check.
len(result)

450

In [13]:
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852}}

# Evaluate model: 1: roberta-base model finetuned with opinion-relevant-irrelevant triples

In [14]:
%%time

# Load the roberta-base checkpoint (MODELS[1]).
model_name = MODELS[1]
model = SentenceTransformer(model_name)
# evaluate_model reads model.name to key its metrics in EVALS; use the repo name without the owner prefix.
model.name = model_name.split("/")[-1]
print(model_name)

# Tokenizer from the same checkpoint, used to count tokens while chunking.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# `tag` is left at its default, so no "search_query: " / "search_document: " prefix is added.
# Note: the embedding columns in the shared `df` are overwritten in place.
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

rachelFLP/roberta-base_triples


Token indices sequence length is longer than the specified maximum sequence length for this model (908 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 1min 33s, sys: 4.66 s, total: 1min 38s
Wall time: 1min 41s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[0.12235043, 0.7991967, -0.4387804, -0.095118...","[[0.45701382, -0.20729996, -0.20694448, -0.890...",False,0.0,"[269, 125, 320, 445, 381]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[-0.071065485, -0.38229874, -0.6681603, 0.640...","[[-0.15423554, -0.32980108, -0.094444975, -0.0...",True,1.0,"[1, 1, 1, 112, 299]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[0.025995037, 0.9598399, -0.70388955, -0.6584...","[[0.3146456, 0.48082814, -0.81156856, -0.54083...",False,0.0,"[165, 382, 165, 124, 36]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.46136728, 0.6456394, 0.113504134, -0.30883...","[[-0.07286872, 0.7367974, -0.10181502, 0.09481...",False,0.0,"[144, 280, 172, 428, 449]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-0.02621448, 0.11229827, -0.3656495, 0.87926...","[[-0.103557736, -0.24535678, -0.38227755, -0.2...",True,1.0,"[4, 4, 234, 4, 4]",4


In [15]:
# Save the per-query results under evals/, in a file named after the model.
# NOTE(review): the embedding columns hold NumPy arrays, which to_csv writes as text -- confirm they are not meant to be reloaded from this file.
result.to_csv(f"{path}evals/{model.name}.csv", index=False)
# Number of evaluated rows, shown as a sanity check.
len(result)

450

In [16]:
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852},
 'roberta-base_triples': {'hit_rate': 0.6911111111111111,
  'mrr': 0.5718148148148147}}

# Evaluate model: 2: mpnet-base model finetuned with opinion-relevant-irrelevant triples

In [17]:
%%time

# Load the mpnet-base checkpoint (MODELS[2]).
model_name = MODELS[2]
model = SentenceTransformer(model_name)
# evaluate_model reads model.name to key its metrics in EVALS; use the repo name without the owner prefix.
model.name = model_name.split("/")[-1]
print(model_name)

# Tokenizer from the same checkpoint, used to count tokens while chunking.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# `tag` is left at its default, so no "search_query: " / "search_document: " prefix is added.
# Note: the embedding columns in the shared `df` are overwritten in place.
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/962 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

rachelFLP/mpnet-base_triples


Token indices sequence length is longer than the specified maximum sequence length for this model (909 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 1min 49s, sys: 4.58 s, total: 1min 54s
Wall time: 1min 54s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[0.20061429, -0.17292652, 0.08147385, -0.0936...","[[0.027735796, 0.11984342, 0.033879086, 0.0280...",True,1.0,"[0, 444, 320, 445, 303]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[-0.035472825, -0.04983096, 0.0708093, -0.020...","[[-0.06607168, -0.119325526, 0.08705791, -0.08...",True,1.0,"[1, 1, 1, 1, 185]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.04667266, -0.19336884, -0.03506855, -0.15...","[[0.02363703, 0.019902337, -0.0021922544, -0.0...",False,0.0,"[165, 165, 425, 124, 124]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[-0.111113615, 0.1342793, 0.040131383, 0.0755...","[[-0.13399151, -0.1597369, -0.08819693, -0.002...",False,0.0,"[48, 167, 427, 428, 432]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-0.0081493, 0.014485013, 0.009500303, -0.093...","[[-0.038070887, -0.027354574, 0.037183948, 0.1...",True,0.5,"[151, 4, 347, 33, 4]",4


In [18]:
# Save the per-query results under evals/, in a file named after the model.
# NOTE(review): the embedding columns hold NumPy arrays, which to_csv writes as text -- confirm they are not meant to be reloaded from this file.
result.to_csv(f"{path}evals/{model.name}.csv", index=False)
# Number of evaluated rows, shown as a sanity check.
len(result)

450

In [19]:
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852},
 'roberta-base_triples': {'hit_rate': 0.6911111111111111,
  'mrr': 0.5718148148148147},
 'mpnet-base_triples': {'hit_rate': 0.6977777777777778,
  'mrr': 0.5956666666666667}}

# Evaluate model: 3: ModernBERT-base model finetuned with opinion-relevant-irrelevant triples

In [20]:
%%time

# Load the ModernBERT-base checkpoint (MODELS[3]).
model_name = MODELS[3]
model = SentenceTransformer(model_name)
# evaluate_model reads model.name to key its metrics in EVALS; use the repo name without the owner prefix.
model.name = model_name.split("/")[-1]
print(model_name)

# Tokenizer from the same checkpoint, used to count tokens while chunking.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# `tag` is left at its default, so no "search_query: " / "search_document: " prefix is added.
# Note: the embedding columns in the shared `df` are overwritten in place.
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

rachelFLP/ModernBERT-base_triples
CPU times: user 2min 21s, sys: 5.66 s, total: 2min 26s
Wall time: 2min 32s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[1.5839342, 0.38030672, -0.71274525, 0.506403...","[[0.88351125, -0.7039482, -0.79185677, 1.02719...",True,1.0,"[0, 381, 320, 320, 90]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[-0.39053506, -2.967422, -0.22246912, 0.07459...","[[-0.1550501, -3.4681559, -1.182842, -0.380254...",True,1.0,"[1, 1, 1, 129, 349]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.39034864, -3.0814908, 0.04091424, -0.3502...","[[0.4701424, -2.3404322, 1.5179944, 1.1516578,...",False,0.0,"[124, 165, 165, 124, 124]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.5225428, -0.9102471, 0.46725947, -0.119273...","[[0.94450945, -0.04835993, 0.8498561, 0.472619...",False,0.0,"[292, 428, 404, 55, 48]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.19328958, -0.45519474, -0.876201, 1.066505...","[[-0.24261001, 0.5478232, -0.66721654, 0.44947...",True,1.0,"[4, 377, 427, 71, 345]",4


In [21]:
# Save the per-query results under evals/, in a file named after the model.
# NOTE(review): the embedding columns hold NumPy arrays, which to_csv writes as text -- confirm they are not meant to be reloaded from this file.
result.to_csv(f"{path}evals/{model.name}.csv", index=False)
# Number of evaluated rows, shown as a sanity check.
len(result)

450

In [22]:
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852},
 'roberta-base_triples': {'hit_rate': 0.6911111111111111,
  'mrr': 0.5718148148148147},
 'mpnet-base_triples': {'hit_rate': 0.6977777777777778,
  'mrr': 0.5956666666666667},
 'ModernBERT-base_triples': {'hit_rate': 0.6555555555555556,
  'mrr': 0.5544074074074075}}

# Evaluate model: 4: kl3m-doc-pico-001 model finetuned with opinion-relevant-irrelevant triples

In [23]:
%%time

# Load the kl3m-doc-pico-001 checkpoint (MODELS[4]).
model_name = MODELS[4]
model = SentenceTransformer(model_name)
# evaluate_model reads model.name to key its metrics in EVALS; use the repo name without the owner prefix.
model.name = model_name.split("/")[-1]
print(model_name)

# Tokenizer from the same checkpoint, used to count tokens while chunking.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# `tag` is left at its default, so no "search_query: " / "search_document: " prefix is added.
# Note: the embedding columns in the shared `df` are overwritten in place.
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/40.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/160M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/987 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

rachelFLP/kl3m-doc-pico-001_triples


Token indices sequence length is longer than the specified maximum sequence length for this model (510 > 509). Running this sequence through the model will result in indexing errors


CPU times: user 44.2 s, sys: 2.8 s, total: 47 s
Wall time: 47.7 s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[0.060723744, 0.49513876, 0.020905495, 0.7602...","[[0.2881784, -0.39738512, 0.103872284, 0.25998...",True,0.5,"[445, 0, 326, 445, 381]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.13346417, -0.4045122, -0.5191633, 0.085490...","[[-0.4172397, -0.27522707, 0.29064676, -0.1050...",False,0.0,"[409, 240, 283, 351, 408]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.32489958, 0.08828222, 0.6556719, 0.877495...","[[-0.73382765, -0.034172483, 0.16630241, 0.240...",False,0.0,"[329, 342, 440, 268, 50]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[-0.06835629, 0.1924952, 0.31833044, 0.649159...","[[-0.5483453, 0.08277343, 0.20760824, 0.118178...",False,0.0,"[87, 292, 226, 335, 195]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-0.3780039, 0.2177974, 0.11950155, 0.8910064...","[[-0.36101142, -0.15517639, 0.30730638, -0.228...",False,0.0,"[130, 226, 317, 151, 335]",4


In [24]:
# Save the per-query results under evals/, in a file named after the model.
# NOTE(review): the embedding columns hold NumPy arrays, which to_csv writes as text -- confirm they are not meant to be reloaded from this file.
result.to_csv(f"{path}evals/{model.name}.csv", index=False)
# Number of evaluated rows, shown as a sanity check.
len(result)

450

In [25]:
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852},
 'roberta-base_triples': {'hit_rate': 0.6911111111111111,
  'mrr': 0.5718148148148147},
 'mpnet-base_triples': {'hit_rate': 0.6977777777777778,
  'mrr': 0.5956666666666667},
 'ModernBERT-base_triples': {'hit_rate': 0.6555555555555556,
  'mrr': 0.5544074074074075},
 'kl3m-doc-pico-001_triples': {'hit_rate': 0.4066666666666667,
  'mrr': 0.29014814814814816}}

# Evaluate model: 5: kl3m-doc-nano-001 model finetuned with opinion-relevant-irrelevant triples

In [26]:
%%time

# Load the kl3m-doc-nano-001 checkpoint (MODELS[5]).
model_name = MODELS[5]
model = SentenceTransformer(model_name)
# evaluate_model reads model.name to key its metrics in EVALS; use the repo name without the owner prefix.
model.name = model_name.split("/")[-1]
print(model_name)

# Tokenizer from the same checkpoint, used to count tokens while chunking.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# `tag` is left at its default, so no "search_query: " / "search_document: " prefix is added.
# Note: the embedding columns in the shared `df` are overwritten in place.
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/40.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/338M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/987 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

rachelFLP/kl3m-doc-nano-001_triples


Token indices sequence length is longer than the specified maximum sequence length for this model (510 > 509). Running this sequence through the model will result in indexing errors


CPU times: user 49.3 s, sys: 3.64 s, total: 52.9 s
Wall time: 52.8 s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.38035473, 0.1704803, -0.23296735, -0.7904...","[[-0.24644323, -0.08973298, 0.2081646, -0.2748...",True,1.0,"[0, 125, 445, 243, 237]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.42892933, 0.18428469, 1.1994188, -0.436067...","[[0.19492158, 0.20310304, 0.29263204, -0.10553...",True,1.0,"[1, 1, 1, 226, 112]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.74950534, -0.66120434, 1.3440684, 0.04810...","[[-0.20073524, -0.3008011, 0.595577, 0.61568, ...",False,0.0,"[165, 124, 382, 165, 165]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.059686016, -0.47313514, 0.38379338, 0.2725...","[[-0.07853746, -0.26830977, 0.32756826, 0.6038...",True,0.2,"[48, 64, 324, 296, 3]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-1.3920789, 0.174164, -0.050501734, 0.025144...","[[-0.11923864, 0.13029405, 0.39074862, -0.0694...",True,1.0,"[4, 4, 55, 71, 234]",4


In [27]:
# Save the per-query results under evals/, in a file named after the model.
# NOTE(review): the embedding columns hold NumPy arrays, which to_csv writes as text -- confirm they are not meant to be reloaded from this file.
result.to_csv(f"{path}evals/{model.name}.csv", index=False)
# Number of evaluated rows, shown as a sanity check.
len(result)

450

In [28]:
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852},
 'roberta-base_triples': {'hit_rate': 0.6911111111111111,
  'mrr': 0.5718148148148147},
 'mpnet-base_triples': {'hit_rate': 0.6977777777777778,
  'mrr': 0.5956666666666667},
 'ModernBERT-base_triples': {'hit_rate': 0.6555555555555556,
  'mrr': 0.5544074074074075},
 'kl3m-doc-pico-001_triples': {'hit_rate': 0.4066666666666667,
  'mrr': 0.29014814814814816},
 'kl3m-doc-nano-001_triples': {'hit_rate': 0.6266666666666667,
  'mrr': 0.5165185185185185}}

# Evaluate model: 6: kl3m-doc-micro-uncased-001 model finetuned with opinion-relevant-irrelevant triples

In [29]:
%%time

# Load the kl3m-doc-micro-uncased-001 checkpoint (MODELS[6]).
model_name = MODELS[6]
model = SentenceTransformer(model_name)
# evaluate_model reads model.name to key its metrics in EVALS; use the repo name without the owner prefix.
model.name = model_name.split("/")[-1]
print(model_name)

# Tokenizer from the same checkpoint, used to count tokens while chunking.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# `tag` is left at its default, so no "search_query: " / "search_document: " prefix is added.
# Note: the embedding columns in the shared `df` are overwritten in place.
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/40.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/987 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

rachelFLP/kl3m-doc-micro-uncased-001_triples


Token indices sequence length is longer than the specified maximum sequence length for this model (510 > 509). Running this sequence through the model will result in indexing errors


CPU times: user 2min 30s, sys: 3.8 s, total: 2min 34s
Wall time: 2min 42s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[0.06609806, 0.15650739, 0.8845874, -0.456246...","[[0.30756426, -0.10917255, 0.6478285, -0.48896...",True,1.0,"[0, 445, 445, 125, 266]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[-0.31961763, -2.0474916, 0.0704616, 0.235030...","[[-0.038479112, -0.81549394, 0.09890802, -0.35...",True,1.0,"[1, 1, 226, 1, 112]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[0.60730964, -1.3598708, -0.2352957, 0.014897...","[[0.5927514, 0.38098848, 0.74978983, 0.1108560...",False,0.0,"[124, 95, 165, 124, 425]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.22543006, -0.5753912, 0.33718637, -0.58187...","[[0.78908575, 0.03484965, 1.1539915, -0.096919...",False,0.0,"[292, 172, 432, 167, 280]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.35426646, 0.6224582, -0.16497678, -0.45978...","[[0.043369133, 0.04450633, 0.18164057, -0.7423...",True,1.0,"[4, 4, 4, 377, 180]",4


In [30]:
# Save the per-query results under evals/, in a file named after the model.
# NOTE(review): the embedding columns hold NumPy arrays, which to_csv writes as text -- confirm they are not meant to be reloaded from this file.
result.to_csv(f"{path}evals/{model.name}.csv", index=False)
# Number of evaluated rows, shown as a sanity check.
len(result)

450

In [31]:
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852},
 'roberta-base_triples': {'hit_rate': 0.6911111111111111,
  'mrr': 0.5718148148148147},
 'mpnet-base_triples': {'hit_rate': 0.6977777777777778,
  'mrr': 0.5956666666666667},
 'ModernBERT-base_triples': {'hit_rate': 0.6555555555555556,
  'mrr': 0.5544074074074075},
 'kl3m-doc-pico-001_triples': {'hit_rate': 0.4066666666666667,
  'mrr': 0.29014814814814816},
 'kl3m-doc-nano-001_triples': {'hit_rate': 0.6266666666666667,
  'mrr': 0.5165185185185185},
 'kl3m-doc-micro-uncased-001_triples': {'hit_rate': 0.5733333333333334,
  'mrr': 0.4606296296296296}}

# Evaluate model: 7: kl3m-doc-small-uncased-001 model finetuned with opinion-relevant-irrelevant triples

In [32]:
%%time

model_name = MODELS[7]
model = SentenceTransformer(model_name)
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer)

result.head()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/40.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/987 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

rachelFLP/kl3m-doc-small-uncased-001_triples
CPU times: user 1min 44s, sys: 6.15 s, total: 1min 50s
Wall time: 1min 59s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.09710871, 0.78138417, -0.50889087, -0.394...","[[0.751955, 0.5351439, -0.12059396, -0.1412263...",True,0.5,"[320, 0, 125, 381, 445]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[1.0511086, -0.39643735, 0.33842674, 0.334106...","[[0.22945194, -0.2900359, 0.021890197, -0.0138...",True,1.0,"[1, 1, 1, 112, 418]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[0.19784157, 0.3538547, 1.1340129, -0.1195772...","[[0.08507964, 0.25164342, 0.53938717, 0.953317...",False,0.0,"[124, 165, 165, 425, 382]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.59324825, 0.09211266, -0.034152877, -0.140...","[[0.3348897, 0.025593862, 0.40629455, 0.324572...",False,0.0,"[292, 292, 48, 298, 335]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.15157337, 0.8240001, -0.039047275, -0.0077...","[[-0.26888886, -0.13873287, 0.064824596, 0.006...",True,1.0,"[4, 4, 4, 4, 55]",4


In [33]:
# Save the current model's per-row evaluation results under <path>evals/,
# in a file named after the model, without writing the DataFrame index.
# NOTE(review): the embedding columns hold nested arrays; CSV stores them as
# text, so they may not round-trip on reload — confirm whether that matters.
result.to_csv(path_or_buf=f"{path}evals/{model.name}.csv", index=False)

# Number of evaluated rows (shown as the cell output).
result.shape[0]

450

In [34]:
# Show the running EVALS dict: one entry per evaluated model, each holding
# that model's `hit_rate` and `mrr`.
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852},
 'roberta-base_triples': {'hit_rate': 0.6911111111111111,
  'mrr': 0.5718148148148147},
 'mpnet-base_triples': {'hit_rate': 0.6977777777777778,
  'mrr': 0.5956666666666667},
 'ModernBERT-base_triples': {'hit_rate': 0.6555555555555556,
  'mrr': 0.5544074074074075},
 'kl3m-doc-pico-001_triples': {'hit_rate': 0.4066666666666667,
  'mrr': 0.29014814814814816},
 'kl3m-doc-nano-001_triples': {'hit_rate': 0.6266666666666667,
  'mrr': 0.5165185185185185},
 'kl3m-doc-micro-uncased-001_triples': {'hit_rate': 0.5733333333333334,
  'mrr': 0.4606296296296296},
 'kl3m-doc-small-uncased-001_triples': {'hit_rate': 0.5933333333333334,
  'mrr': 0.48740740740740734}}

# Evaluate model: 8: modernbert-embed-base model (off-the-shelf baseline, not finetuned)

In [35]:
%%time

model_name = MODELS[8]
model = SentenceTransformer(model_name)
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer, ["query", "opinion"])

result.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/445k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

nomic-ai/modernbert-embed-base
CPU times: user 2min 16s, sys: 5.09 s, total: 2min 21s
Wall time: 2min 22s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.013583629, 0.012640299, -0.05796513, -0.0...","[[0.019464662, 0.028625114, -0.04140967, -0.00...",True,1.0,"[0, 51, 266, 381, 387]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.020469768, -0.053523075, -0.012231524, -0....","[[0.047770616, -0.007053333, -0.014405092, -0....",True,1.0,"[1, 1, 1, 15, 439]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[0.0061971527, -0.031037176, -0.0077903317, -...","[[0.05020914, -0.0071834656, 0.037420075, -0.0...",False,0.0,"[165, 300, 165, 165, 425]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.06419095, 0.042688653, -0.0039515845, -0.0...","[[0.0555451, 0.016345548, 0.020141857, -0.0514...",True,1.0,"[3, 124, 428, 300, 257]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[0.0064177313, -0.028184146, -0.0533674, -0.0...","[[0.035457145, 0.024352847, 0.0056594135, -0.0...",True,1.0,"[4, 4, 4, 4, 4]",4


In [36]:
# Save the current model's per-row evaluation results under <path>evals/,
# in a file named after the model, without writing the DataFrame index.
# NOTE(review): the embedding columns hold nested arrays; CSV stores them as
# text, so they may not round-trip on reload — confirm whether that matters.
result.to_csv(path_or_buf=f"{path}evals/{model.name}.csv", index=False)

# Number of evaluated rows (shown as the cell output).
result.shape[0]

450

In [37]:
# Show the running EVALS dict: one entry per evaluated model, each holding
# that model's `hit_rate` and `mrr`.
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852},
 'roberta-base_triples': {'hit_rate': 0.6911111111111111,
  'mrr': 0.5718148148148147},
 'mpnet-base_triples': {'hit_rate': 0.6977777777777778,
  'mrr': 0.5956666666666667},
 'ModernBERT-base_triples': {'hit_rate': 0.6555555555555556,
  'mrr': 0.5544074074074075},
 'kl3m-doc-pico-001_triples': {'hit_rate': 0.4066666666666667,
  'mrr': 0.29014814814814816},
 'kl3m-doc-nano-001_triples': {'hit_rate': 0.6266666666666667,
  'mrr': 0.5165185185185185},
 'kl3m-doc-micro-uncased-001_triples': {'hit_rate': 0.5733333333333334,
  'mrr': 0.4606296296296296},
 'kl3m-doc-small-uncased-001_triples': {'hit_rate': 0.5933333333333334,
  'mrr': 0.48740740740740734},
 'modernbert-embed-base': {'hit_rate': 0.8355555555555556,
  'mrr': 0.7802962962962963}}

# Evaluate model: 9: modernbert-embed-base model finetuned with opinion-relevant-irrelevant triples

In [38]:
%%time

model_name = MODELS[9]
model = SentenceTransformer(model_name)
model.name = model_name.split("/")[-1]
print(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
result = evaluate_model(df, model, "relevant_query_qstn", "query_embeddings", "opinion", "opinion_embeddings", tokenizer, ["query", "opinion"])

result.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/41.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Free-Law-Project/modernbert-embed-base_finetune_512
CPU times: user 2min 16s, sys: 5.25 s, total: 2min 21s
Wall time: 2min 21s


Unnamed: 0,opinion_id,opinion_word_count,opinion,relevant_query_qstn,query_embeddings,opinion_embeddings,is_hit,mrr,retrieved,expected
0,8122327,241,"Benedict, D. J. Upon the testimony there- is ...",What are the contract terms determined by the ...,"[[-0.014296135, 0.023057982, -0.048878007, 0.0...","[[0.015871974, 0.018732293, -0.024121562, 0.06...",True,1.0,"[0, 266, 51, 266, 381]",0
1,2429336,1021,154 F. Supp. 2d 906 (2000) Margaret L. SUTPHIN...,What are the legal requirements for a claim of...,"[[0.010450049, -0.067059316, 0.002360706, 0.00...","[[0.0005678711, -0.0399566, -0.00022061473, 0....",True,1.0,"[1, 1, 1, 112, 15]",1
2,2939558,320,IN THE COURT OF CRIMINAL APPEALS OF TEXAS NO. ...,What factors are considered in determining ine...,"[[-0.001758455, -0.04744919, -0.021128822, -0....","[[0.0073035755, -0.02433301, 0.06404363, -2.13...",False,0.0,"[165, 165, 165, 425, 124]",2
3,4666766,84,DISTRICT COURT OF APPEAL OF THE STATE OF FLOR...,What were the grounds for Roger Fincher's appe...,"[[0.06206147, 0.039539136, -0.0043731686, -0.0...","[[0.026955968, -0.012011581, 0.02601028, -0.00...",True,1.0,"[3, 300, 55, 55, 428]",3
4,1359066,4985,703 P.2d 699 (1985) 108 Idaho 935 Marcilena SH...,What criteria did the Idaho Supreme Court use ...,"[[-0.0027054045, -0.031128146, -0.06615934, -0...","[[-0.03295222, 0.060800306, -0.06236514, -0.01...",True,1.0,"[4, 4, 4, 4, 4]",4


In [39]:
# Save the current model's per-row evaluation results under <path>evals/,
# in a file named after the model, without writing the DataFrame index.
# NOTE(review): the embedding columns hold nested arrays; CSV stores them as
# text, so they may not round-trip on reload — confirm whether that matters.
result.to_csv(path_or_buf=f"{path}evals/{model.name}.csv", index=False)

# Number of evaluated rows (shown as the cell output).
result.shape[0]

450

In [40]:
# Show the running EVALS dict: one entry per evaluated model, each holding
# that model's `hit_rate` and `mrr`.
EVALS

{'bert-base-cased_triples': {'hit_rate': 0.6133333333333333,
  'mrr': 0.5171851851851852},
 'roberta-base_triples': {'hit_rate': 0.6911111111111111,
  'mrr': 0.5718148148148147},
 'mpnet-base_triples': {'hit_rate': 0.6977777777777778,
  'mrr': 0.5956666666666667},
 'ModernBERT-base_triples': {'hit_rate': 0.6555555555555556,
  'mrr': 0.5544074074074075},
 'kl3m-doc-pico-001_triples': {'hit_rate': 0.4066666666666667,
  'mrr': 0.29014814814814816},
 'kl3m-doc-nano-001_triples': {'hit_rate': 0.6266666666666667,
  'mrr': 0.5165185185185185},
 'kl3m-doc-micro-uncased-001_triples': {'hit_rate': 0.5733333333333334,
  'mrr': 0.4606296296296296},
 'kl3m-doc-small-uncased-001_triples': {'hit_rate': 0.5933333333333334,
  'mrr': 0.48740740740740734},
 'modernbert-embed-base': {'hit_rate': 0.8355555555555556,
  'mrr': 0.7802962962962963},
 'modernbert-embed-base_finetune_512': {'hit_rate': 0.8666666666666667,
  'mrr': 0.7909999999999999}}