# Load our Example Embedding Model

In [1]:
# prompt: import bert base embedding model from hugging face

from transformers import BertModel, BertTokenizer
import torch

# Load the pre-trained tokenizer and model (files are downloaded on first use).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example usage: encode one sentence and run a single forward pass.
text = "This is a sample sentence."
encoded_input = tokenizer(text, return_tensors='pt')
# Inference only: disable gradient tracking, exactly as get_embedding does
# further down, so no autograd graph is built for this demo call.
with torch.no_grad():
    output = model(**encoded_input)

# The output object exposes `last_hidden_state` and `pooler_output` as attributes.
last_hidden_states = output.last_hidden_state
pooler_output = output.pooler_output

print("Last hidden states shape:", last_hidden_states.shape)
print("Pooler output shape:", pooler_output.shape)


2025-09-13 02:09:47.687046: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-13 02:09:47.938457: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-13 02:09:48.016127: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-13 02:09:48.512492: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'



AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Last hidden states shape: torch.Size([1, 8, 768])
Pooler output shape: torch.Size([1, 768])


In [2]:
# prompt: now compute cosine similarity between three texts (1 relevant pair and 1 irrelevant text)

from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer
import torch

# Load pre-trained model and tokenizer.
# NOTE(review): this repeats the 'bert-base-uncased' load already done in the
# first cell. It keeps this cell self-contained, but it is redundant when the
# notebook is run top to bottom.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_embedding(text):
    """Return the [CLS]-position hidden state of `text` as a NumPy array.

    The text is tokenized with padding and truncation enabled, then passed
    through the module-level `model` with gradient tracking disabled.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state
    # The vector at position 0 (the CLS token) serves as the sentence embedding.
    cls_vector = hidden_states[:, 0, :]
    return cls_vector.numpy()

# Two sentences about pleasant weather, plus one unrelated sentence.
text1 = "The weather is nice today."
text2 = "It's a beautiful day outside."
text3 = "The stock market crashed yesterday."

# Embed the three sentences, in order.
embedding1, embedding2, embedding3 = (
    get_embedding(sentence) for sentence in (text1, text2, text3)
)

# Pairwise cosine similarities; entry [0, 0] of each result is the score.
similarity_1_2 = cosine_similarity(embedding1, embedding2)[0, 0]
similarity_1_3 = cosine_similarity(embedding1, embedding3)[0, 0]
similarity_2_3 = cosine_similarity(embedding2, embedding3)[0, 0]

# Report each pair with four decimals.
print(f"Cosine similarity between text1 and text2: {similarity_1_2:.4f}")
print(f"Cosine similarity between text1 and text3: {similarity_1_3:.4f}")
print(f"Cosine similarity between text2 and text3: {similarity_2_3:.4f}")

Cosine similarity between text1 and text2: 0.9522
Cosine similarity between text1 and text3: 0.8442
Cosine similarity between text2 and text3: 0.8108


# Set OPENAI API KEY

In [1]:
import os
from getpass import getpass

# SECURITY: never store the API key in the notebook. It is read from the
# environment when already set; otherwise it is requested interactively
# without echoing. Any key that was previously committed in this cell must be
# treated as leaked and revoked.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')


In [2]:
import ssl
# WARNING: this swaps the stdlib's default HTTPS context factory for the
# unverified one, so HTTPS connections created through it afterwards skip TLS
# certificate verification for the rest of the kernel session.
# NOTE(review): presumably a workaround for a local certificate problem during
# a download — confirm. Prefer fixing the CA bundle and deleting this line.
ssl._create_default_https_context = ssl._create_unverified_context
import checklist_plus
from checklist_plus.editor import Editor
from checklist_plus.perturb import LLMPerturb

In [3]:
# Show the installed checklist_plus version for reproducibility.
checklist_plus.__version__

'0.2.0'

In [3]:
# LLM-backed editor; the template and paraphrase cells below all call it.
llm_editor = Editor(use_llm=True, model_name='gpt-4o-mini')

## Generate Examples data

In [5]:
# Fill the {mask} slot of the template through the LLM editor.
ret = llm_editor.template(
    'The football game was very good, I especially liked {mask}',
    context="different experiences in football games",
    remove_duplicates=True,
    n_completions=100,
)
# The generated sentences live on the result's `data` attribute.
original_texts = ret.data

In [6]:
# De-duplicate while keeping first-seen order. list(set(...)) would order the
# texts differently on every kernel start (string hashing is randomized), so
# the previews and example rows below could not be reproduced.
original_texts = list(dict.fromkeys(original_texts))
print(original_texts[:5])

["The football game was very good, I especially liked match's highlights", "The football game was very good, I especially liked defender's block", "The football game was very good, I especially liked goal's significance", "The football game was very good, I especially liked goalpost's impact", "The football game was very good, I especially liked game's review"]


In [7]:
# Number of unique generated texts left after de-duplication.
len(original_texts)

94

## Paraphrase Example Data

In [8]:
# Paraphrase every original text through the LLM editor.
# NOTE(review): presumably yields one paraphrase per input (n_paraphrases=1);
# a later cell asserts that the lengths match.
ret = llm_editor.paraphrase_llm(original_texts, n_paraphrases=1, length_preference='similar')

In [9]:
# Keep the paraphrases stored on the result's `data` attribute.
paraphrased_texts = ret.data

In [10]:
# Guard the index alignment: these lists are zip()-ed together later, and
# zip() would silently drop unmatched trailing items instead of failing.
assert len(paraphrased_texts) == len(original_texts)

In [11]:
# Preview the first five paraphrases.
paraphrased_texts[:5]

['The soccer match was excellent; I particularly enjoyed the highlights of the game.',
 "The soccer match was quite impressive; I particularly enjoyed the defender's tackle.",
 'The soccer match was quite enjoyable; I particularly appreciated the importance of the goal.',
 'The soccer match was excellent, and I particularly appreciated the influence of the goalposts.',
 'The soccer match was excellent, and I particularly enjoyed the analysis of the game.']

## Negate Example Data

In [12]:
# Perturbation helper used for the negation step in the next cell.
perturb = LLMPerturb()

In [13]:
# Produce negated variants of every original text.
# NOTE(review): presumably one variant per text (n_variations=1) — confirm.
ret = perturb.add_negation_llm(original_texts, n_variations=1)

In [14]:
# Keep only the first negated variant returned for each original text.
negated_texts = [variants[0] for variants in ret]

In [15]:
# Preview the first five negated texts.
negated_texts[:5]

["The football game was not very good; I did not especially like the match's highlights.",
 "The football game was not very good; I did not especially like the defender's block.",
 'The football game was not very good, and I did not like the significance of the goal.',
 "The football game was not very good; I did not like the goalpost's impact at all.",
 "The football game was not very good; I did not especially like the game's review."]

# Perform a simple MFT (Minimum Functionality Test)


In [24]:
from checklist_plus.test_types import MFT, INV, DIR
from checklist_plus.expect import Expect

In [25]:
def similar_paraphrase(x, pred, conf, label=None, meta=None):
    """Expectation: the original text is closer to its paraphrase.

    Passes when `pred` equals 0, the position of the paraphrase similarity in
    the rows built by get_cosine_similarities. The other arguments are accepted
    but not used.
    """
    paraphrase_index = 0
    return pred == paraphrase_index
# Wrap the per-example predicate so it can be passed as `expect=` to the tests below.
expect_fn = Expect.single(similar_paraphrase)

In [26]:
# Each example is an (original, paraphrase, negation) triple.
test = MFT(
    list(zip(original_texts, paraphrased_texts, negated_texts)),
    expect=expect_fn,
    name='Simple negation',
    capability='Negation',
    description='Very simple negations.',
)

In [27]:
import numpy as np
def get_cosine_similarities(data):
    """Score (original, paraphrased, negated) triples by cosine similarity.

    Args:
        data: iterable of 3-tuples of texts ``(original, paraphrased, negated)``.

    Returns:
        A ``(predictions, similarities)`` tuple. ``similarities`` has one row
        ``[sim(original, paraphrased), sim(original, negated)]`` per triple.
        ``predictions`` holds, per row, the column index of the larger value
        (0 = the paraphrase is closer, 1 = the negation is closer). Empty
        ``data`` yields empty arrays of shape ``(0,)`` and ``(0, 2)``.
    """
    rows = []
    for original, paraphrased, negated in data:
        original_embedding = get_embedding(original)
        paraphrased_embedding = get_embedding(paraphrased)
        negated_embedding = get_embedding(negated)

        sim_paraphrased = cosine_similarity(original_embedding, paraphrased_embedding)[0][0]
        sim_negated = cosine_similarity(original_embedding, negated_embedding)[0][0]

        rows.append([sim_paraphrased, sim_negated])
    # reshape(-1, 2) keeps the array two-dimensional even when there are no
    # rows; without it np.argmax raises ValueError on empty input.
    similarities = np.array(rows).reshape(-1, 2)
    return np.argmax(similarities, axis=-1), similarities

# Smoke-test the scorer on the first five triples only.
cosine_sims = get_cosine_similarities(list(zip(original_texts, paraphrased_texts, negated_texts))[:5])

In [28]:
# NOTE(review): cosine_sims is the (predictions, similarities) 2-tuple returned
# by get_cosine_similarities, so [:5] slices the tuple itself and is a no-op.
print(cosine_sims[:5])


(array([1, 1, 1, 1, 1]), array([[0.9449084 , 0.94528353],
       [0.94997364, 0.95709735],
       [0.9254709 , 0.9529028 ],
       [0.8882368 , 0.8951863 ],
       [0.9117462 , 0.92221034]], dtype=float32))


In [29]:
# Run the test, with get_cosine_similarities supplying the predictions.
test.run(get_cosine_similarities)

Predicting 99 examples


In [30]:
# bert-base-uncased is not sensitive to negations: a case fails when the
# negated text is more similar to the original than the paraphrase is.
test.summary()

Test cases:      99
Fails (rate):    86 (86.9%)

Example fails:
0.9 ("The football game was very good, I especially liked midfielder's pass", "The soccer match was excellent; I particularly appreciated the midfielder's assist.", "The football game was not very good, and I did not like the midfielder's pass at all.")
----
0.9 ("The football game was very good, I especially liked player's creativity", 'The soccer match was excellent, and I particularly appreciated the creativity displayed by the players.', "The football game was not very good; I did not like the player's creativity at all.")
----
0.9 ("The football game was very good, I especially liked fan's celebration", 'The soccer match was excellent, and I particularly enjoyed how the fans celebrated.', "The football game was not good, and I did not particularly like the fans' celebration.")
----


# Let's test whether the embedding model understands sizes in our marketplace

## Generate data

In [16]:
# Fill the {mask} slot of the shoe-query template through the LLM editor.
ret = llm_editor.template(
    'red {mask} shoes US 11 size ',
    context="different shoe brands",
    remove_duplicates=True,
    n_completions=100,
)
# The generated queries live on the result's `data` attribute.
user_queries = ret.data

In [17]:
# De-duplicate while keeping first-seen order. list(set(...)) would order the
# queries differently on every kernel start (string hashing is randomized), so
# the previews and example rows below could not be reproduced.
user_queries = list(dict.fromkeys(user_queries))
print(user_queries[:5])

['red Nike shoes US 11 size ', 'red Brooks Ghost shoes US 11 size ', 'red Asics Gel shoes US 11 size ', 'red Louboutin Louis shoes US 11 size ', 'red Adidas shoes US 11 size ']


In [18]:
# Number of unique shoe queries left after de-duplication.
len(user_queries)

80

In [19]:
# Paraphrase each query with length_preference='longer'; the results are used
# as item titles in the next cell.
ret = llm_editor.paraphrase_llm(user_queries, n_paraphrases=1, length_preference='longer')

In [20]:
# The paraphrased queries play the role of matching item titles.
item_titles = ret.data
print(item_titles[:5])

['A pair of size 11 red Nike sneakers from the United States.', 'Size 11 in the United States for the red Brooks Ghost running shoes.', 'A pair of size 11 US red Asics Gel sneakers.', 'A pair of red Louis Vuitton shoes in a size 11 for men in the United States.', 'A pair of red Adidas sneakers in a size 11 for men in the United States.']


In [21]:
# Guard the index alignment: titles and queries are zip()-ed together later,
# and zip() would silently drop unmatched trailing items instead of failing.
assert len(item_titles) == len(user_queries)

In [22]:
import spacy
# Small English spaCy pipeline; its parsed documents are fed to
# LLMPerturb.perturb in the next cell.
nlp = spacy.load('en_core_web_sm')

In [23]:
# Apply the change_number perturbation to the parsed item titles so that each
# title gets a variant with a different number.
ret = LLMPerturb.perturb(
    list(nlp.pipe(item_titles)),
    LLMPerturb.change_number,
    n=1,
    keep_original=False,
)
# Keep the first perturbed variant of every title as the negative example.
item_titles_negative = [variants[0] for variants in ret.data]

In [24]:
# Every title must have received a perturbed counterpart; otherwise the
# positional pairing used by the test below would be off.
assert len(item_titles) == len(item_titles_negative)

In [25]:
# Preview the first three queries, negative titles, and matching titles (in that order).
print(user_queries[:3], item_titles_negative[:3], item_titles[:3])

['red Nike shoes US 11 size ', 'red Brooks Ghost shoes US 11 size ', 'red Asics Gel shoes US 11 size '] ['A pair of size 9 red Nike sneakers from the United States.', 'Size 12 in the United States for the red Brooks Ghost running shoes.', 'A pair of size 14 US red Asics Gel sneakers.'] ['A pair of size 11 red Nike sneakers from the United States.', 'Size 11 in the United States for the red Brooks Ghost running shoes.', 'A pair of size 11 US red Asics Gel sneakers.']


## Run the test

In [187]:
# Each example is a (query, matching title, wrong-size title) triple.
test = MFT(
    list(zip(user_queries, item_titles, item_titles_negative)),
    expect=expect_fn,
    name='Size change',
    capability='Size',
    description='Very simple size understanding check.',
)

In [188]:
# Run the size test, with get_cosine_similarities supplying the predictions.
test.run(get_cosine_similarities)

Predicting 100 examples


In [189]:
# In the recorded run bert-base-uncased fails 45% of the time, even though the
# two item titles are identical apart from the size number. A case fails when
# the wrong-size title is more similar to the query than the matching title is.
test.summary()

Test cases:      100
Fails (rate):    45 (45.0%)

Example fails:
0.9 ("red Palladium's shoes US 11 size ", 'A pair of red Palladium sneakers in size US 11.', 'A pair of red Palladium sneakers in size US 10.')
----
0.9 ("red Salomon's shoes US 11 size ", 'A pair of red Salomon shoes in size US 11.', 'A pair of red Salomon shoes in size US 12.')
----
0.9 ('red Etnies shoes US 11 size ', 'Crimson Etnies footwear in a US size 11.', 'Crimson Etnies footwear in a US size 13.')
----


In [13]:
# Fill both {mask} slots of the phone-query template through the LLM editor.
ret = llm_editor.template(
    '{mask} phone color of {mask}',
    context="mask1 is for brands and mask2 is for colors",
    remove_duplicates=True,
    n_completions=100,
)
# The generated queries live on the result's `data` attribute.
user_queries = ret.data

In [14]:
# De-duplicate while keeping first-seen order. list(set(...)) would order the
# queries differently on every kernel start (string hashing is randomized), so
# the previews and example rows below could not be reproduced.
user_queries = list(dict.fromkeys(user_queries))
print(user_queries[:5])

['HTC phone color of khaki', 'OnePlus phone color of citrine', 'Xiaomi phone color of jade', 'HTC phone color of gold', 'Sony phone color of white']


In [22]:
# Number of unique phone queries left after de-duplication.
len(user_queries)

85

In [15]:
# Few-shot (input, paraphrase) pairs in which the color name is replaced by a
# description of the color.
examples = [
    ("Motorola phone color crimson", "Motorola phone of a rich deep red colour"),
    ("Sharp phone color coffee", "Sharp phone of a deep brown colour"),
    ("Samsung phone color white", "Samsung phone of a pure bright colour"),
]
# Paraphrase every query, passing the examples above along with the request.
ret = llm_editor.paraphrase_llm(
    user_queries,
    n_paraphrases=1,
    length_preference='similar',
    context="only colors",
    examples=examples,
)
# The paraphrases serve as the matching item titles.
item_titles = ret.data
print(item_titles[:5])

Paraphrase prompt template:
You are a paraphrasing expert. Generate paraphrases of the given text.
Focus on paraphrasing text related to: {context}. Try to use terminology and phrasing common in this domain.
Generate exactly {n_paraphrases} unique paraphrases that preserve the original meaning while using different words and sentence structures.
Each paraphrase should be:
- Each paraphrase must preserve the original meaning
- Use different vocabulary and sentence structures
- Preserve context and domain relevance
- Make each paraphrase unique and natural
- {length_instruction}

Here are some examples of how to respond:

Motorola phone color crimson -> Motorola phone of a rich deep red colour

Sharp phone color coffee -> Sharp phone of a deep brown colour

Samsung phone color white -> Samsung phone of a pure bright colour


Text to paraphrase: {text}
Consider synonyms, sentence restructuring, and varying lengths before you respond.
Provide the paraphrases as a list. Example:
["Paraphras

['HTC phone in a muted olive hue', 'OnePlus phone in a vibrant yellow hue', 'Xiaomi phone in a shade of green resembling jade', 'HTC phone in a luxurious shade of gold', 'Sony phone in a brilliant ivory shade']


In [16]:
# Detect color mentions in each query and replace them with the empty mask
# token, i.e. strip them out (see the 'masked_text' values displayed below).
ret = llm_editor.detect_and_mask_entities_llm(user_queries, entity_type="COLORS", mask_token="")

In [24]:
# Masked texts of the queries in which an entity was detected.
item_titles_no_colors = [
    entry["masked_text"] for entry in ret.data if entry["contains_entities"]
]
# (query, paraphrased title, color-stripped query) triples, restricted to the
# same entries and paired by position.
data = [
    (user_queries[idx], item_titles[idx], entry["masked_text"])
    for idx, entry in enumerate(ret.data)
    if entry["contains_entities"]
]

In [25]:
# Number of triples kept, i.e. entries flagged with "contains_entities".
len(data)

81

In [26]:
# Preview the first three (query, title, masked query) triples.
data[:3]

[('HTC phone color of khaki',
  'HTC phone in a muted olive hue',
  'HTC phone color of '),
 ('OnePlus phone color of citrine',
  'OnePlus phone in a vibrant yellow hue',
  'OnePlus phone color of '),
 ('Xiaomi phone color of jade',
  'Xiaomi phone in a shade of green resembling jade',
  'Xiaomi phone color of ')]