In [1]:
%pip install transformers sentence-transformers datasets 



In [2]:
from transformers import pipeline, set_seed, AutoTokenizer, AutoModel
from datasets import load_dataset, load_from_disk, load_metric
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import DataLoader
import nltk
from nltk.tokenize import sent_tokenize
import tqdm

from sentence_transformers import SentenceTransformer, models, InputExample, losses, evaluation, util


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


In [30]:
# Load pretrained model 
model_ckpt = "msmarco-distilbert-base-v4"

sbert_model = SentenceTransformer(model_ckpt)

word_embedding_model = sbert_model._first_module()

tokens = ["[OFF] ", " [RN] ", " [CN] ", " [PCN] "]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))


Embedding(30526, 768)

In [33]:
sbert_model.tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


DistilBertTokenizerFast(name_or_path='C:\Users\shrin/.cache\torch\sentence_transformers\sentence-transformers_msmarco-distilbert-base-v4\', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30522: AddedToken("[OFF] ", rstrip=False

In [7]:
# Load dataset
dataset_id ="../data/processed/synthetic_pair/synthetic_search_data/" 
#dataset = load_dataset("shriadke/FetchSearch")  # shriadke/fetch-search-msmarco-distilbert-base-v4
dataset = load_from_disk(dataset_id)
dataset

DatasetDict({
    train: Dataset({
        features: ['offer_ext', 'search_query', 'score'],
        num_rows: 7314
    })
    val: Dataset({
        features: ['offer_ext', 'search_query', 'score'],
        num_rows: 2814
    })
    test: Dataset({
        features: ['offer_ext', 'search_query', 'score'],
        num_rows: 1125
    })
})

In [10]:
split_lengths = [len(dataset[split])for split in dataset]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset['train'].column_names}")
print("Offer Embeddings:")

print(dataset["test"][1]["offer_ext"])

print("Queries:")

print(dataset["test"][1]["search_query"])

Split lengths: [7314, 2814, 1125]
Features: ['offer_ext', 'search_query', 'score']
Offer Embeddings:
[OFF] thomas bagel thins buy 2 [BN] thomas [CN] bread [CN] frozen breakfast [CN] bakery [PCN] pantry [PCN] deli  bakery
Queries:
meals


In [79]:
# Prepare Train DataLoader
train_examples = []
train_data = dataset['train']

n_examples = dataset['train'].num_rows# // 2# For agility we only 1/2 of our available data

for i in range(n_examples):
  example = train_data[i]
  train_examples.append(InputExample(texts=[example['offer_ext'], example['search_query']], label=float(example["score"])))
print(f"We have a {type(train_examples)} of length {len(train_examples)} containing {type(train_examples[0])}'s.")

We have a <class 'list'> of length 7314 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


In [80]:
# Prepare Val DataLoader
val_examples = []
val_data = dataset['val']

n_examples = dataset['val'].num_rows# // 2# For agility we only 1/2 of our available data

for i in range(n_examples):
  example = val_data[i]
  val_examples.append(InputExample(texts=[example['offer_ext'], example['search_query']], label=float(example["score"])))
print(f"We have a {type(val_examples)} of length {len(val_examples)} containing {type(val_examples[0])}'s.")

We have a <class 'list'> of length 2814 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


In [81]:
# Prepare Test DataLoader
test_examples = []
test_data = dataset['test']

n_examples = dataset['test'].num_rows# // 2# For agility we only 1/2 of our available data

for i in range(n_examples):
  example = test_data[i]
  test_examples.append(InputExample(texts=[example['offer_ext'], example['search_query']], label=float(example["score"])))
print(f"We have a {type(test_examples)} of length {len(test_examples)} containing {type(test_examples[0])}'s.")

We have a <class 'list'> of length 1125 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


In [82]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_examples, shuffle=True, batch_size=16)
test_dataloader = DataLoader(test_examples, shuffle=True, batch_size=16)

In [83]:
loss = losses.CosineSimilarityLoss(model=sbert_model)


In [84]:
# TRAINING ARGS
num_epochs = 2
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data
weight_decay = 0.01
output_path = "./models/"

In [85]:
val_evaluator = evaluation.EmbeddingSimilarityEvaluator([],[],[]).from_input_examples(examples=val_examples)
test_evaluator = evaluation.EmbeddingSimilarityEvaluator([],[],[]).from_input_examples(examples=test_examples)

In [86]:
# change test_dataloader while training
sbert_model.fit(train_objectives=[(test_dataloader, loss)], evaluator=val_evaluator, epochs = num_epochs, warmup_steps= warmup_steps, weight_decay=weight_decay, output_path= output_path)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/71 [00:00<?, ?it/s]

Iteration:   0%|          | 0/71 [00:00<?, ?it/s]

In [116]:
%huggingface-cli login

^C


In [117]:
pip install huggingface_hub





In [120]:
from huggingface_hub import login

In [121]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Store model to hub
model.save_to_hub(
    "shriadke/fetch-search-msmarco-distilbert-base-v4", 
    organization="",
    train_datasets=["shriadke/FetchSearch"],
    exist_ok=True, 
    )

In [6]:
import os

In [31]:
trained_sbert_model = SentenceTransformer("msmarco-distilbert-base-v4")

In [32]:
val_dataloader = torch.load("../artifacts/data_transformation/val.pth")
test_dataloader = torch.load("../artifacts/data_transformation/test.pth")

val_evaluator = torch.load("../artifacts/data_transformation/val_eval.pth")
test_evaluator = torch.load("../artifacts/data_transformation/test_eval.pth")

In [68]:
print(len(val_evaluator.sentences1),len(val_evaluator.sentences2),len(val_evaluator.scores))

4 4 4


In [None]:
# Here the bi-encoder will find top-k results and feed to cross encoder for fine-grain the results.
# That part is left for future work.
# Following is an example of evaluaiton of top result for the given query from val set with the help of the cross encoder.  

In [69]:
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
cross_scores = model.predict([[ doc, query] for query, doc in zip(val_evaluator.sentences1, val_evaluator.sentences2)])
# normalize to 0 to 1
rescaled_array = (cross_scores-np.min(cross_scores))/(np.max(cross_scores)-np.min(cross_scores))
rescaled_array = np.round(rescaled_array,2) 

In [70]:
errors = []
for i, score in enumerate(val_evaluator.scores):
    error = abs(score - rescaled_array[i] * 100)
    errors.append(error)
    #print(np_cos_scores, rescaled_array, score, error)
print("Val accuracy with cross-encoders : ", 100 - sum(errors)/len(errors))

Val accuracy with cross-encoders :  98.7500003632158


In [74]:
trained_sbert_model = SentenceTransformer("msmarco-distilbert-base-v4")

In [75]:
def clean_text(text):
    import re
    text = str(text).lower()
    text = re.sub('[-]+',' ', text)
    text = re.sub('[^A-Za-z0-9\[\]\s]+', '', text)   
    #text = text.strip()
    return text

In [76]:
synthetic_data_df = pd.read_csv("../data/processed/synthetic_pair/all_synth_data.csv")

In [84]:
doc_sents = synthetic_data_df["offer_ext"].unique().tolist()

In [96]:

doc_embd = trained_sbert_model.encode(doc_sents, show_progress_bar=True)


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

In [97]:
type(doc_embd)

numpy.ndarray

In [79]:
query = "sams club"
top_k = 1  
q_embeddings =  trained_sbert_model.encode([clean_text(query)], show_progress_bar=True)

hits = util.semantic_search(q_embeddings, doc_embd, top_k=top_k)
hits = hits[0] 

for hit in hits:
    print("\t{:.3f}\t{}".format(hit['score'], doc_sents[hit['corpus_id']]))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

	0.439	[OFF] tyson products select varieties spend 20 at sams club [BN] ball park frank [CN] packaged meat [CN] frozen beef [PCN] pantry [PCN] frozen meat


In [80]:
hits = util.semantic_search(q_embeddings, doc_embd, top_k=10)
hits = hits[0] 

for hit in hits:
    print("\t{:.3f}\t{}".format(hit['score'], doc_sents[hit['corpus_id']]))

	0.439	[OFF] tyson products select varieties spend 20 at sams club [BN] ball park frank [CN] packaged meat [CN] frozen beef [PCN] pantry [PCN] frozen meat
	0.438	[OFF] spend 50 on a full priced new club membership [BN] sams club
	0.405	[OFF] georges farmers market chicken wings at sams club [BN] georges farmers market
	0.400	[OFF] spend 110 on a full priced new plus membership and receive an additional 10000 points [BN] sams club
	0.199	[OFF] order online at zaxbyscom [BN] zaxbys
	0.193	[OFF] queen v the vip  the wingwoman soft touch silicone vibrator at walmart [BN] queen v
	0.179	[OFF] sign up for the club card or the club card full priced membership new members only [BN] bjs wholesale [CN] cooking  baking [PCN] pantry
	0.116	[OFF] hagen dazs 28 ounce at grocery stores [BN] haagen dazs [CN] frozen desserts
	0.116	[OFF] butterball select varieties spend 10 at king soopers [BN] butterball [CN] nut butters  jam [CN] frozen turkey [PCN] pantry [PCN] frozen meat
	0.108	[OFF] coors light m

In [115]:
## The model is just trained on a smaller data size and was not curated on official score labels.
## thus it may produce same hit score for all the queries, but internally, it will try to find most plausible hit irrespective of the score.

In [109]:
import pickle
with open('../data/processed/embeddings/msmacro_sent_embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_offers = stored_data['offers']
    stored_sentences = stored_data['offer_processed']
    stored_embeddings = stored_data['embeddings']

In [140]:

query = "sams club"
top_k = 1  
q_embeddings =  trained_sbert_model.encode([clean_text(query)], show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [162]:
hits = util.semantic_search(q_embeddings, stored_embeddings, top_k=10)
hits = hits[0] 

offers = []
scores = []
for hit in hits:
    offer = stored_offers[hit['corpus_id']]
    score = hit['score']
    offers.append(offer)    
    scores.append(score)



In [163]:
print(offers)
print(cross_list)
print(scores)

["Tyson Products, select varieties, spend $20 at Sam's Club", "Tyson Products, select varieties, spend $20 at Sam's Club", "George's Farmers Market Chicken Wings, at Sam's Club", 'Spend $50 on a Full-Priced new Club Membership', 'Spend $110 on a Full-Priced new Plus Membership and receive an ADDITIONAL 10,000 points', 'Sign up for The Club Card or The Club+ Card full-priced membership* (New Members Only)', 'Order online at Zaxbys.com', 'QUEEN V® The VIP & The Wingwoman Soft Touch Silicone Vibrator at Walmart', "Wings OR Cheesy Breadsticks at Casey's", "Wings OR Cheesy Breadsticks at Casey's"]
[['[OFF] tyson products select varieties spend 20 at sams club [RN] sams club [BN] ball park frank [CN] packaged meat [PCN] pantry [RCN] cooking  baking packaged seafood nut butters  jam cereal granola  toaster pastries condiments packaged meals  sides soup  broth bread sauces  marinades packaged fruit  applesauce pickled goods dressings packaged meat pasta  noodles packaged vegetables rice  grain

In [143]:
cross_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


In [144]:
remove_idx = set()
for i in range(len(offers)):
    for j in range(i+1, len(offers)):
        pair = [offers[i],offers[j]]
        
        cross_score = np.round(torch.sigmoid(torch.from_numpy(np.array(cross_model.predict(pair)))),3)
        #print(str(i), str(j), pair, cross_score)
        if cross_score == 1:
            #print("to be romoved")
            remove_idx.add(j)
print(remove_idx)

{1, 9}


In [152]:
cross_list = []
filtered_hits = []
for idx in range(len(hits)):
    if idx not in remove_idx:
        cross_list.append([stored_sentences[hits[idx]['corpus_id']], query])
        filtered_hits.append(hits[idx])


In [187]:
ce_scores = cross_model.predict(cross_list)
for idx in range(len(filtered_hits)):
    filtered_hits[idx]['cross-encoder_score'] = np.round(torch.sigmoid(torch.from_numpy(np.array(ce_scores[idx]))).numpy().item(),4) * 100

In [188]:
filtered_hits

[{'corpus_id': 0, 'score': 0.4296583831310272, 'cross-encoder_score': 1.02},
 {'corpus_id': 616, 'score': 0.3482372760772705, 'cross-encoder_score': 23.11},
 {'corpus_id': 103, 'score': 0.5765717029571533, 'cross-encoder_score': 1.71},
 {'corpus_id': 232,
  'score': 0.4511070251464844,
  'cross-encoder_score': 1.0999999999999999},
 {'corpus_id': 182, 'score': 0.2132781594991684, 'cross-encoder_score': 0.04},
 {'corpus_id': 318, 'score': 0.15760967135429382, 'cross-encoder_score': 0.0},
 {'corpus_id': 73, 'score': 0.18093356490135193, 'cross-encoder_score': 0.02},
 {'corpus_id': 152, 'score': 0.17803266644477844, 'cross-encoder_score': 0.02}]

In [175]:
#Sort list by CrossEncoder scores
filtered_hits = sorted(filtered_hits, key=lambda x: x['cross-encoder_score'], reverse=True)
print("\nRe-ranking with Cross-Encoder took seconds")
print("Top 5 hits with CrossEncoder:")
offers = []
scores = []
for hit in filtered_hits:
    print("\t{:.3f}\t{}".format(hit['cross-encoder_score'], stored_offers[hit['corpus_id']]))
    offers.append(stored_offers[hit['corpus_id']])
    scores.append(hit['cross-encoder_score'])


Re-ranking with Cross-Encoder took seconds
Top 5 hits with CrossEncoder:
	23.110	Spend $50 on a Full-Priced new Club Membership
	1.710	Spend $110 on a Full-Priced new Plus Membership and receive an ADDITIONAL 10,000 points
	1.100	Tyson Products, select varieties, spend $20 at Sam's Club
	1.020	George's Farmers Market Chicken Wings, at Sam's Club
	0.040	Sign up for The Club Card or The Club+ Card full-priced membership* (New Members Only)
	0.020	Wings OR Cheesy Breadsticks at Casey's
	0.020	Order online at Zaxbys.com
	0.000	QUEEN V® The VIP & The Wingwoman Soft Touch Silicone Vibrator at Walmart


In [176]:
output_df = pd.DataFrame({
            "Offer" : offers,
            "Relevance" : scores
        })
output_df.head()

Unnamed: 0,Offer,Relevance
0,Spend $50 on a Full-Priced new Club Membership,23.109999
1,Spend $110 on a Full-Priced new Plus Membershi...,1.71
2,"Tyson Products, select varieties, spend $20 at...",1.1
3,"George's Farmers Market Chicken Wings, at Sam'...",1.02
4,Sign up for The Club Card or The Club+ Card fu...,0.04
