In [1]:
!pip install transformers sentence-transformers datasets 



In [90]:
from transformers import pipeline, set_seed, AutoTokenizer, AutoModel
from datasets import load_dataset, load_from_disk, load_metric
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import DataLoader
import nltk
from nltk.tokenize import sent_tokenize
import tqdm

from sentence_transformers import SentenceTransformer, models, InputExample, losses, evaluation, util


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


In [30]:
# Load pretrained model 
model_ckpt = "msmarco-distilbert-base-v4"

sbert_model = SentenceTransformer(model_ckpt)

word_embedding_model = sbert_model._first_module()

tokens = ["[OFF] ", " [RN] ", " [CN] ", " [PCN] "]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))


Embedding(30526, 768)

In [33]:
sbert_model.tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


DistilBertTokenizerFast(name_or_path='C:\Users\shrin/.cache\torch\sentence_transformers\sentence-transformers_msmarco-distilbert-base-v4\', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30522: AddedToken("[OFF] ", rstrip=False

In [7]:
# Load dataset
dataset_id ="../data/processed/synthetic_pair/synthetic_search_data/" 
#dataset = load_dataset("shriadke/FetchSearch")  # shriadke/fetch-search-msmarco-distilbert-base-v4
dataset = load_from_disk(dataset_id)
dataset

DatasetDict({
    train: Dataset({
        features: ['offer_ext', 'search_query', 'score'],
        num_rows: 7314
    })
    val: Dataset({
        features: ['offer_ext', 'search_query', 'score'],
        num_rows: 2814
    })
    test: Dataset({
        features: ['offer_ext', 'search_query', 'score'],
        num_rows: 1125
    })
})

In [10]:
split_lengths = [len(dataset[split])for split in dataset]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset['train'].column_names}")
print("Offer Embeddings:")

print(dataset["test"][1]["offer_ext"])

print("Queries:")

print(dataset["test"][1]["search_query"])

Split lengths: [7314, 2814, 1125]
Features: ['offer_ext', 'search_query', 'score']
Offer Embeddings:
[OFF] thomas bagel thins buy 2 [BN] thomas [CN] bread [CN] frozen breakfast [CN] bakery [PCN] pantry [PCN] deli  bakery
Queries:
meals


In [79]:
train_examples = []
train_data = dataset['train']

n_examples = dataset['train'].num_rows# // 2# For agility we only 1/2 of our available data

for i in range(n_examples):
  example = train_data[i]
  train_examples.append(InputExample(texts=[example['offer_ext'], example['search_query']], label=float(example["score"])))
print(f"We have a {type(train_examples)} of length {len(train_examples)} containing {type(train_examples[0])}'s.")

We have a <class 'list'> of length 7314 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


In [80]:
val_examples = []
val_data = dataset['val']

n_examples = dataset['val'].num_rows# // 2# For agility we only 1/2 of our available data

for i in range(n_examples):
  example = val_data[i]
  val_examples.append(InputExample(texts=[example['offer_ext'], example['search_query']], label=float(example["score"])))
print(f"We have a {type(val_examples)} of length {len(val_examples)} containing {type(val_examples[0])}'s.")

We have a <class 'list'> of length 2814 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


In [81]:
test_examples = []
test_data = dataset['test']

n_examples = dataset['test'].num_rows# // 2# For agility we only 1/2 of our available data

for i in range(n_examples):
  example = test_data[i]
  test_examples.append(InputExample(texts=[example['offer_ext'], example['search_query']], label=float(example["score"])))
print(f"We have a {type(test_examples)} of length {len(test_examples)} containing {type(test_examples[0])}'s.")

We have a <class 'list'> of length 1125 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


In [82]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_examples, shuffle=True, batch_size=16)
test_dataloader = DataLoader(test_examples, shuffle=True, batch_size=16)

In [83]:
loss = losses.CosineSimilarityLoss(model=sbert_model)


In [84]:
# TRAINING ARGS
num_epochs = 2
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data
weight_decay = 0.01
output_path = "./models/"

In [85]:
val_evaluator = evaluation.EmbeddingSimilarityEvaluator([],[],[]).from_input_examples(examples=val_examples)
test_evaluator = evaluation.EmbeddingSimilarityEvaluator([],[],[]).from_input_examples(examples=test_examples)

In [86]:
# change test_dataloader while training
sbert_model.fit(train_objectives=[(test_dataloader, loss)], evaluator=val_evaluator, epochs = num_epochs, warmup_steps= warmup_steps, weight_decay=warmup_steps, output_path= output_path)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/71 [00:00<?, ?it/s]

Iteration:   0%|          | 0/71 [00:00<?, ?it/s]

In [116]:
!huggingface-cli login

^C


In [117]:
pip install huggingface_hub





In [120]:
from huggingface_hub import login

In [121]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.save_to_hub(
    "shriadke/fetch-search-msmarco-distilbert-base-v4", 
    organization="",
    train_datasets=["shriadke/FetchSearch"],
    exist_ok=True, 
    )

In [87]:
trained_sbert_model = SentenceTransformer(output_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [88]:
def clean_text(text):
    import re
    text = str(text).lower()
    text = re.sub('[-]+',' ', text)
    text = re.sub('[^A-Za-z0-9\[\]\s]+', '', text)   
    #text = text.strip()
    return text

In [93]:
synthetic_data_df = pd.read_csv("../data/processed/synthetic_pair/all_synth_data.csv")

In [94]:
doc_sents = synthetic_data_df["offer_ext"].unique().tolist()

In [96]:

doc_embd = trained_sbert_model.encode(doc_sents, show_progress_bar=True)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

In [114]:
query = "sams club"
top_k = 1  
q_embeddings =  trained_sbert_model.encode([clean_text(query)], show_progress_bar=True)

hits = util.semantic_search(q_embeddings, doc_embd, top_k=top_k)
hits = hits[0] 

for hit in hits:
    print("\t{:.3f}\t{}".format(hit['score'], doc_sents[hit['corpus_id']]))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

	1.000	[OFF] spend 50 on a full priced new club membership [BN] sams club


In [115]:
## The model is just trained on a smaller data size and was not curated on official score labels.
## thus it may produce same hit score for all the queries, but internally, it will try to find most plausible hit irrespective of the score.