In [1]:
from sentence_transformers import SentenceTransformer, util

# a model pre-trained on an asymmetric semantic search task
sbert_model = SentenceTransformer('msmarco-distilbert-base-v4')

In [2]:
sbert_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [3]:
sbert_model.encode(['hi'])[0][:5]

array([ 0.40406635, -0.16512747, -0.09304258, -0.1626442 , -0.69411176],
      dtype=float32)

In [4]:
from transformers import pipeline

qa = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='distilbert-base-uncased')


In [5]:
import requests
from bs4 import BeautifulSoup
import numpy as np
from collections import Counter
import re

def url_to_documents(urls, sbert_model, MIN_LENGTH=100, splitter='auto'):
    if type(urls) == str:
        urls = [urls]
        
    documents = []
    
    for url in urls:
        response = requests.get(url).text

        text = BeautifulSoup(response, 'html.parser').get_text()
        
        if splitter == 'auto':
            most_common = Counter(list(filter(lambda x:x !=' ', re.findall('[\s]+', text)))).most_common(5)
            splitter = most_common[0][0]
            print(f'Using {most_common} as splitter')

        documents += list(filter(lambda x: len(x) > MIN_LENGTH, text.split(splitter)))

    def preprocess(s):
        return s.strip()

    documents =  list(map(preprocess, documents))

    documents = np.array(documents)

    print(f'There are {documents.shape[0]} documents')
    
    return documents, sbert_model.encode(documents)

def retrieve_relevant_documents(sbert_model, query, documents, embeddings, qa=None):
    query_embedding = sbert_model.encode(query)  # embed the query into a vector space

    top_scores = util.cos_sim(query_embedding, embeddings)  # use cosine similarity to find the most relevant document
    top_documents = documents[top_scores.sort().indices[0][-3:]][::-1]
    top_cosine_sim = list(top_scores.sort().values[0][-3:])[::-1]

    for i, (cos_sim, top_document) in enumerate(zip(top_cosine_sim, top_documents)):
        print(f'Top Document {i + 1} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
        if qa:
            answered = qa(question=query, context=top_document)
            answer, score = answered['answer'].strip(), answered['score']
            print(f'\nAnswer: {answer} Score: {score:.2f}\n')
        print('\n')

In [6]:
cran_doc, cran_emb = url_to_documents(
    'https://www.cranburyhistory.org/about-cranbury-nj', sbert_model, splitter='auto'
)

retrieve_relevant_documents(sbert_model,
    'When was cranbury, New Jersey founded?', cran_doc, cran_emb,
    qa=qa
)

Using [('\n', 73), ('\n\xa0\n', 5), (' \n\xa0\n', 3), ('\xa0', 2), ('\xa0 ', 2)] as splitter
There are 36 documents
Top Document 1 Cos_Sim 0.740:

Cranbury is one of the oldest towns in New Jersey. Settlers came from France, England, Scotland, Holland, Germany, and Norway, perhaps as early as 1680. However, the first recorded evidence of buildings in Cranbury is March 1, 1698, on a deed of sale between Josia Prickett of Burlington and John Harrison for land "with all improvements." Around the same date, John Harrison also received a license to buy more land from the local Lenape Indians, a Delaware tribe.

Answer: 1680 Score: 0.25



Top Document 2 Cos_Sim 0.684:

The Second Presbyterian Church of Cranbury was founded in 1838. In 1935 the First and Second Church congregations were joined. The Second Church sanctuary was razed and a monument erected on the site. The sexton's house remains as a residence.

Answer: 1838 Score: 0.99



Top Document 3 Cos_Sim 0.682:

The Cranbury Public Lib

In [7]:
doc, emb = url_to_documents(
    'https://www.gutenberg.org/files/17087/17087-0.txt', sbert_model, splitter='\r\n\r\n'
)

There are 424 documents


In [8]:
retrieve_relevant_documents(sbert_model,'What is the purpose of art?', doc, emb, qa=qa)

Top Document 1 Cos_Sim 0.644:

One function, then, of art is to feed and nurture the imagination and
the spirit, and thereby enhance and invigorate the whole of human life.
This is far removed from the view that the end of art is to give
pleasure. Art does usually cause pleasure, singular and intense, and to
that which causes such pleasure we give the name of Beauty. But to
produce and enjoy Beauty is not the function of art. Beauty--or rather,
the sensation of Beauty--is what the Greeks would call an _epigignomenon
ti telos_, words hard to translate, something between a by-product and a
supervening perfection, a thing like--as Aristotle[54] for once
beautifully says of pleasure--"the bloom of youth to a healthy young
body."

Answer: to feed and nurture the imagination and
the spirit Score: 0.26



Top Document 2 Cos_Sim 0.614:

But, though the artist's vision and emotion alike are modified,
purified, they are not devitalized. Far from that, by detachment from
action they 

In [9]:
from datasets import load_dataset

# load up the adversarial_qa dataset from the last use-case
training_qa = load_dataset('adversarial_qa', 'adversarialQA', split='train')

good_training_data = []
bad_training_data = []
    
last_example = None
for example in training_qa:
    if last_example and example['context'] != last_example['context']:
        bad_training_data.append((example['question'], last_example['context'], 0.0))  #  add bad examples
    # question, context, label is 1 for these should be matched together
    good_training_data.append((example['question'], example['context'], 1.0))
    last_example = example

Reusing dataset adversarial_qa (/Users/sinanozdemir/.cache/huggingface/datasets/adversarial_qa/adversarialQA/1.0.0/92356be07b087c5c6a543138757828b8d61ca34de8a87807d40bbc0e6c68f04b)


In [10]:
len(bad_training_data)

2647

In [11]:
len(good_training_data)

30000

In [16]:
# https://www.sbert.net/docs/training/overview.html for information on training

from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader
from random import sample, seed, shuffle

seed(42)  # seed our upcoming sample

sampled_training_data = sample(good_training_data, 2500) + sample(bad_training_data, 2500)

shuffle(sampled_training_data)

eighty_index = int(.8 * len(sampled_training_data))

#Define the training examples
train_examples = [InputExample(texts=t[:2], label=t[2]) for t in sampled_training_data[:eighty_index]]


#Define your train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(sbert_model)

# Evaluation data
sentences1, sentences2, scores = zip(*sampled_training_data[eighty_index:])

evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
# evaluator = evaluation.BinaryClassificationEvaluator(sentences1, sentences2, scores)


In [17]:
sbert_model.evaluate(evaluation.BinaryClassificationEvaluator(sentences1, sentences2, scores))

0.7905592018146427

In [18]:
# Tune the model
sbert_model.fit(
    train_objectives=[(train_dataloader, train_loss)], output_path='ir/results',
    epochs=3, warmup_steps=len(train_examples) // 5, 
    evaluator=evaluator, evaluation_steps=5
)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/125 [00:00<?, ?it/s]

Iteration:   0%|          | 0/125 [00:00<?, ?it/s]

Iteration:   0%|          | 0/125 [00:00<?, ?it/s]

In [19]:
sbert_model.evaluate(evaluation.BinaryClassificationEvaluator(sentences1, sentences2, scores))

0.8149263639975689

In [20]:
# load fine-tuned IR model
finetuned_sbert_model = SentenceTransformer('ir/results')

finetuned_sbert_model.encode(['hi'])[0][:5]  # different embedding as before which is expected


array([ 0.3870701 , -0.17139037,  0.00948946, -0.13145085, -0.77764136],
      dtype=float32)

In [21]:
cran_doc, cran_emb = url_to_documents(
    'https://www.cranburyhistory.org/about-cranbury-nj', finetuned_sbert_model, splitter='\n\u200b\n'
)

retrieve_relevant_documents(sbert_model, 'When was cranbury, new jersey founded?', cran_doc, cran_emb, qa=qa)

There are 30 documents
Top Document 1 Cos_Sim 0.760:

Cranbury is one of the oldest towns in New Jersey. Settlers came from France, England, Scotland, Holland, Germany, and Norway, perhaps as early as 1680. However, the first recorded evidence of buildings in Cranbury is March 1, 1698, on a deed of sale between Josia Prickett of Burlington and John Harrison for land "with all improvements." Around the same date, John Harrison also received a license to buy more land from the local Lenape Indians, a Delaware tribe.

Answer: 1680 Score: 0.25



Top Document 2 Cos_Sim 0.752:

The Baptist Church in Cranbury was founded in 1745 with John Hight (Hightstown) as one of the 17 original members. The first meeting house was built in 1748 on property occupied later by the Spice Mill. The church was used for 40 years, then sold to Dr.Stites and moved. The congregation moved to Hightstown in 1785. The front part of the church grounds was sold and the burial grounds by 1882 were neglected and contain