In [1]:
from urllib.request import urlopen
import numpy as np

# Download the plain-text e-book. The context manager closes the HTTP
# response as soon as the body has been read.
with urlopen('''https://www.gutenberg.org/cache/epub/10834/pg10834.txt''') as response:  # insects
    text = response.read().decode()

# Paragraph boundary: an empty line, written with CRLF line endings.
PARAGRAPH_SPLITTER = '\r\n\r\n'

# Keep only the span between the Project Gutenberg start and end markers.
# The slice begins AT the start marker, so the marker line itself is still
# part of `text`. str.index raises ValueError if either marker is missing.
text = text[text.index("***START OF THE PROJECT GUTENBERG") :text.index("***END OF THE PROJECT GUTENBERG")]

documents = text.split(PARAGRAPH_SPLITTER)

# Drop fragments of 25 characters or fewer.
documents = [paragraph for paragraph in documents if len(paragraph) > 25]

def preprocess(s):
    """Return ``s`` with leading and trailing whitespace removed."""
    cleaned = s.strip()
    return cleaned

# Clean every paragraph and hold the result in a NumPy array; later cells
# select several documents at once by indexing it with a sequence of indices.
documents = np.array([preprocess(raw_paragraph) for raw_paragraph in documents])

print(f'There are {len(documents)} documents')


There are 50 documents


In [2]:
from random import sample, seed

# Fix the RNG seed so the same paragraph is drawn on every run.
seed(42)

# Draw one paragraph to eyeball; sorting first hands sample() a list in a fixed order.
sample(sorted(documents), 1)


["This cut shews the appearance of the worm, which at first is very small\r\nand black. Its food is the leaves of the white mulberry: as it grows in\r\nsize, at four different periods, it apparently sickens, and changes its\r\nskin, and finally, when full grown, it spins a ball of silk, called a\r\ncone, or cocoon, the thread of which is about three hundred yards long:\r\nin the centre of this ball the worm entombs itself, and experiences a\r\nchange to a state called an aurelia, or chrysallis, as seen below the\r\nball: from this aurelia, the moth that lays the eggs is hatched, and\r\nthus goes on the round of this animal's changes, or transmigrations."]

In [3]:
from sentence_transformers import SentenceTransformer, util

# Load a sentence-embedding model pre-trained on an asymmetric semantic search task
sbert_model = SentenceTransformer('msmarco-distilbert-base-v4')

# Embed every paragraph once up front by calling model.encode();
# queries are embedded later and compared against these embeddings.
document_embeddings = sbert_model.encode(documents)

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
# Dimensions of what sbert_model.encode(documents) returned
document_embeddings.shape

In [None]:
# Bare expression: the notebook displays the model object's repr
sbert_model

In [None]:
# First five components of the embedding for 'hi'; the same probe is repeated
# on the fine-tuned model further down so the two can be compared.
sbert_model.encode(['hi'])[0][:5]

In [None]:
# Score every document against one question: embed the question with the same
# model that embedded the documents, then compare the embeddings.
QUESTION = 'How many horns does a flea have?'  # a natural language query

query_embedding = sbert_model.encode(QUESTION)  # embed the query into a vector space

top_scores = util.cos_sim(query_embedding, document_embeddings)  # cosine similarity of the query against every document; later cells read row 0


In [None]:
# Display the sorted scores; the result carries .values and .indices, which the next cell reads
top_scores.sort()

In [None]:
# Sort the similarity scores once and reuse the result for both the indices
# and the values.
sorted_scores = top_scores.sort()

# The three best matches are the last three entries of the sorted row;
# [::-1] flips them so the best match comes first.
top_documents = documents[sorted_scores.indices[0][-3:]][::-1]
top_cosine_sim = list(sorted_scores.values[0][-3:])[::-1]

for rank, (cos_sim, top_document) in enumerate(zip(top_cosine_sim, top_documents), start=1):
    print(f'Top Document {rank} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
    print('\n')

In [None]:
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

# Load the tokenizer and the QA model from the same checkpoint so that the
# tokenizer class, vocabulary and casing are guaranteed to match the model.
QA_CHECKPOINT = 'distilbert-base-cased-distilled-squad'

bert_tokenizer = DistilBertTokenizer.from_pretrained(QA_CHECKPOINT)  # distilbert doesn't have token type IDs
qa_bert = DistilBertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)


# Using a pretrained QA model to extract answers from the retrieved documents

In [None]:
from transformers import pipeline

# Question-answering pipeline; judging by the checkpoint names, the model is a
# DistilBERT distilled on SQuAD, and model and tokenizer are both the uncased variant.
qa = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='distilbert-base-uncased')


In [None]:
# Show the best-scoring document; it is used as the QA context in the next cell
top_documents[0]

In [None]:
# Ask the pipeline to answer QUESTION using only the best-scoring document as context
qa(question=QUESTION, context=top_documents[0])

# Using our pretrained model from the last use-case

In [None]:
from transformers import BertForQuestionAnswering

# Load the QA model saved under ./qa/results (the directory must already exist locally)
qa_bert_finetuned = BertForQuestionAnswering.from_pretrained('./qa/results')

# `pipeline` comes from the import in an earlier cell.
# NOTE(review): the tokenizer is named separately from the model; this assumes
# the model in ./qa/results was fine-tuned from bert-large-uncased — confirm.
finetuned_qa = pipeline('question-answering', model=qa_bert_finetuned, tokenizer='bert-large-uncased')


In [None]:
# Same question and context as the stock pipeline above, now answered by the fine-tuned QA model
finetuned_qa(question=QUESTION, context=top_documents[0])

In [None]:
# https://www.sbert.net/docs/pretrained_models.html for more fine-tuned models

In [None]:
from datasets import load_dataset

# Load the training split of the adversarial_qa dataset from the last use-case
training_qa = load_dataset('adversarial_qa', 'adversarialQA', split='train')

# Each entry is a (question, context, label) triple.
good_training_data = []  # question with its own context, label 1.0
bad_training_data = []   # question with the previous example's context, label 0.0

last_example = None
for example in training_qa:
    question = example['question']
    context = example['context']

    # A negative pair is only created at the point where the context changes:
    # the new question is paired with the context it does NOT belong to.
    context_changed = bool(last_example) and context != last_example['context']
    if context_changed:
        bad_training_data.append((question, last_example['context'], 0.0))

    # Every question is a positive match for its own context.
    good_training_data.append((question, context, 1.0))
    last_example = example

In [None]:
# Number of positive and negative pairs collected
len(good_training_data), len(bad_training_data)

In [None]:
# Example positive pair: (question, its own context, 1.0)
good_training_data[0]

In [None]:
# Example negative pair: (question, context of the preceding example, 0.0)
bad_training_data[0]

In [None]:
# https://www.sbert.net/docs/training/overview.html for information on training

from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader
from random import sample, seed, shuffle

seed(42)  # make the sampling and shuffling below repeatable

# 500 positive and 500 negative pairs, mixed together.
positive_sample = sample(good_training_data, 500)
negative_sample = sample(bad_training_data, 500)
sampled_training_data = positive_sample + negative_sample

shuffle(sampled_training_data)

# First 800 shuffled pairs are used for training, the remaining ones for evaluation.
train_rows = sampled_training_data[:800]
eval_rows = sampled_training_data[800:]

# Training examples: the two texts plus the target similarity label
train_examples = [
    InputExample(texts=(question, context), label=label)
    for question, context, label in train_rows
]

# Train dataloader and the loss that is attached to the model being tuned
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(sbert_model)

# Evaluation data: transpose the (question, context, label) rows into three columns
sentences1, sentences2, scores = zip(*eval_rows)

evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)


In [None]:
# Tune the model
# Fine-tunes sbert_model on the training pairs with the cosine-similarity loss
# built above; output_path is the directory the next cell loads the model from.
# NOTE(review): evaluation_steps=5 presumably runs the evaluator every 5
# training steps — confirm against the sentence-transformers docs.
sbert_model.fit(
    train_objectives=[(train_dataloader, train_loss)], output_path='ir/results',
    epochs=2, warmup_steps=50, 
    evaluator=evaluator, evaluation_steps=5
)

In [None]:
# Load the fine-tuned IR model from the directory that fit() was given as output_path
finetuned_sbert_model = SentenceTransformer('ir/results')

finetuned_sbert_model.encode(['hi'])[0][:5]  # same probe as earlier; a different embedding than before is expected


In [None]:
# re-encode the documents and run the same question as before
document_embeddings = finetuned_sbert_model.encode(documents)

# Embed the query with the SAME model that embedded the documents: cosine
# similarity is only meaningful when both sides come from one embedding space.
query_embedding = finetuned_sbert_model.encode(QUESTION)

top_scores = util.cos_sim(query_embedding, document_embeddings)  # use cosine similarity to find the most relevant document

# Sort once and reuse the result for both the indices and the values.
sorted_scores = top_scores.sort()

# The three best matches are the last three entries of the sorted row;
# [::-1] flips them so the best match comes first.
top_documents = documents[sorted_scores.indices[0][-3:]][::-1]
top_cosine_sim = list(sorted_scores.values[0][-3:])[::-1]

for rank, (cos_sim, top_document) in enumerate(zip(top_cosine_sim, top_documents), start=1):
    print(f'Top Document {rank} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
    print('\n')

In [None]:
def gutenberg_to_documents(url, sbert_model):
    """Download a Project Gutenberg text, split it into paragraphs and embed them.

    Parameters
    ----------
    url : str
        Location of the plain-text book; anything ``urlopen`` accepts.
    sbert_model
        Object with an ``encode`` method; it is called once with the array
        of paragraphs.

    Returns
    -------
    tuple
        ``(documents, embeddings)`` where ``documents`` is a NumPy array of
        whitespace-stripped paragraphs and ``embeddings`` is whatever
        ``sbert_model.encode(documents)`` returns.
    """
    PARAGRAPH_SPLITTER = '\r\n\r\n'

    # The context manager closes the response once the body has been read.
    with urlopen(url) as response:
        text = response.read().decode()

    try:
        start = text.index("***START OF THE PROJECT GUTENBERG")
        end = text.index("***END OF THE PROJECT GUTENBERG")
    except ValueError:
        # Best effort: when either marker is missing, keep the whole text.
        # Only the "marker not found" error is tolerated; anything else
        # propagates instead of being silently swallowed.
        pass
    else:
        # The slice begins at the start marker, so the marker line itself
        # remains part of the text.
        text = text[start:end]

    # Drop fragments of 25 characters or fewer (measured before stripping),
    # then strip surrounding whitespace from the paragraphs that remain.
    documents = np.array([
        paragraph.strip()
        for paragraph in text.split(PARAGRAPH_SPLITTER)
        if len(paragraph) > 25
    ])

    print(f'There are {len(documents)} documents')

    return documents, sbert_model.encode(documents)


In [None]:
def retrieve_relevant_documents(sbert_model, query, documents, embeddings, qa=None, top_k=3):
    """Print the documents most similar to ``query``, best match first.

    Parameters
    ----------
    sbert_model
        Object whose ``encode`` method embeds the query.
    query : str
        Natural-language question.
    documents
        Array of documents; it is indexed with the sorted score indices, so
        it must support indexing by a sequence of positions.
    embeddings
        Document embeddings, passed to ``util.cos_sim`` together with the
        query embedding.
    qa : callable, optional
        When truthy, called as ``qa(question=query, context=document)`` for
        each printed document and the result is printed as the answer.
    top_k : int, optional
        Number of documents to print (default 3). Must be at least 1.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if top_k < 1:
        # A slice of [-0:] would select every document instead of none.
        raise ValueError('top_k must be at least 1')

    query_embedding = sbert_model.encode(query)  # embed the query into a vector space

    top_scores = util.cos_sim(query_embedding, embeddings)  # use cosine similarity to find the most relevant document

    # Sort once and reuse the result for both the indices and the values.
    sorted_scores = top_scores.sort()

    # The best matches are the last entries of the sorted row; [::-1] flips
    # them so the best match comes first.
    top_documents = documents[sorted_scores.indices[0][-top_k:]][::-1]
    top_cosine_sim = list(sorted_scores.values[0][-top_k:])[::-1]

    for rank, (cos_sim, top_document) in enumerate(zip(top_cosine_sim, top_documents), start=1):
        print(f'Top Document {rank} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
        if qa:
            answer = qa(question=query, context=top_document)
            print(f'\nAnswer: {answer}\n')
        print('\n')

In [None]:
# Fetch a second Gutenberg text and embed its paragraphs with the fine-tuned model
banks_to_bassoon_documents, banks_to_bassoon_embeddings = gutenberg_to_documents(
    'https://www.gutenberg.org/cache/epub/27480/pg27480.txt', finetuned_sbert_model
)

In [None]:
# Retrieve with the same model that embedded these documents; qa=qa also runs
# the stock question-answering pipeline on each printed document.
retrieve_relevant_documents(finetuned_sbert_model,
    'What is a banshee?', banks_to_bassoon_documents, banks_to_bassoon_embeddings,
    qa=qa
)

In [None]:
# Second query against the same documents and embeddings, again with the stock QA pipeline
retrieve_relevant_documents(finetuned_sbert_model,
    'Where do you usually play amateur base ball?', banks_to_bassoon_documents, banks_to_bassoon_embeddings,
    qa=qa
)