In [59]:
import re
from urllib.request import urlopen

import numpy as np

# Plain-text Project Gutenberg book used as the document collection.
BOOK_URL = 'https://www.gutenberg.org/cache/epub/10834/pg10834.txt'  # insects

PARAGRAPH_SPLITTER = '\r\n\r\n'  # a blank line (CRLF line endings) separates paragraphs
MIN_DOCUMENT_LENGTH = 25  # raw paragraphs must be longer than this to be kept

# Use the response as a context manager so the connection is closed once read.
with urlopen(BOOK_URL) as response:
    text = response.read().decode()

# Keep only the span between the Gutenberg start and end markers
# (raises ValueError if either marker is missing).
text = text[text.index("***START OF THE PROJECT GUTENBERG") :text.index("***END OF THE PROJECT GUTENBERG")]


def preprocess(s):
    """Trim leading and trailing whitespace from one paragraph."""
    return s.strip()


# The length filter is applied to the raw paragraph, before stripping.
raw_paragraphs = text.split(PARAGRAPH_SPLITTER)
documents = [preprocess(p) for p in raw_paragraphs if len(p) > MIN_DOCUMENT_LENGTH]

documents = np.array(documents)

print(f'There are {len(documents)} documents')


There are 50 documents


In [60]:
from random import sample, seed

seed(42)  # fix the RNG so the same paragraph is picked on every run

# Show one randomly chosen paragraph as a sanity check of the split.
# sorted() returns a plain list built from the numpy array of documents.
sample(sorted(documents), 1)


["This cut shews the appearance of the worm, which at first is very small\r\nand black. Its food is the leaves of the white mulberry: as it grows in\r\nsize, at four different periods, it apparently sickens, and changes its\r\nskin, and finally, when full grown, it spins a ball of silk, called a\r\ncone, or cocoon, the thread of which is about three hundred yards long:\r\nin the centre of this ball the worm entombs itself, and experiences a\r\nchange to a state called an aurelia, or chrysallis, as seen below the\r\nball: from this aurelia, the moth that lays the eggs is hatched, and\r\nthus goes on the round of this animal's changes, or transmigrations."]

In [61]:
from sentence_transformers import SentenceTransformer, util

# Load a bi-encoder that was pre-trained on an asymmetric semantic search task
sbert_model = SentenceTransformer('msmarco-distilbert-base-v4')

# Embed every paragraph once up front; queries are compared against these vectors later
document_embeddings = sbert_model.encode(documents)

In [62]:
# Display the model's modules (transformer + pooling configuration)
sbert_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [73]:
# First five components of the embedding of 'hi'; the same probe is repeated
# on the fine-tuned model later to show that the embeddings changed.
sbert_model.encode(['hi'])[0][:5]

array([ 0.40406528, -0.16512644, -0.0930415 , -0.1626442 , -0.6941118 ],
      dtype=float32)

In [64]:
QUESTION = 'How many horns does a flea have?'  # a natural language query

query_embedding = sbert_model.encode(QUESTION)  # embed the query with the same model as the documents

# Cosine similarity between the query and every document embedding.
# Despite the name, this holds the score of ALL documents; ranking happens in the next cells.
top_scores = util.cos_sim(query_embedding, document_embeddings)


In [65]:
# Inspect all similarities in ascending order together with the index of the document each belongs to
top_scores.sort()

torch.return_types.sort(
values=tensor([[-0.1046, -0.1046, -0.1046, -0.1046, -0.1046, -0.0934, -0.0648, -0.0596,
         -0.0552, -0.0471, -0.0456, -0.0452, -0.0336, -0.0274, -0.0118, -0.0095,
         -0.0078, -0.0030,  0.0079,  0.0102,  0.0112,  0.0112,  0.0134,  0.0149,
          0.0199,  0.0208,  0.0208,  0.0243,  0.0305,  0.0335,  0.0395,  0.0400,
          0.0468,  0.0474,  0.0489,  0.0566,  0.0592,  0.0613,  0.0665,  0.0675,
          0.0969,  0.1088,  0.1099,  0.1182,  0.1272,  0.1417,  0.1619,  0.1848,
          0.2479,  0.4899]]),
indices=tensor([[ 6, 10, 15,  8, 12, 13, 44, 47, 46, 48, 43,  2,  3, 22, 14, 21, 37, 19,
         18,  1,  9, 11,  4, 49, 27,  7,  5, 25, 35, 36,  0, 39, 45, 38, 23, 30,
         20, 16, 17, 42, 40, 29, 24, 41, 34, 32, 28, 33, 31, 26]]))

In [66]:
# Sort once and reuse the result for both the indices and the scores.
ranked_scores = top_scores.sort()  # ascending, so the best matches come last

# Keep the three highest-scoring paragraphs, best match first.
top_documents = documents[ranked_scores.indices[0][-3:]][::-1]
top_cosine_sim = list(ranked_scores.values[0][-3:])[::-1]

for i, (cos_sim, top_document) in enumerate(zip(top_cosine_sim, top_documents)):
    print(f'Top Document {i + 1} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
    print('\n')

Top Document 1 Cos_Sim 0.490:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.


Top Document 2 Cos_Sim 0.248:

The Chego is a very small animal, about one fourth the size of a common
flea: it is very troublesome, in warm climates, to the poor blacks, such
as go barefoot, and the slovenly: it penetrates the skin, under which it
lays a bunch of eggs, which swell to the bigness of a small pea.


Top Document 3 Cos_Sim 0.185:

This is one of the largest of the insect tribe. It is met with in
different countries, and of various sizes, from two or three inches to
nearly a foot in length: it som

In [67]:
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

# Load tokenizer and model from the SAME checkpoint so that vocabulary and casing
# are guaranteed to match (the tokenizer previously came from 'bert-base-cased').
QA_CHECKPOINT = 'distilbert-base-cased-distilled-squad'

bert_tokenizer = DistilBertTokenizer.from_pretrained(QA_CHECKPOINT)  # distilbert doesn't have token type IDs
qa_bert = DistilBertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)


# Using a pretrained QA model to extract answers from the retrieved documents

In [68]:
from transformers import pipeline

# Off-the-shelf extractive QA pipeline: DistilBERT distilled on SQuAD, with its matching uncased tokenizer
qa = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='distilbert-base-uncased')


In [69]:
# The best-matching paragraph; this is the context handed to the QA models below
top_documents[0]

'When examined by a microscope, the flea is a pleasant object. The body\r\nis curiously adorned with a suit of polished armour, neatly jointed, and\r\nbeset with a great number of sharp pins almost like the quills of a\r\nporcupine: it has a small head, large eyes, two horns, or feelers, which\r\nproceed from the head, and four long legs from the breast; they are very\r\nhairy and long, and have several joints, which fold as it were one\r\nwithin another.'

In [70]:
# Ask the original question against the best-matching paragraph
qa(question=QUESTION, context=top_documents[0])

{'score': 0.6974557638168335, 'start': 259, 'end': 262, 'answer': 'two'}

# Using our fine-tuned model from the last use-case

In [153]:
from transformers import BertForQuestionAnswering

# Load the QA model weights saved under ./qa/results
qa_bert_finetuned = BertForQuestionAnswering.from_pretrained('./qa/results')

# Wrap the model in a question-answering pipeline.
# NOTE(review): the tokenizer is loaded from 'bert-large-uncased'; confirm that this is
# the checkpoint the weights in ./qa/results were fine-tuned from.
finetuned_qa = pipeline('question-answering', model=qa_bert_finetuned, tokenizer='bert-large-uncased')


In [154]:
# Same question and context as before, now answered by the fine-tuned QA pipeline
finetuned_qa(question=QUESTION, context=top_documents[0])

{'score': 0.01910480484366417, 'start': 259, 'end': 262, 'answer': 'two'}

In [18]:
# https://www.sbert.net/docs/pretrained_models.html for more fine-tuned models

In [20]:
from datasets import load_dataset

# Load the adversarial_qa training split from the last use-case.
training_qa = load_dataset('adversarial_qa', 'adversarialQA', split='train')

good_training_data = []  # (question, its own context, 1.0)
bad_training_data = []   # (question, context of the preceding example, 0.0)

last_example = None
for example in training_qa:
    question, context = example['question'], example['context']
    if last_example:
        previous_context = last_example['context']
        if context != previous_context:
            # A question paired with the preceding, different passage is a negative pair.
            bad_training_data.append((question, previous_context, 0.0))
    # A question paired with its own passage is a positive pair.
    good_training_data.append((question, context, 1.0))
    last_example = example

Reusing dataset adversarial_qa (/Users/sinanozdemir/.cache/huggingface/datasets/adversarial_qa/adversarialQA/1.0.0/92356be07b087c5c6a543138757828b8d61ca34de8a87807d40bbc0e6c68f04b)


In [21]:
# Number of positive pairs vs. negative pairs that were built
len(good_training_data), len(bad_training_data)

(30000, 2647)

In [22]:
# https://www.sbert.net/docs/training/overview.html for information on training

from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader
from random import sample, seed, shuffle

seed(42)  # make the sampling and shuffling below reproducible

# Draw 500 positive and 500 negative pairs, then mix them.
positive_pairs = sample(good_training_data, 500)
negative_pairs = sample(bad_training_data, 500)
sampled_training_data = positive_pairs + negative_pairs

shuffle(sampled_training_data)

# The first 800 shuffled pairs are used for training, the rest for evaluation.
training_split = sampled_training_data[:800]
evaluation_split = sampled_training_data[800:]

# Training examples: (question, context) texts with a similarity label
train_examples = []
for question, context, label in training_split:
    train_examples.append(InputExample(texts=(question, context), label=label))

# Train dataset loader and the loss that is optimised
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(sbert_model)

# Evaluation data: parallel sequences of questions, contexts and labels
sentences1, sentences2, scores = zip(*evaluation_split)

evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)


In [24]:
# Fine-tune the retrieval model on the question/context pairs
sbert_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    evaluation_steps=5,
    epochs=2,
    warmup_steps=50,
    output_path='ir/results',
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

In [156]:
# Load the fine-tuned retrieval model from the directory that fit() wrote to
finetuned_sbert_model = SentenceTransformer('ir/results')

finetuned_sbert_model.encode(['hi'])[0][:5]  # differs from the earlier 'hi' embedding, as expected after fine-tuning


array([ 0.2940645 , -0.15025584, -0.1537622 , -0.15683304, -0.74979   ],
      dtype=float32)

In [74]:
# Re-encode the documents and run the same question as before.
# `sbert_model` is the object that fit() was called on above.
document_embeddings = sbert_model.encode(documents)

query_embedding = sbert_model.encode(QUESTION)  # embed the query into the same vector space

top_scores = util.cos_sim(query_embedding, document_embeddings)  # cosine similarity against every document

# Sort once and reuse the result for both the indices and the scores.
ranked_scores = top_scores.sort()  # ascending, so the best matches come last

# Keep the three highest-scoring paragraphs, best match first.
top_documents = documents[ranked_scores.indices[0][-3:]][::-1]
top_cosine_sim = list(ranked_scores.values[0][-3:])[::-1]

for i, (cos_sim, top_document) in enumerate(zip(top_cosine_sim, top_documents)):
    print(f'Top Document {i + 1} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
    print('\n')

Top Document 1 Cos_Sim 0.600:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.


Top Document 2 Cos_Sim 0.393:

The Chego is a very small animal, about one fourth the size of a common
flea: it is very troublesome, in warm climates, to the poor blacks, such
as go barefoot, and the slovenly: it penetrates the skin, under which it
lays a bunch of eggs, which swell to the bigness of a small pea.


Top Document 3 Cos_Sim 0.348:

In examining the louse with a microscope, its external deformity strikes
us with disgust. It has six feet, two eyes, and a sort of sting,
proboscis, or sucker, with whi

In [93]:
def gutenberg_to_documents(url, sbert_model, min_length=25):
    """Download a Project Gutenberg text, split it into paragraphs and embed them.

    The Gutenberg header and licence footer are trimmed when the
    ``*** START OF ...`` / ``*** END OF ...`` marker lines can be found. A marker
    that is missing leaves that side of the text untrimmed instead of raising.

    Args:
        url: Location of the plain-text book (anything ``urlopen`` accepts).
        sbert_model: Object with an ``encode(documents)`` method, e.g. a
            ``SentenceTransformer``.
        min_length: A raw (unstripped) paragraph is kept only when it is longer
            than this many characters.

    Returns:
        A ``(documents, embeddings)`` tuple: a numpy array of stripped paragraph
        strings and whatever ``sbert_model.encode`` returns for that array.
    """
    PARAGRAPH_SPLITTER = '\r\n\r\n'

    # Close the connection as soon as the body has been read.
    with urlopen(url) as response:
        text = response.read().decode()

    # The markers appear both as "***START OF" and "*** START OF" (and with
    # THE/THIS), so match them tolerantly instead of by exact substring.
    start_marker = re.search(r'\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG', text)
    end_marker = re.search(r'\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG', text)
    start = start_marker.start() if start_marker else 0
    end = end_marker.start() if end_marker else len(text)
    text = text[start:end]

    # The length filter looks at the raw paragraph; stripping happens afterwards.
    paragraphs = [p.strip() for p in text.split(PARAGRAPH_SPLITTER) if len(p) > min_length]

    documents = np.array(paragraphs)

    print(f'There are {len(documents)} documents')

    return documents, sbert_model.encode(documents)


In [164]:
def retrieve_relevant_documents(sbert_model, query, documents, embeddings, qa=None, top_k=3):
    """Print the documents most similar to ``query`` and, optionally, an extracted answer.

    Args:
        sbert_model: Model whose ``encode`` method embeds the query.
        query: Natural-language question or search string.
        documents: Indexable collection of document strings, aligned with
            ``embeddings``.
        embeddings: Document embeddings to compare the query against.
        qa: Optional callable invoked as ``qa(question=..., context=...)`` on
            each printed document; its result is printed as the answer.
        top_k: Number of best-scoring documents to print.

    Returns:
        None. Results are printed, best match first.
    """
    query_embedding = sbert_model.encode(query)  # embed the query into a vector space

    # Cosine similarity between the query and every document embedding.
    scores = util.cos_sim(query_embedding, embeddings)

    # Sort once (ascending) and read the result back to front, best match first.
    ranked = scores.sort()
    best_indices = ranked.indices[0].tolist()[::-1][:top_k]
    best_scores = ranked.values[0].tolist()[::-1][:top_k]

    for rank, (cos_sim, index) in enumerate(zip(best_scores, best_indices), start=1):
        # Plain integer indexing works for numpy arrays and lists alike.
        top_document = documents[index]
        print(f'Top Document {rank} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
        if qa:
            answer = qa(question=query, context=top_document)
            print(f'\nAnswer: {answer}\n')
        print('\n')

In [157]:
# Build a second, larger document collection from another Gutenberg text,
# embedded with the fine-tuned retrieval model
banks_to_bassoon_documents, banks_to_bassoon_embeddings = gutenberg_to_documents(
    'https://www.gutenberg.org/cache/epub/27480/pg27480.txt', finetuned_sbert_model
)

There are 1599 documents


In [165]:
# Retrieve with the fine-tuned model, then let the QA pipeline extract an answer from each hit
retrieve_relevant_documents(finetuned_sbert_model,
    'What is a banshee?', banks_to_bassoon_documents, banks_to_bassoon_embeddings,
    qa=qa
)

Top Document 1 Cos_Sim 0.761:

BANSHEE (Irish _bean sidhe_; Gaelic _ban sith_, "woman of the fairies"), a
supernatural being in Irish and general Celtic folklore, whose mournful
screaming, or "keening," at night is held to foretell the death of some
member of the household visited. In Ireland legends of the banshee belong
more particularly to certain families in whose records periodic visits from
the spirit are chronicled. A like ghostly informer figures in Brittany
folklore. The Irish banshee is held to be the distinction only of families
of pure Milesian descent. The Welsh have the banshee under the name _gwrach
y Rhibyn_ (witch of Rhibyn). Sir Walter Scott mentions a belief in the
banshee as existing in the highlands of Scotland (_Demonology and
Witchcraft_, p. 351). A Welsh death-portent often confused with the gwrach
y Rhibyn and banshee is the _cyhyraeth_, the groaning spirit.

Answer: {'score': 0.1614086925983429, 'start': 73, 'end': 94, 'answer': 'a\r\nsupernatural being'}



T

In [166]:
# A second query against the same collection, again answered by the QA pipeline
retrieve_relevant_documents(finetuned_sbert_model,
    'Where do you usually play amateur base ball?', banks_to_bassoon_documents, banks_to_bassoon_embeddings,
    qa=qa
)

Top Document 1 Cos_Sim 0.563:

Amateur base-ball, in its organized phase, is played mostly by school and
university clubs as well as those of athletic associations. The first
college league was formed in 1879 and comprised Harvard, Princeton,
Amherst, Brown and Dartmouth, Yale joining a year later. The Eastern
College League, with Columbia, Harvard, Princeton and Yale, followed in
1887. This was afterwards dissolved and at present the most important
universities of the eastern states are members of no league, although such
organizations exist in New England and different parts of the west and
south. Amateur base-ball has progressed along the same lines as
professional, although the college playing rules formerly differed in
certain minor points from those of the professional leagues.

Answer: {'score': 0.48191583156585693, 'start': 63, 'end': 91, 'answer': 'school and\r\nuniversity clubs'}



Top Document 2 Cos_Sim 0.498:

BASE-BALL (so-called from the bases and ball used), the nationa


Answer: {'score': 0.11810270696878433, 'start': 1202, 'end': 1214, 'answer': 'in the field'}



