In [1]:
from random import sample, seed, shuffle
from urllib.request import urlopen

import numpy as np
import requests
import torch
from bs4 import BeautifulSoup
from datasets import load_dataset
from sentence_transformers import InputExample, losses, evaluation
from sentence_transformers import SentenceTransformer, util
from torch.utils.data import DataLoader
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm
2024-02-12 22:49:19.320702: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-12 22:49:19.320761: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-12 22:49:19.320779: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-12 22:49:19.324964: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
PERSON = "Randi Eka Sanjaya"

# Note this is NOT an efficient way to search on google. This is done simply for education purposes
# - `params` lets requests URL-encode the query, so names containing `&`, `#`, `+`, etc. stay intact
# - `timeout` keeps the cell from hanging indefinitely if the request stalls
# - an explicit parser makes the extracted text independent of which optional parsers are installed
google_response = requests.get('https://www.google.com/search', params = {'q': PERSON}, timeout = 10)

# Keep only the first 1024 characters of the page's visible text as the QA context
google_html = BeautifulSoup(google_response.text, 'html.parser').get_text()[:1024]

# Extractive question-answering pipeline; model and tokenizer come from the same checkpoint
# NOTE(review): the question-answering pipeline documents `max_answer_len` / `max_seq_len`;
# confirm that `max_length` has the intended effect here
nlp = pipeline(
    'question-answering',
    model = 'deepset/roberta-base-squad2',
    tokenizer = 'deepset/roberta-base-squad2',
    max_length = 10
)

nlp(f'Who is {PERSON}?', google_html)

{'score': 0.3244840204715729,
 'start': 400,
 'end': 414,
 'answer': 'Data Scientist'}

In [3]:
# textbook about insects
# Use the response as a context manager so it is closed once the body has been read
with urlopen("https://www.gutenberg.org/cache/epub/10834/pg10834.txt") as response:
    text = response.read().decode()

# Paragraphs are separated by a blank line (CRLF line endings);
# only keep documents of more than 100 characters
documents = np.array([paragraph for paragraph in text.split('\r\n\r\n') if len(paragraph) > 100])

print(f'There are {len(documents)} documents/paragraphs')

There are 70 documents/paragraphs


In [4]:
# This model was pre-trained on an asymmetric semantic search task
# We use the Bi-Encoder to encode all the documents, so that we can use it with semantic search
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256 # Truncate long documents to 256 tokens

# Display the model's module layout
bi_encoder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
)

In [5]:
# Documents are encoded by calling model.encode().
# convert_to_tensor = True requests a tensor result; show_progress_bar = True displays batch progress
document_embeddings = bi_encoder.encode(documents, convert_to_tensor = True, show_progress_bar = True)

# Inspect the shape of the embedding tensor
document_embeddings.shape

Batches: 100%|██████████| 3/3 [00:01<00:00,  2.60it/s]


torch.Size([70, 768])

In [6]:
# Query reused by the retrieval and question-answering cells that follow
QUESTION = "How many horns does a flea have?" # a natural language query

In [7]:
# Encode the query using bi-encoder and find relevant documents
question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor = True)

# top_k is the number of documents to retrieve with the bi-encoder;
# [0] selects the hit list belonging to our single query
hits = util.semantic_search(question_embedding, document_embeddings, top_k = 3)[0]

# Each hit is a dict with a 'corpus_id' (index into documents) and a 'score'
hits

[{'corpus_id': 15, 'score': 0.48994922637939453},
 {'corpus_id': 20, 'score': 0.24793772399425507},
 {'corpus_id': 22, 'score': 0.1847882866859436}]

In [8]:
print(f'Question: {QUESTION}\n')

# Show every retrieved paragraph together with its 1-based rank and similarity score
for rank, match in enumerate(hits, start = 1):
    similarity = match["score"]
    paragraph = documents[match["corpus_id"]]
    print(f'Document {rank} Cos_Sim {similarity:.3f}:\n\n{paragraph}')
    print("\n")

Question: How many horns does a flea have?

Document 1 Cos_Sim 0.490:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.


Document 2 Cos_Sim 0.248:

The Chego is a very small animal, about one fourth the size of a common
flea: it is very troublesome, in warm climates, to the poor blacks, such
as go barefoot, and the slovenly: it penetrates the skin, under which it
lays a bunch of eggs, which swell to the bigness of a small pea.


Document 3 Cos_Sim 0.185:


This is one of the largest of the insect tribe. It is met with in
different countries, and of various sizes, from two or three inches to
nearly a 

In [9]:
# Run the question-answering pipeline on the first retrieved document;
# str() converts the numpy string element into a plain Python str
nlp(QUESTION, str(documents[hits[0]['corpus_id']]))

{'score': 0.8524739742279053, 'start': 259, 'end': 262, 'answer': 'two'}

In [10]:
# Retrieving a relevant document and then extracting the answer from it, as done above, is called an "Open Book Q/A" system

In [11]:
# Load up the adversarial_qa_dataset from the Q/A use-case
training_qa = load_dataset('adversarial_qa', 'adversarialQA', split = 'train')

good_training_data, bad_training_data = [], []

last_example = None
for example in training_qa:
    question, context = example['question'], example['context']
    context_changed = bool(last_example) and context != last_example['context']
    if context_changed:
        # Mismatched pair (label 0): the current question with the previous example's context
        bad_training_data.append((question, last_example['context'], 0.0))
    # Matched pair (label 1): a question together with its own context
    good_training_data.append((question, context, 1.0))
    last_example = example

In [12]:
# Number of matched pairs vs. number of mismatched pairs
len(good_training_data), len(bad_training_data)

(30000, 2647)

In [13]:
# Peek at the last matched example: (question, context, label)
good_training_data[-1]

('What letter designates what Ektachrome is designed for?',
 'Some high-speed black-and-white films, such as Ilford Delta 3200 and Kodak T-MAX P3200, are marketed with film speeds in excess of their true ISO speed as determined using the ISO testing method. For example, the Ilford product is actually an ISO 1000 film, according to its data sheet. The manufacturers do not indicate that the 3200 number is an ISO rating on their packaging. Kodak and Fuji also marketed E6 films designed for pushing (hence the "P" prefix), such as Ektachrome P800/1600 and Fujichrome P1600, both with a base speed of ISO 400.',
 1.0)

In [14]:
# Peek at the last mismatched example: a question paired with the previous example's context
bad_training_data[-1]

('What film beside Ektachrome and Fujichorme is designed for pushing?',
 'The Weston Cadet (model 852 introduced in 1949), Direct Reading (model 853 introduced 1954) and Master III (models 737 and S141.3 introduced in 1956) were the first in their line of exposure meters to switch and utilize the meanwhile established ASA scale instead. Other models used the original Weston scale up until ca. 1955. The company continued to publish Weston film ratings after 1955, but while their recommended values often differed slightly from the ASA film speeds found on film boxes, these newer Weston values were based on the ASA system and had to be converted for use with older Weston meters by subtracting 1/3 exposure stop as per Weston\'s recommendation. Vice versa, "old" Weston film speed ratings could be converted into "new" Westons and the ASA scale by adding the same amount, that is, a film rating of 100 Weston (up to 1955) corresponded with 125 ASA (as per ASA PH2.5-1954 and before). This conver

In [15]:
# https://www.sbert.net/docs/training/overview.html for more information on training

# Seed every RNG the training pipeline draws from:
# - Python's `random` drives sample() and shuffle() below
# - torch's global RNG drives the shuffling DataLoader used for fine-tuning
seed(42) # seed our upcoming sample
torch.manual_seed(42)

# Balanced subset: 500 matched pairs plus 500 mismatched pairs
sampled_training_data = sample(good_training_data, 500) + sample(bad_training_data, 500)

shuffle(sampled_training_data) # shuffle our data around

training_index = int(.8 * len(sampled_training_data)) # Get an 80/20 train/test split

In [16]:
# Define the training examples from the first 80% of the sampled pairs
train_examples = [
    InputExample(texts = (question, context), label = label)
    for question, context, label in sampled_training_data[:training_index]
]

# Inspect the fields of the first example
train_examples[0].__dict__

{'guid': '',
 'texts': ('What changed after the eigth century?',
  'There is disagreement about the origin of the term, but general consensus that "cardinalis" from the word cardo (meaning \'pivot\' or \'hinge\') was first used in late antiquity to designate a bishop or priest who was incorporated into a church for which he had not originally been ordained. In Rome the first persons to be called cardinals were the deacons of the seven regions of the city at the beginning of the 6th century, when the word began to mean “principal,” “eminent,” or "superior." The name was also given to the senior priest in each of the "title" churches (the parish churches) of Rome and to the bishops of the seven sees surrounding the city. By the 8th century the Roman cardinals constituted a privileged class among the Roman clergy. They took part in the administration of the church of Rome and in the papal liturgy. By decree of a synod of 769, only a cardinal was eligible to become pope. In 1059, during th

In [17]:
# Define the train dataset, a dataloader and the train loss
# A data loader is the object that specifically shuffles/grabs batches of data from a Dataset
# We don't usually have to explicitly create one using the Trainer because it has a default loader built in
# collate_fn is the model's own smart_batching_collate, which assembles each batch from the InputExamples
train_dataloader = DataLoader(train_examples, shuffle = True, batch_size = 32, collate_fn = bi_encoder.smart_batching_collate)

# The loss is constructed around the bi-encoder that will be fine-tuned
train_loss = losses.CosineSimilarityLoss(bi_encoder)

In [18]:
# A batch unpacks into a pair of tokenized inputs (questions, contexts) plus the labels
(question_batch, context_batch), labels = next(iter(train_dataloader)) # get a sample batch of data

# Compare the token-id tensor shapes of questions and contexts, and the shape of the labels
question_batch['input_ids'].shape, context_batch['input_ids'].shape, labels.shape

(torch.Size([32, 36]), torch.Size([32, 256]), torch.Size([32]))

In [19]:
# Evaluation data: sentences1 and sentences2 are tuples of questions and contexts respectively and scores are 0 or 1
# These come from the held-out part of the sample, i.e. everything from training_index onward
sentences1, sentences2, scores = zip(*sampled_training_data[training_index:])

# Evaluator will evaluate embedding closeness
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

In [20]:
# Baseline score on the held-out pairs, measured before fit() is called
bi_encoder.evaluate(evaluator) # Initial evaluation (higher embedding similarity is better)

0.5044913287672261

In [21]:
# Fine-tune the model using the fit method
# - train_objectives pairs the dataloader with the loss computed on its batches
# - output_path is the directory the model is written to (it is loaded from there in a later cell)
# - evaluator is the held-out-pair evaluator defined earlier
bi_encoder.fit(
    train_objectives = [(train_dataloader, train_loss)],
    output_path = 'ir/results',
    epochs = 2,
    evaluator = evaluator
)

Iteration: 100%|██████████| 25/25 [00:10<00:00,  2.39it/s]
Iteration: 100%|██████████| 25/25 [00:10<00:00,  2.34it/s]
Epoch: 100%|██████████| 2/2 [00:23<00:00, 11.92s/it]


In [22]:
# Same evaluator and held-out pairs as the initial evaluation, so the two scores can be compared directly
bi_encoder.evaluate(evaluator) # final evaluation (higher embedding similarity is better)
# Not a huge jump in performance with 2 epochs. We could try more data or more epochs

0.5050109764878448

In [23]:
# load fine-tuned IR model from the directory passed to fit() as output_path
finetuned_bi_encoder = SentenceTransformer('ir/results')

In [24]:
# Re-encode the corpus and the query with the fine-tuned model.
# Note: this reassigns document_embeddings, question_embedding and hits from the earlier cells.
document_embeddings = finetuned_bi_encoder.encode(documents, convert_to_tensor = True, show_progress_bar = True)
question_embedding = finetuned_bi_encoder.encode(QUESTION, convert_to_tensor = True)

# Ask for the 3 best-scoring documents and take the hit list of our single query
search_results = util.semantic_search(question_embedding, document_embeddings, top_k = 3)
hits = search_results[0]

print(f'Question: {QUESTION}\n')

# Show every retrieved paragraph together with its 1-based rank and similarity score
for rank, match in enumerate(hits, start = 1):
    similarity = match["score"]
    paragraph = documents[match["corpus_id"]]
    print(f'Document {rank} Cos_Sim {similarity:.3f}:\n\n{paragraph}')
    print("\n")


Batches: 100%|██████████| 3/3 [00:00<00:00,  6.88it/s]

Question: How many horns does a flea have?

Document 1 Cos_Sim 0.492:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.


Document 2 Cos_Sim 0.250:

The Chego is a very small animal, about one fourth the size of a common
flea: it is very troublesome, in warm climates, to the poor blacks, such
as go barefoot, and the slovenly: it penetrates the skin, under which it
lays a bunch of eggs, which swell to the bigness of a small pea.


Document 3 Cos_Sim 0.187:


This is one of the largest of the insect tribe. It is met with in
different countries, and of various sizes, from two or three inches to
nearly a 




In [25]:
def gutenberg_to_documents(guteberg_url, bi_encoder, min_length = 100):
    """Download a plain-text book and split it into embedded paragraphs.

    Args:
        guteberg_url: URL of the text file to fetch (the parameter name is
            kept as-is so existing keyword callers keep working).
        bi_encoder: object exposing ``encode(documents)``; it is called once
            on the array of kept paragraphs.
        min_length: a paragraph is kept only if it is strictly longer than
            this many characters. Defaults to 100.

    Returns:
        A ``(documents, embeddings)`` tuple: a numpy array of the kept
        paragraphs and whatever ``bi_encoder.encode(documents)`` returns.
    """
    # Context manager closes the response once the body has been read
    with urlopen(guteberg_url) as response:
        text = response.read().decode()

    # Paragraphs are separated by a blank line (CRLF line endings)
    paragraphs = text.split('\r\n\r\n')
    documents = np.array([paragraph for paragraph in paragraphs if len(paragraph) > min_length])
    print(f"There are {len(documents)} documents/paragraphs")
    return documents, bi_encoder.encode(documents)

def retrieve_relevant_documents(bi_encoder, query, documents, document_embeddings, hits = 3):
    """Print the documents most similar to ``query`` and the answer read from the best one.

    Relies on the module-level ``nlp`` question-answering pipeline.

    Args:
        bi_encoder: model used to encode ``query``.
        query: natural-language question.
        documents: indexable collection of texts; a hit's ``corpus_id`` indexes into it.
        document_embeddings: embeddings of ``documents`` to search against.
        hits: number of documents to retrieve (passed as ``top_k``). Defaults to 3.

    Returns:
        None. Results are printed.
    """
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)

    # Separate name so the integer `hits` parameter is not overwritten by the result list
    top_hits = util.semantic_search(query_embedding, document_embeddings, top_k = hits)[0]

    if not top_hits:
        # Nothing was retrieved (e.g. hits = 0); indexing top_hits[0] below would raise IndexError
        print('No documents retrieved')
        return

    for i, hit in enumerate(top_hits):
        print(f'Document {i + 1} Cos_Sim {hit["score"]:.3f}:\n\n{documents[hit["corpus_id"]]}')
        print('\n')

    # Only the first hit is handed to the question-answering pipeline
    best_document = str(documents[top_hits[0]['corpus_id']])
    print(f"Answer from Top Document: {nlp(query, best_document)}")

In [26]:
# Fetch a second Gutenberg text and embed its paragraphs with the fine-tuned bi-encoder
# NOTE(review): the two names spell the title differently ("bank"/"banks", "basson"/"bassoon");
# later cells use them exactly as written here, so rename all usages together if cleaning up
bank_to_basson_documents, banks_to_bassoon_embeddings = gutenberg_to_documents(
    'https://www.gutenberg.org/cache/epub/27480/pg27480.txt', finetuned_bi_encoder
)

There are 1396 documents/paragraphs


In [27]:
# Retrieve the 2 best-matching paragraphs for a definition-style question and answer from the top one
retrieve_relevant_documents(finetuned_bi_encoder,
    'What is a banshee?',
    bank_to_basson_documents,
    banks_to_bassoon_embeddings,
    2
)

Document 1 Cos_Sim 0.754:

BANSHEE (Irish _bean sidhe_; Gaelic _ban sith_, "woman of the fairies"), a
supernatural being in Irish and general Celtic folklore, whose mournful
screaming, or "keening," at night is held to foretell the death of some
member of the household visited. In Ireland legends of the banshee belong
more particularly to certain families in whose records periodic visits from
the spirit are chronicled. A like ghostly informer figures in Brittany
folklore. The Irish banshee is held to be the distinction only of families
of pure Milesian descent. The Welsh have the banshee under the name _gwrach
y Rhibyn_ (witch of Rhibyn). Sir Walter Scott mentions a belief in the
banshee as existing in the highlands of Scotland (_Demonology and
Witchcraft_, p. 351). A Welsh death-portent often confused with the gwrach
y Rhibyn and banshee is the _cyhyraeth_, the groaning spirit.


Document 2 Cos_Sim 0.324:

BANNU, a town and district of British India, in the Derajat division of the
Nor

In [28]:
# Retrieve the 2 best-matching paragraphs for a date question and answer from the top one
retrieve_relevant_documents(finetuned_bi_encoder,
    'When was the Imperial Bank of Germany founded?',
    bank_to_basson_documents,
    banks_to_bassoon_embeddings,
    2
)

Document 1 Cos_Sim 0.797:

[3] The date 1876 is taken as being that when the Imperial Bank of Germany
came into full operation.


Document 2 Cos_Sim 0.573:

Similar banks had been established in Middelburg, (March 28th, 1616), in
Hamburg (1619) and in Rotterdam (February 9th, 1635). Of these the Bank of
Hamburg carried on much the largest business and survived the longest. It
was not till the 15th of February 1873 that its existence was closed by the
act of the German parliament which decreed that Germany should possess a
gold standard, and thus removed those conditions of the local medium of
exchange--silver coins of very different intrinsic values--whose
circulation had provided an ample field for the operations of the bank. The
business of the Bank of Hamburg had been conducted in absolute accordance
with the regulations under which it was founded.


Answer from Top Document: {'score': 0.18934154510498047, 'start': 13, 'end': 17, 'answer': '1876'}
