## 9.1 Siamese BERT-networks for semantic searching

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
from datasets import load_dataset

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

from random import sample, seed, shuffle
from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader

In [36]:
PERSON = 'Sinan Ozdemir'

# Note this is NOT an efficient way to search on google. This is done simply for education purposes
google_html = BeautifulSoup(requests.get(f'https://www.google.com/search?q={PERSON}').text).get_text()[:1024]

nlp = pipeline('question-answering', 
               model='deepset/roberta-base-squad2', 
               tokenizer='deepset/roberta-base-squad2', 
               max_length=10)

nlp(f'Who is {PERSON}?', google_html)

{'score': 0.10961687564849854,
 'start': 545,
 'end': 591,
 'answer': 'data scientist, start-up founder, and educator'}

In [40]:
# textbook about insects
text = urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt').read().decode()

# strip out header and footer
# text = text[text.index("*** START OF") :text.index("*** END OF")]

# Only keep documents of at least 50 characters
documents = list(filter(lambda x: len(x) > 50, text.split('\r\n\r\n')))

documents = np.array(documents)

print(f'There are {len(documents)} documents/paragraphs')

There are 104 documents/paragraphs


In [8]:
# a model pre-trained on an asymmetric semantic search task
sbert_model = SentenceTransformer('msmarco-distilbert-base-v4')

sbert_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [9]:
# Documents are encoded by calling model.encode()
document_embeddings = sbert_model.encode(documents)

document_embeddings.shape

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


(116, 768)

In [10]:
QUESTION = 'How many horns does a flea have?'  # a natural language query

query_embedding = sbert_model.encode(QUESTION)  # embed the query into a vector space

top_scores = util.cos_sim(query_embedding, document_embeddings)  # use cosine similarity to find the most relevant document

top_scores.sort()

torch.return_types.sort(
values=tensor([[-1.1211e-01, -1.0464e-01, -1.0464e-01, -1.0464e-01, -1.0464e-01,
         -1.0464e-01, -1.0030e-01, -1.0030e-01, -9.3436e-02, -9.2324e-02,
         -9.1212e-02, -8.8146e-02, -6.5523e-02, -6.4774e-02, -6.1653e-02,
         -6.0359e-02, -5.9610e-02, -5.9591e-02, -5.6180e-02, -5.5162e-02,
         -5.2260e-02, -4.7107e-02, -4.6308e-02, -4.5581e-02, -4.5156e-02,
         -4.4223e-02, -4.0409e-02, -3.9424e-02, -3.6698e-02, -3.4323e-02,
         -3.4124e-02, -3.3633e-02, -3.1443e-02, -2.7684e-02, -2.7428e-02,
         -2.3387e-02, -2.1927e-02, -2.1242e-02, -2.1079e-02, -1.8942e-02,
         -1.5934e-02, -1.5326e-02, -1.4372e-02, -1.4291e-02, -1.3951e-02,
         -1.3196e-02, -1.2449e-02, -1.1834e-02, -9.5105e-03, -9.4630e-03,
         -9.0947e-03, -9.0319e-03, -8.5620e-03, -7.7932e-03, -6.8224e-03,
         -6.7419e-03, -2.9636e-03,  3.8625e-04,  1.6381e-03,  4.9331e-03,
          5.9962e-03,  6.2124e-03,  6.3127e-03,  7.9495e-03,  8.8445e-03,
      

In [11]:
top_documents = documents[top_scores.sort().indices[0][-3:]][::-1]
top_cosine_sim = list(top_scores.sort().values[0][-3:])[::-1]

print(f'Question: {QUESTION}')

for i, (cos_sim, top_document) in enumerate(zip(top_cosine_sim, top_documents)):
    print(f'Top Document {i + 1} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
    print('\n')

Top Document 1 Cos_Sim 0.490:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.


Top Document 2 Cos_Sim 0.248:

The Chego is a very small animal, about one fourth the size of a common
flea: it is very troublesome, in warm climates, to the poor blacks, such
as go barefoot, and the slovenly: it penetrates the skin, under which it
lays a bunch of eggs, which swell to the bigness of a small pea.


Top Document 3 Cos_Sim 0.185:


This is one of the largest of the insect tribe. It is met with in
different countries, and of various sizes, from two or three inches to
nearly a foot in length: it s

In [35]:
# answer the question from the top document
nlp(QUESTION, str(top_documents[0]))

{'score': 0.8524739146232605, 'start': 259, 'end': 262, 'answer': 'two'}

In [13]:
# This is called an "Open Book Q/A System"

In [14]:
# load up the adversarial_qa dataset from the Q/A use-case
training_qa = load_dataset('adversarial_qa', 'adversarialQA', split='train')

good_training_data = []
bad_training_data = []
    
last_example = None
for example in training_qa:
    if last_example and example['context'] != last_example['context']:
        bad_training_data.append((example['question'], last_example['context'], 0))  #  add bad examples
    # question, context, label is 1 if should be matched together
    good_training_data.append((example['question'], example['context'], 1.0))
    last_example = example

Reusing dataset adversarial_qa (/Users/sinanozdemir/.cache/huggingface/datasets/adversarial_qa/adversarialQA/1.0.0/92356be07b087c5c6a543138757828b8d61ca34de8a87807d40bbc0e6c68f04b)


In [15]:
len(good_training_data), len(bad_training_data)

(30000, 2647)

In [17]:
# https://www.sbert.net/docs/training/overview.html for information on training

seed(42)  # seed our upcoming sample

sampled_training_data = sample(good_training_data, 200) + sample(bad_training_data, 200)

shuffle(sampled_training_data)  # shuffle our data around

training_index = int(.8 * len(sampled_training_data))  # Get an 80/20 train/test split

In [42]:
# Define the training examples
train_examples = [InputExample(texts=t[:2], label=t[2]) for t in sampled_training_data[:training_index]]

train_examples[0].__dict__

{'guid': '',
 'texts': ('Who ranked IBM on greenness?',
  'IBM has 12 research laboratories worldwide, bundled into IBM Research. As of 2013[update] the company held the record for most patents generated by a business for 22 consecutive years. Its employees have garnered five Nobel Prizes, six Turing Awards, ten National Medals of Technology and five National Medals of Science. Notable company inventions or developments include the automated teller machine (ATM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the Universal Product Code (UPC), the financial swap, the Fortran programming language, SABRE airline reservation system, dynamic random-access memory (DRAM), copper wiring in semiconductors, the silicon-on-insulator (SOI) semiconductor manufacturing process, and Watson artificial intelligence.'),
 'label': 0.0}

In [18]:
# Define the train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)

train_loss = losses.CosineSimilarityLoss(sbert_model)

In [19]:
# Evaluation data
sentences1, sentences2, scores = zip(*sampled_training_data[training_index:])

evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

In [20]:
# Tune the model
sbert_model.fit(
    train_objectives=[(train_dataloader, train_loss)], 
    output_path='ir/results',
    epochs=1,
#     warmup_steps=len(sampled_training_data) // 5, 
    evaluator=evaluator, 
    evaluation_steps=10
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [43]:
# load fine-tuned IR model
finetuned_sbert_model = SentenceTransformer('ir/results')

In [44]:
# re-encode the documents and run the same question as before
document_embeddings = finetuned_sbert_model.encode(documents)

query_embedding = finetuned_sbert_model.encode(QUESTION)  # embed the query into a vector space

top_scores = util.cos_sim(query_embedding, document_embeddings)  # use cosine similarity to find the most relevant document

top_documents = documents[top_scores.sort().indices[0][-3:]][::-1]
top_cosine_sim = list(top_scores.sort().values[0][-3:])[::-1]

print(f'Question: {QUESTION}')

for i, (cos_sim, top_document) in enumerate(zip(top_cosine_sim, top_documents)):
    print(f'Top Document {i + 1} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
    print('\n')

Top Document 1 Cos_Sim 0.496:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.


Top Document 2 Cos_Sim 0.253:

The Chego is a very small animal, about one fourth the size of a common
flea: it is very troublesome, in warm climates, to the poor blacks, such
as go barefoot, and the slovenly: it penetrates the skin, under which it
lays a bunch of eggs, which swell to the bigness of a small pea.


Top Document 3 Cos_Sim 0.190:


This is one of the largest of the insect tribe. It is met with in
different countries, and of various sizes, from two or three inches to
nearly a foot in length: it s

In [None]:
def gutenberg_to_documents(gutenberg_url, sbert_model):
    text = urlopen(gutenberg_url).read().decode()
    documents = np.array(list(filter(lambda x: len(x) > 50, text.split('\r\n\r\n'))))
    print(f'There are {len(documents)} documents/paragraphs')
    return documents, sbert_model.encode(documents)


def retrieve_relevant_documents(sbert_model, query, documents, embeddings, qa=None):
    query_embedding = sbert_model.encode(query)  # embed the query into a vector space

    top_scores = util.cos_sim(query_embedding, embeddings)  # use cosine similarity to find the most relevant document
    top_documents = documents[top_scores.sort().indices[0][-3:]][::-1]
    top_cosine_sim = list(top_scores.sort().values[0][-3:])[::-1]

    for i, (cos_sim, top_document) in enumerate(zip(top_cosine_sim, top_documents)):
        print(f'Top Document {i + 1} Cos_Sim {cos_sim:.3f}:\n\n{top_document}')
        if qa:
            answer = qa(question=query, context=top_document)
            print(f'\nAnswer: {answer}\n')
        print('\n')

In [None]:
banks_to_bassoon_documents, banks_to_bassoon_embeddings = gutenberg_to_documents(
    'https://www.gutenberg.org/cache/epub/27480/pg27480.txt', finetuned_sbert_model
)

In [None]:
retrieve_relevant_documents(finetuned_sbert_model,
    'What is a banshee?', banks_to_bassoon_documents, banks_to_bassoon_embeddings,
    nlp
)

In [None]:
retrieve_relevant_documents(finetuned_sbert_model,
    'How do bassoons work?', banks_to_bassoon_documents, banks_to_bassoon_embeddings,
    nlp
)

## 9.2 Teaching GPT multiple tasks at once with prompt engineering

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         GPT2Tokenizer
import pandas as pd
from datasets import Dataset


In [None]:
# https://www.kaggle.com/snap/amazon-fine-food-reviews?select=Reviews.csv

reviews = pd.read_csv('../data/reviews.csv')

print(reviews.shape)

reviews.head()

In [None]:
reviews['Text'].str.len().plot(kind='hist', title='Histogram of Review Length')

In [None]:
reviews['Summary'].str.len().plot(kind='hist', title='Histogram of Summary Length')

In [None]:
# remove very short and very long summaries
reviews = reviews[(reviews['Summary'].str.len() >= 10) & (reviews['Summary'].str.len() < 25)]

reviews['Summary'].str.len().plot(kind='hist', title='Histogram of Summary Length')

In [None]:
reviews['Sentiment'] = reviews['Score'].map(lambda x: 'positive' if x >= 4 else 'neutral' if x == 3 else 'negative')

reviews = reviews.groupby('Sentiment', group_keys=False).apply(lambda x: x.sample(2000))

In [None]:
MODEL = 'distilgpt2'

tokenizer = GPT2Tokenizer.from_pretrained(MODEL)

tokenizer.pad_token = tokenizer.eos_token

#add two prompts, one for each task
SENTIMENT_PROMPT = 'Sentiment Task'
SUMMARIZE_PROMPT = 'Summarize Task'
SENTIMENT_TOKEN = '\nSentiment:'
SUMMARIZE_TOKEN = '\nSummarize:'


In [None]:
reviews['sentiment_text'] = f'{SENTIMENT_PROMPT}\nReview: ' + reviews['Text'] + SENTIMENT_TOKEN +  ' ' + reviews['Sentiment'].astype(str)

reviews['summarize_text'] = f'{SUMMARIZE_PROMPT}\nReview: ' + reviews['Text'] + SUMMARIZE_TOKEN +  ' ' + reviews['Summary'].astype(str)

reviews['sentiment_text'].head(2).tolist()

In [None]:
reviews['summarize_text'].head(2).tolist()

In [None]:
reviews = reviews.sample(frac=1)

training_examples = reviews['summarize_text'].tolist() + reviews['sentiment_text'].tolist()

print(len(training_examples))

In [None]:
multi_task_df = pd.DataFrame({'text': training_examples})

data = Dataset.from_pandas(multi_task_df)

def preprocess(examples):
    results = tokenizer(examples['text'], truncation=True)
    return results

data = data.map(preprocess, batched=True)

data = data.train_test_split(train_size=.8)

In [None]:
model = GPT2LMHeadModel.from_pretrained(MODEL)

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
training_args = TrainingArguments(
    output_dir="./gpt2_multitask", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=len(data['train']) // 10,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.02,
    logging_steps=100,
    save_steps=100,
    eval_steps=100,
    load_best_model_at_end=True,
    evaluation_strategy='steps',
    save_strategy='steps'
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
    data_collator=data_collator
)

trainer.evaluate()

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
trainer.save_model()

In [None]:
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_multitask')

generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

In [None]:
text_sample, score, summary = reviews.sample(1)[['Text', 'Sentiment', 'Summary']].values[0]

print(text_sample)
print(score)
print(summary)

num_tokens = len(tokenizer(text_sample)['input_ids'])
num_tokens

In [None]:
sentiment_text_sample = f'{SENTIMENT_PROMPT}\nReview: {text_sample}{SENTIMENT_TOKEN}'
summarize_text_sample = f'{SUMMARIZE_PROMPT}\nReview: {text_sample}{SUMMARIZE_TOKEN}'

In [None]:
for generated_text in generator(sentiment_text_sample, num_return_sequences=3, max_length=num_tokens + 1):
    print(generated_text['generated_text'])
    print('----')
    

In [None]:
for generated_text in generator(summarize_text_sample, num_return_sequences=3, max_length=num_tokens + 20):
    print(generated_text['generated_text'])
    print('----')
    