In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = T5ForConditionalGeneration.from_pretrained('t5-base')
base_tokenizer = T5Tokenizer.from_pretrained('t5-base')

config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 8.24MB/s]
model.safetensors: 100%|██████████| 892M/892M [02:52<00:00, 5.16MB/s] 
generation_config.json: 100%|██████████| 147/147 [00:00<00:00, 1.11MB/s]
spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 818kB/s]
tokenizer.json: 100%|██████████| 1.39M/1.39M [00:01<00:00, 1.07MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new be

## Abstractive Summarization

In [5]:
text_to_summarize = """Sinan Ozdemir is a data scientist, startup founder, and educator living in the San Francisco Bay with his dog, 
Charlie; cat, Euclid; and bearded dragon, Fiero. He spent his academic career studying pure mathematics 
at John Hopkins University before transitioning to education. He spent several years conducting lectures 
on data science at John Hopkins University and at the General Assembly before founding his own startup, 
Kylie.ai, which uses artificial intelligence to build chatbots from historical transcripts. 
After completing a Fellowship at the Y Combinator accelerator, Sinan spend most of his time working on 
his fast-growing company, while creating educational material for data science.
"""

preprocess_text = text_to_summarize.strip().replace("\n", "")

print("original text preprocessed: \n", preprocess_text)

original text preprocessed: 
 Sinan Ozdemir is a data scientist, startup founder, and educator living in the San Francisco Bay with his dog, Charlie; cat, Euclid; and bearded dragon, Fiero. He spent his academic career studying pure mathematics at John Hopkins University before transitioning to education. He spent several years conducting lectures on data science at John Hopkins University and at the General Assembly before founding his own startup, Kylie.ai, which uses artificial intelligence to build chatbots from historical transcripts. After completing a Fellowship at the Y Combinator accelerator, Sinan spend most of his time working on his fast-growing company, while creating educational material for data science.


In [7]:
# known prompt for summarization with T5
t5_prepared_text = "summarize: " + preprocess_text

input_ids = base_tokenizer.encode(t5_prepared_text, return_tensors="pt")

# summarize
summary_ids = base_model.generate(
    input_ids,
    num_beams = 4,
    no_repeat_ngram_size = 3,
    min_length = 30,
    max_length = 50,
    early_stopping = True
)

output = base_tokenizer.decode(summary_ids[0], skip_special_tokens = True)

print(f"Summarized text: \n{output}")

Summarized text: 
Sinan Ozdemir is a data scientist, startup founder, and educator. he founded his own startup, Kylie.ai, which uses artificial intelligence to build chatbots.


## English -> German Translation

In [8]:
input_ids = base_tokenizer.encode("translate English to German: Where is the chocolate?", return_tensors="pt")

# translate
translate_ids = base_model.generate(
    input_ids,
    num_beams = 4,
    no_repeat_ngram_size = 3,
    max_length = 20,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"Translated text:\n{output}")

Translated text:
Wo ist die Schokolade?


In [10]:
# pass labels in to calculate loss

input_ids = base_tokenizer('translate English to German: Where is the chocolate?', return_tensors="pt").input_ids
labels = base_tokenizer('Wo ist die Schokolade?', return_tensors = 'pt').input_ids

loss = base_model(input_ids = input_ids, labels = labels).loss

labels, loss

(tensor([[ 3488,   229,    67, 31267,    58,     1]]),
 tensor(0.1136, grad_fn=<NllLossBackward0>))

## CoLA: The Corpus of Linguistic Acceptability

In [11]:
input_ids = base_tokenizer.encode('cola sentence: Where is the chocolate?', return_tensors = 'pt')

#CoLA
translate_ids = base_model.generate(
    input_ids,
    num_beams = 4,
    no_repeat_ngram_size = 3,
    max_length = 20,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"is grammatically correct?: \n{output}")

is grammatically correct?: 
acceptable


In [12]:
input_ids = base_tokenizer.encode('cola sentence: Where be a chocolate?', return_tensors = 'pt')

#CoLA
translate_ids = base_model.generate(
    input_ids,
    num_beams = 4,
    no_repeat_ngram_size = 3,
    max_length = 20,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"is grammatically correct?: \n{output}")

is grammatically correct?: 
unacceptable


## STSB - Semantic Text Similarity Benchmark
Are two sentences semantically similar?

In [13]:
sentence_one = "How to fish"
sentence_two = "Fishing Manual for beginners"

input_ids = base_tokenizer.encode(f"stsb sentence1: {sentence_one} sentence2: {sentence_two}", return_tensors='pt')

# calculate semantic similarity
translate_ids = base_model.generate(
    input_ids,
    max_length = 3,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"semantically similar? (1-5): \n{output}")

semantically similar? (1-5): 
3.2




In [14]:
sentence_one = "How to fish"
sentence_two = "Hiking Manual for beginners"

input_ids = base_tokenizer.encode(f"stsb sentence1: {sentence_one} sentence2: {sentence_two}", return_tensors='pt')

# calculate semantic similarity
translate_ids = base_model.generate(
    input_ids,
    max_length = 3,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"semantically similar? (1-5): \n{output}")

semantically similar? (1-5): 
0.0


## MNLI - Multi-Genre Natural Language Inference
Whether a premise implies ("entailment"), contradicts ("contradiction"), or neither ("neutral") a hypothesis.

In [16]:
input_ids = base_tokenizer.encode(f"mnli premise: I am active in politics. hypothesis: I am running for mayor", return_tensors='pt')

# mnli
translate_ids = base_model.generate(
    input_ids,
    max_length = 3,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"Response: \n{output}")

Response: 
en


In [17]:
input_ids = base_tokenizer.encode(f"mnli premise: I am active in politics. hypothesis: I do not really vote", return_tensors='pt')

# mnli
translate_ids = base_model.generate(
    input_ids,
    max_length = 3,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"Response: \n{output}")

Response: 
contradiction


In [18]:
input_ids = base_tokenizer.encode(f"mnli premise: I am active in politics. hypothesis: I code for a living", return_tensors='pt')

# mnli
translate_ids = base_model.generate(
    input_ids,
    max_length = 3,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"Response: \n{output}")

Response: 
neutral


## Q/A - Question/Answering

In [26]:
input_ids = base_tokenizer.encode(
    "question: Where does Sinan live? context: Sinan lives in California but Matt lives in Boston", return_tensors='pt'
    )

# Q/A
translate_ids = base_model.generate(
    input_ids,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"Response: \n{output}")

Response: 
California


In [27]:
input_ids = base_tokenizer.encode(
    "question: Where does Matt live? context: Sinan lives in California but Matt lives in Boston", return_tensors='pt'
    )

# Q/A
translate_ids = base_model.generate(
    input_ids,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"Response: \n{output}")

Response: 
Boston


In [28]:
input_ids = base_tokenizer.encode(
    "prompt1: Where does Matt live? prompt2: Sinan lives in California but Matt lives in Boston", return_tensors='pt'
    )

# Q/A
translate_ids = base_model.generate(
    input_ids,
    early_stopping = True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens = True)

print(f"Response: \n{output}")

Response: 
not_duplicate
