In [3]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score
from datasets import load_dataset, load_metric, Dataset
from transformers import DataCollatorForSeq2Seq, AdamWeightDecay, \
    TFT5ForConditionalGeneration, T5Tokenizer

In [4]:
def preprocess_function(examples):
    """ Use tokenizer to preprocess data. """
    
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    prefix = "summarize: "

    inputs = [prefix + doc for doc in examples["string"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["label"], max_length=80, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]


    return model_inputs


def download_and_preprocess_data(dataset):
    """ Load dataset from HuggingFace and preprocess. """
    

    
    # Tokenized using preprocess_function
    tokenized_news = dataset.map(preprocess_function, batched=True)

    return tokenized_news

In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

optimizer = AdamWeightDecay(
    learning_rate=2e-5, 
    weight_decay_rate=0.01
)

model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
model.compile(optimizer=optimizer)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model, 
    return_tensors="tf",
)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [130]:
data = pd.read_excel("test.xlsx")
dataset = Dataset.from_pandas(data)


In [131]:
tokenized_news = download_and_preprocess_data(dataset)
tokenized_news

  0%|          | 0/1 [00:00<?, ?ba/s]



Dataset({
    features: ['string', 'label', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 122
})

In [132]:
attention_mask = tokenized_news["attention_mask"]
for i in range(len(attention_mask)):
    for j in range(len(attention_mask[i])):
        attention_mask[i][j] = np.int32(attention_mask[i][j])

In [143]:
print(type(tokenized_news["input_ids"]))

<class 'list'>


In [112]:
ds = tokenized_news.to_pandas()
ds


Unnamed: 0,string,label,id,input_ids,attention_mask,labels
0,Most apples are a little sweet and a little ta...,A description and explanation of how apples taste,1,"[21603, 10, 1377, 16981, 33, 3, 9, 385, 2093, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[71, 4210, 11, 7295, 13, 149, 16981, 2373, 1]"
1,Apples rank among the world's most popular fru...,a summary of different uses for apples,2,"[21603, 10, 2184, 7, 11003, 859, 8, 296, 31, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 9, 9251, 13, 315, 2284, 21, 16981, 1]"
2,"As white light passes through our atmosphere, ...",explanation of why the sky is blue,3,"[21603, 10, 282, 872, 659, 9016, 190, 69, 4643...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[7295, 13, 572, 8, 5796, 19, 1692, 1]"
3,"In other words, the color of the ocean and the...",a distinction between why the ocean is blue an...,4,"[21603, 10, 86, 119, 1234, 6, 8, 945, 13, 8, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 9, 13005, 344, 572, 8, 5431, 19, 1692, 11,..."
4,Color of Sunlight as seen on Earth's surface d...,explanation of why the sun is yellow,5,"[21603, 10, 6088, 13, 3068, 2242, 38, 894, 30,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[7295, 13, 572, 8, 1997, 19, 4459, 1]"
...,...,...,...,...,...,...
117,"Washington, D.C., D.C. in full District of Col...",specification of location of washignton DC,118,"[21603, 10, 2386, 6, 309, 5, 254, 5, 6, 309, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[16726, 13, 1128, 13, 6179, 3191, 17, 106, 579..."
118,"Concisely, AI can be described as the effort t...",comparison of AI and machine learning,119,"[21603, 10, 1193, 75, 159, 15, 120, 6, 7833, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4993, 13, 7833, 11, 1437, 1036, 1]"
119,How many layers contribute to a model of ...,definition of the depth of a model,120,"[21603, 10, 571, 186, 7500, 4139, 12, 3, 9, 82...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4903, 13, 8, 4963, 13, 3, 9, 825, 1]"
120,Functional anatomy Muscles Mucosa Lymphatics I...,list of functional anatomy,121,"[21603, 10, 27155, 27782, 6887, 2482, 7, 4159,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[570, 13, 5014, 27782, 1]"


In [148]:
data_collator = DataCollatorForSeq2Seq(tokenizer)
test_ds = tokenized_news.to_tf_dataset(
    columns=["attention_mask","input_ids","labels"],
    shuffle=False,
    batch_size=122,
    collate_fn=data_collator,
)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [127]:
test_ds = tokenized_news.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=4,
    collate_fn=data_collator,
)

RuntimeError: Unrecognized array dtype object. 
Nested types and image/audio types are not supported yet.

In [8]:
def compute_metrics(metric, pred, actual):
    """ Compute the model's rouge performance on an instance. """

    metric.add(predictions=pred, references=actual)
    final_score = metric.compute()
    
    return final_score

In [7]:
metric = load_metric('rouge')
result = [[] for x in range(3)]

cnt = 0
for item in test_ds:
    article = item['input_ids']
    actual = item['labels']
    
    pred = model.generate(
        do_sample=True,
        input_ids=article,
        # min_length=56,
        max_length=80,
        temperature=0.8, 
        top_k=45,
        no_repeat_ngram_size=3,
        num_beams=5,
        early_stopping=True
    )

    rouge_score = compute_metrics(metric, pred, actual)
    rouge1 = 100 * rouge_score['rouge1'][1][2]
    rouge2 = 100 * rouge_score['rouge2'][1][2]
    rougeL = 100 * rouge_score['rougeL'][1][2]

    cnt += 1 
    if cnt % 25 == 0:
        print(f'Round: {cnt * 4}')

    result[0].append(rouge1)
    result[1].append(rouge2)
    result[2].append(rougeL)

  metric = load_metric('rouge')


NameError: name 'test_ds' is not defined

In [25]:
result[2]

[25.72463768115941,
 25.517241379310345,
 30.0,
 17.721518987341774,
 20.0,
 23.154362416107382,
 18.345323741007196,
 27.666666666666668,
 25.35714285714285,
 20.921985815602838,
 21.428571428571427,
 23.36065573770492,
 23.46938775510204,
 22.887323943661972,
 24.013157894736842,
 22.535211267605636,
 28.859060402684566,
 30.000000000000004,
 22.697368421052634,
 28.24427480916031,
 28.333333333333332,
 20.833333333333336,
 19.48529411764706,
 19.327731092436974,
 18.75,
 19.485294117647058,
 24.475524475524477,
 20.542635658914726,
 19.23076923076923,
 30.00000000000001,
 18.835616438356162,
 19.178082191780824,
 21.875,
 21.428571428571427,
 27.241379310344826,
 21.48148148148148,
 16.887417218543042,
 22.10144927536232,
 20.895522388059703,
 18.750000000000004,
 19.776119402985078,
 19.666666666666668,
 19.485294117647058,
 27.04918032786885,
 20.229007633587788,
 20.370370370370374,
 17.905405405405407,
 22.468354430379744,
 18.75,
 17.64705882352941,
 20.967741935483872,
 17.375

In [10]:
test = "The death toll from a strong earthquake in south-eastern Turkey, near Syria's border, could rise eight-fold, the World Health Organisation has warned."

In [11]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenized_input = tokenizer("summarize: " + test, max_length=1024, truncation=True, return_tensors='tf')

pred = model.generate(
    do_sample=True,
    input_ids=tokenized_input['input_ids'],
    min_length=56,
    max_length=128,
    temperature=0.8, 
    top_k=45,
    no_repeat_ngram_size=3,
    num_beams=5,
    early_stopping=True
)

pred_sentence = tokenizer.decode(pred[0], skip_special_tokens=True)

print(f"pred = {pred_sentence}\n")

pred = death toll from a strong earthquake in south-eastern Turkey could rise eightfold. the earthquake near Syria's border could rise 8fold, the world health organisation warns. a quake near the border could cause a tsunami.



In [None]:
test