In [1]:
import torch
from transformers import T5Tokenizer
import common as com
from torch.utils.data import DataLoader
from IPython.display import Markdown, display
from rouge_score import rouge_scorer

In [3]:
def printmd(string):
    """Render ``string`` as Markdown in the notebook's output area.

    Args:
        string: Markdown source text to display.
    """
    rendered = Markdown(string)
    display(rendered)

In [4]:
# Load the fine-tuned model checkpoint, the matching tokenizer and the ROUGE scorer.
model_name = "31_model_w_prefix_50_output.pt"
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# map_location="cpu": every tensor passed to model.generate() in this notebook
# comes straight from the tokenizer / DataLoader and is never moved to another
# device, so the model has to live on the CPU as well. Without it, a checkpoint
# saved on a GPU fails to load on a CPU-only machine (or loads onto the GPU and
# then hits a device mismatch during generation).
# NOTE(review): the file is used as a fully pickled model object (.eval() and
# .generate() are called on the result). Recent PyTorch releases may need
# weights_only=False for such checkpoints — confirm against the installed version.
model = torch.load(f"models/{model_name}", map_location="cpu")
# Inference only: put the model in evaluation mode (trailing ';' hides the repr).
model.eval();
# model_max_length sets the tokenizer's default truncation length.
tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=com.MAX_INPUT_LEN)
# ROUGE-1 and ROUGE-L with stemming, used to compare generated and target summaries.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

In [5]:
def get_summary(text: str) -> str:
    """Generate a summary for a single piece of text.

    The text is prefixed with ``"summarize: "``, tokenized, passed through
    ``model.generate`` and the first generated sequence is decoded back to a
    string.

    Args:
        text: Raw text to summarize.

    Returns:
        The decoded summary with special tokens removed.
    """
    prompt = "summarize: " + text
    # truncation=True with no explicit max_length truncates to the tokenizer's
    # model_max_length, so over-long inputs are cut instead of being fed to the
    # model at a length above the configured limit.
    encoded_text = tokenizer(prompt, truncation=True, return_tensors="pt")
    output_ids = model.generate(encoded_text.input_ids,
                                attention_mask=encoded_text.attention_mask,
                                max_length=com.MAX_OUTPUT_LEN)
    # A single input yields a batch of one; decode its only sequence.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [6]:
# Keep only the second element of the split returned by the helper.
_, test_data = com.load_and_maybe_split_data(
    com.TRAIN_FRACTION,
    com.RANDOM_SEED,
)
# Wrap the split in the project's dataset class so it can be fed to a DataLoader.
test_data = com.ReviewsDataset(
    test_data,
    tokenizer,
    com.MAX_INPUT_LEN,
    com.MAX_OUTPUT_LEN,
)
# One example per batch, in dataset order.
test_loader = DataLoader(dataset=test_data, batch_size=1, shuffle=False)

In [7]:
# Run the model over every batch of the test loader and collect, in parallel
# lists, the generated summaries, the reference summaries and the review texts.
predictions, targets, texts = [], [], []
decode_kwargs = {"skip_special_tokens": True, "clean_up_tokenization_spaces": True}

for batch in test_loader:
    generated_ids = model.generate(
        input_ids=batch['input_ids'],
        attention_mask=batch['input_mask'],
        max_length=com.MAX_OUTPUT_LEN,
    )

    # Decode token ids back to text, one string per sequence in the batch.
    for sequence in generated_ids:
        predictions.append(tokenizer.decode(sequence, **decode_kwargs))
    for sequence in batch['output_ids']:
        targets.append(tokenizer.decode(sequence, **decode_kwargs))

    texts.extend(batch['review_text'])

In [8]:
# Stateful iterator over the indices of `predictions`. Each next(index_list)
# call advances it by one, so re-run this cell to start again from index 0.
index_list = iter(range(len(predictions)))

In [9]:
# Sanity check: the three result lists must all have the same length.
len({len(predictions), len(targets), len(texts)}) == 1

True

In [55]:
# Show the next test example: review, target summary, generated summary and
# their ROUGE F1 scores. Re-running this cell steps through the test set;
# next() raises StopIteration once index_list is exhausted.
i = next(index_list)

target_sum = targets[i]
generated_sum = predictions[i]
review = texts[i]

# Word counts. split() with no argument splits on any run of whitespace, so
# repeated spaces or newlines do not inflate the count and an empty string
# counts as 0 words (split(' ') would report 1).
len_original = len(review.split())
len_tar_summary = len(target_sum.split())
len_gen_summary = len(generated_sum.split())
# RougeScorer.score takes the reference first, then the prediction.
scores = scorer.score(target_sum, generated_sum)

printmd(f"**Original review**: {review} (**{len_original}**)\n")
printmd(f"**Target summary**: {target_sum} (**{len_tar_summary}**)\n")
printmd(f"**Generated summary**: {generated_sum} (**{len_gen_summary}**)\n")
print("Rouge1 F1 score: {0:.2f}\n".format(scores['rouge1'].fmeasure))
print("RougeL F1 score: {0:.2f}\n".format(scores['rougeL'].fmeasure))



**Original review**: On full charge you need to have this on the highest power to pick up which means the charge runs out very quickly. I brought this for my mum with Parkinson’s thinking it would be ideal but unfortunately I have only just visited her so I’m over my return period otherwise it would have been going straight back! (**58**)


**Target summary**: The vacuum's short battery life and weak suction on lower power settings make it inconvenient. Not suitable for users with specific needs, and missed the return window. (**27**)


**Gererated summary**: The charger is on the highest power to pick up which means the charge runs out very quickly. I brought this for my mum with Parkinson’s thinking it would be ideal but unfortunately I have only just visited her so I’m over my return period. (**45**)


Rouge1 F1 vscore: 0.24

RougeL F1 vscore: 0.16



In [21]:
def summarize_one_text(text: str) -> str:
    """Summarize a single text, padding/truncating it to ``com.MAX_INPUT_LEN`` tokens.

    Args:
        text: Raw text to summarize. It is passed to the tokenizer as given;
            no task prefix is added here.

    Returns:
        The decoded summary with special tokens removed.
    """
    # NOTE(review): get_summary() in this notebook prepends "summarize: " and the
    # checkpoint name mentions a prefix; this function does not add one —
    # confirm whether that is intentional.
    # padding="max_length" already pads to max_length; the deprecated
    # pad_to_max_length flag was redundant next to it and has been dropped.
    encoded = tokenizer(text,
                        max_length=com.MAX_INPUT_LEN,
                        truncation=True,
                        padding="max_length",
                        return_tensors="pt")
    pred_ids = model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=com.MAX_OUTPUT_LEN)
    # A single input yields a batch of one; decode its only sequence.
    pred_text = tokenizer.decode(pred_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return pred_text
                

In [22]:
# Hand-written sample review used to try the model on text typed into the notebook.
text = "I got this as I couldn't afford a higher end vacuum and am pleasantly surprised by how useful it is. It's extremely light and has lots of functions to get into different spaces. The downside is that the battery only lasts about half an hour and the tank fills up quickly, so it's better for a small home."
# Generate the summary and render it as Markdown.
summary = summarize_one_text(text)
printmd(summary)

I wanted something more affordable. it's very light and has lots of functions. it's very light and has lots of functions. The only downside is that the battery lasts about half an hour and the tank fills up quickly.. The battery life is limited and the tank fills up quickly, so it's better for a small home.


In [23]:
# Re-display the current value of `summary`; nothing is recomputed in this cell.
printmd(summary)

I wanted something more affordable. it's very light and has lots of functions. it's very light and has lots of functions. The only downside is that the battery lasts about half an hour and the tank fills up quickly.. The battery life is limited and the tank fills up quickly, so it's better for a small home.