In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorWithPadding
)

# Prepare dataset

In [None]:
# Fetch the "train" split of the sealuzh/app_reviews dataset.
dataset = load_dataset("sealuzh/app_reviews", split="train")
# Cell output: ten randomly drawn rows, rendered as a pandas DataFrame.
dataset.to_pandas().sample(n=10)

In [None]:
# Columns the prompt/response pairs are built from; every other column is dropped.
wanted_features = ["package_name", "review", "star"]
dataset = dataset.remove_columns([x for x in dataset.features if x not in wanted_features])
# Only the last expression of a cell is rendered automatically, so the row
# preview needs an explicit display() call to actually show up.
display(dataset.to_pandas().sample(10))
len(dataset)

In [None]:
# Work on a random subset of at most 100,000 reviews.
# The shuffle is seeded so that a fresh "Restart & Run All" picks the same rows,
# and the size is clamped so the cell does not fail when fewer rows are available.
dataset = dataset.shuffle(seed=42).select(range(min(100000, len(dataset))))
len(dataset)

In [None]:
# Stratified splitting requires a ClassLabel column, but class_encode_column()
# replaces the values with 0-based indices into the sorted class names. Encoding
# "star" itself would therefore turn the ratings used in the prompts into
# indices instead of the real star counts. Stratify on a temporary copy so that
# "star" keeps the original ratings.
dataset = dataset.map(lambda batch: {"star_class": batch["star"]}, batched=True)
dataset = dataset.class_encode_column("star_class")
dataset = dataset.train_test_split(test_size=0.1, seed=42, stratify_by_column="star_class")
# The helper column has served its purpose once the split is made.
dataset = dataset.remove_columns("star_class")

In [None]:
# Cell output: the split container, the number of training rows,
# and ten randomly drawn training rows.
(
    dataset,
    len(dataset['train']),
    dataset['train'].to_pandas().sample(n=10),
)

In [None]:
# Short aliases for the two halves of the split.
train_dataset, test_dataset = dataset['train'], dataset['test']

# Training

In [None]:
# Checkpoint name; the tokenizer is loaded from it here.
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def preprocess_data(samples, max_length=128):
    """Tokenize a batch of app reviews into prompt/target pairs for T5.

    The prompt is ``"review: <package_name>, <star> Stars!"`` and the target is
    the review text. Both are padded/truncated to ``max_length`` tokens.

    Args:
        samples: Batch mapping with ``'package_name'``, ``'star'`` and
            ``'review'`` entries, each a sequence of equal length.
        max_length: Token length that prompts and targets are padded or
            truncated to. Defaults to 128.

    Returns:
        The tokenizer output for the prompts with an added ``'labels'`` entry
        holding the target token ids, where padding positions are set to -100.
    """
    # The "review:" prefix tells the model which task it is being asked to do.
    prompts = [
        f"review: {package_name}, {star} Stars!"
        for package_name, star in zip(samples['package_name'], samples['star'])
    ]
    responses = [f"{review}" for review in samples['review']]

    inputs = tokenizer(prompts, padding="max_length", truncation=True, max_length=max_length)
    targets = tokenizer(responses, padding="max_length", truncation=True, max_length=max_length)

    # Label positions equal to -100 are skipped by the loss. Without this
    # masking the model is trained to predict the padding that fills most of
    # each target sequence.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in targets['input_ids']
    ]

    return inputs

In [None]:
# Tokenize both splits in batches with the same preprocessing function
# (train first, then test).
train_dataset, test_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
# Collator that pads the examples of a batch with the tokenizer's settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Directory the Trainer writes its outputs to (a checkpoint is saved every epoch).
TRAINING_OUTPUT="./t5_fine-tuned-reviews"
training_args = TrainingArguments(
    output_dir = TRAINING_OUTPUT,
    num_train_epochs = 3,
    per_device_train_batch_size = 12,
    per_device_eval_batch_size = 12,
    save_strategy = "epoch",
)
trainer = Trainer(
    model =  model,
    args = training_args,
    train_dataset = train_dataset,
    # Hand over the held-out split as well so trainer.evaluate() can be called;
    # previously the tokenized test split was never given to the Trainer.
    eval_dataset = test_dataset,
    data_collator = data_collator
)

In [None]:
# Cell output: the repr of the loaded model.
model

In [None]:
# Run training with the Trainer configured earlier in the notebook.
trainer.train()

# Inference

In [None]:
def generate_review(text):
    """Generate a review for a prompt such as ``"<package_name>, 5 Stars!"``.

    Args:
        text: Prompt body; the ``"review: "`` task prefix is prepended here.

    Returns:
        The decoded text of the first returned sequence, with special tokens
        stripped.
    """
    # Switch off dropout; training leaves the model in train mode.
    model.eval()

    # A single prompt needs no padding. Padding it to 512 tokens without passing
    # an attention mask made the model attend to hundreds of pad tokens.
    inputs = tokenizer("review: " + text, return_tensors='pt', max_length=512, truncation=True)
    # The model may live on an accelerator after training; the inputs must
    # be on the same device.
    inputs = inputs.to(model.device)

    # no_repeat_ngram_size=3 forbids any 3-token sequence from appearing twice.
    # num_beams=6 keeps six candidate sequences alive during beam search.
    # early_stopping=True ends the search as soon as num_beams finished
    # candidates exist, instead of continuing to look for better ones.
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=128,
        no_repeat_ngram_size=3,
        num_beams=6,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Package names of ten test rows, drawn with a fixed shuffle seed.
random_product = test_dataset.shuffle(seed=42).select(range(10))['package_name']
# Ask for a five-star review of the second sampled package.
five_star_prompt = random_product[1] + ", 5 Stars!"
generate_review(five_star_prompt)