In [1]:
!pip install datasets



In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorWithPadding
)

# Prepare dataset

In [3]:
dataset = load_dataset("sealuzh/app_reviews", split="train")
dataset.to_pandas().sample(10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/5.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/288065 [00:00<?, ? examples/s]

Unnamed: 0,package_name,review,date,star
49298,org.xbmc.kore,Nice app I am missing support for YouTube pla...,February 16 2016,3
42021,com.watabou.pixeldungeon,Good But hunger goes to quickly and the bosses...,December 05 2015,4
163466,com.simplemobiletools.musicplayer,Wish there is a folder mode or able to exclude...,January 02 2017,4
239541,com.google.android.gms,Its good,March 11 2017,4
60897,it.greenaddress.cordova,Will not log in Have had no issues until yeste...,January 04 2017,1
142526,org.telegram.messenger,Needs some work How to find that msg was deliv...,January 04 2017,2
180567,com.google.android.gms,Great Worst Waste of update then most memory...,December 07 2016,1
97710,com.ichi2.anki,Best app for flashcards,July 30 2016,5
254393,com.google.android.gms,great app,April 04 2017,5
9841,biz.gyrus.yaab,The best The only way this could be better is ...,February 04 2016,5


In [4]:
wanted_features = ["package_name", "review", "star"]
dataset = dataset.remove_columns([x for x in dataset.features if x not in wanted_features])
dataset.to_pandas().sample(10)
len(dataset)

288065

In [5]:
dataset = dataset.shuffle().select(range(100000))
len(dataset)

100000

In [6]:
dataset = dataset.class_encode_column("star")
dataset = dataset.train_test_split(test_size=0.1, seed=42, stratify_by_column="star")

Stringifying the column:   0%|          | 0/100000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [7]:
dataset, len(dataset['train']), dataset['train'].to_pandas().sample(10)

(DatasetDict({
     train: Dataset({
         features: ['package_name', 'review', 'star'],
         num_rows: 90000
     })
     test: Dataset({
         features: ['package_name', 'review', 'star'],
         num_rows: 10000
     })
 }),
 90000,
                           package_name  \
 88267               jwtc.android.chess   
 59791           com.google.android.gms   
 22879           com.google.android.gms   
 51824             info.papdt.blackblub   
 74892    com.simplemobiletools.gallery   
 17245     com.google.android.diskusage   
 31324    com.duckduckgo.mobile.android   
 19064           com.google.android.gms   
 47289  com.google.zxing.client.android   
 52991          com.nilhcem.hostseditor   
 
                                                   review  star  
 88267  Wont load files. Can import db which is good. ...     3  
 59791  Worst ever app Guge in size...consume huge spa...     0  
 22879                                 Nice Super concept     4  
 51824  Good m

In [8]:
train_dataset = dataset['train']
test_dataset = dataset['test']

# Training

In [9]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def preprocess_data(samples):
    # we create a prefix "review:" for the model
    samples['prompt'] = [f"review: {package_name}, {star} Stars!" \
                         for package_name, star in zip(samples['package_name'], samples['star'])]
    samples['response'] = [f"{review}" for review in samples['review']]
    inputs = tokenizer(samples['prompt'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(samples['response'], padding="max_length", truncation=True, max_length=128)
    inputs.update({'labels': targets['input_ids']})

    return inputs

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
train_dataset = train_dataset.map(preprocess_data, batched=True)
test_dataset = test_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

TRAINING_OUTPUT="./t5_fine-tuned-reviews"
training_args = TrainingArguments(
    output_dir = TRAINING_OUTPUT,
    num_train_epochs = 3,
    per_device_train_batch_size = 24,
    per_device_eval_batch_size = 24,
    save_strategy = "epoch",
)
trainer = Trainer(
    model =  model,
    args = training_args,
    train_dataset = train_dataset,
    data_collator = data_collator
)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
trainer.train()

Step,Training Loss
500,0.7881
1000,0.5479
1500,0.537
2000,0.5217
2500,0.5237
3000,0.5153
3500,0.525
4000,0.5054
4500,0.5143
5000,0.4904


# Inference

In [None]:
def generate_review(text):
    inputs = tokenizer("review: " + text, return_tensors='pt', max_length=512, padding="max_length", truncation=True)
    # no_repeat_ngrams make the model respond at least 3 words
    # num_beams controls the quality of output by allowing model to think longer (exploring more answers and choose one)
    # early_stopping allows model to give shorter response if it believes it's good enough already
    outputs = model.generate(inputs['input_ids'], max_length=128,
                             no_repeat_ngram_size=3, num_beams=6, early_stopping=True)
    review = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return review

In [None]:
random_product = test_dataset.shuffle(42).select(range(10))['package_name']
generate_review(random_product[1] + ", 5 Stars!")