In [None]:
# Shell out to `nvidia-smi` so the cell output shows which GPU (if any) is attached.
!nvidia-smi

In [None]:
# Environment variables for the kernel process. They are set here, before the
# import cell, so the libraries imported afterwards see them.

# Use a UTF-8 locale.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
# Send the transformers and datasets caches to /content/cache.
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (synchronous
# kernel launches, so CUDA errors surface at the failing call) and slows GPU
# work — confirm it is still wanted for a full evaluation run.
%env CUDA_LAUNCH_BLOCKING=1

In [None]:
import numpy as np
import pandas as pd

from tqdm import tqdm

import os
import json

import torch
from datasets import load_dataset, load_metric

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

In [None]:
# Checkpoint identifier handed to `from_pretrained`, and the device to run on:
# CUDA when PyTorch reports it as available, otherwise the CPU.
model_name_or_path = "flax-community/t5-recipe-generation"
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(model_name_or_path, device)

# Load the tokenizer and the seq2seq model, then move the model to `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = model.to(device)

In [None]:
# Text prepended to every input before tokenization (see `predict`).
prefix = "items: "

# Keyword arguments unpacked into `model.generate` by `predict`.
# Sampling is enabled (`do_sample`) with top-k / top-p filtering, and
# `num_return_sequences` candidates are requested per input.
genkw = dict(
    max_length=512,
    min_length=64,
    no_repeat_ngram_size=3,
    do_sample=True,
    top_k=60,
    top_p=0.95,
    num_return_sequences=5,
)

In [None]:
# Read the tab-delimited file with the CSV loader, registering it under the
# "test" split, and keep just that split.
dataset = load_dataset(
    "csv",
    data_files={"test": "test-5000.csv"},
    delimiter="\t",
)["test"]
dataset

In [None]:
def predict(batch):
    """Generate candidate outputs for one batch and attach them to it.

    Each entry of ``batch["inputs"]`` is prefixed with the module-level
    ``prefix``, tokenized (padded, truncated to 256 tokens), and passed to
    ``model.generate`` with the module-level ``genkw`` settings. The decoded
    sequences are split into consecutive groups of
    ``genkw["num_return_sequences"]`` and stored under ``batch["predicted"]``.

    Args:
        batch: Mapping with an ``"inputs"`` entry holding the input strings
            (the batched form passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The same ``batch`` object with the ``"predicted"`` entry added.
    """
    prompts = []
    for text in batch["inputs"]:
        prompts.append(prefix + text)

    encoded = tokenizer(
        prompts,
        max_length=256,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    ids = encoded.input_ids.to(device)
    mask = encoded.attention_mask.to(device)

    with torch.no_grad():
        generated = model.generate(input_ids=ids, attention_mask=mask, **genkw)

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Assumes `generate` returns the sequences for each input contiguously and
    # in input order, so fixed-size slices line up with the inputs.
    group_size = genkw["num_return_sequences"]
    grouped = []
    for start in range(0, len(decoded), group_size):
        grouped.append(decoded[start:start + group_size])

    batch["predicted"] = grouped
    return batch
     

In [None]:
# NOTE(review): `sample` (the first 5 rows) is not used in the visible cells;
# the `map` call below runs `predict` over the full `dataset`. Confirm whether
# a quick run on `sample` was intended here.
sample = dataset.select(range(5))
# Apply `predict` in batches of 8 rows; `predict` adds the "predicted" entry.
result = dataset.map(predict, batched=True, batch_size=8)
result

In [None]:
# Read each column once up front. Indexing `result["col"][i]` inside the loop
# fetches the whole column again on every iteration (with `datasets` versions
# where column access returns a full list), making the loop quadratic in the
# number of rows.
input_col = result["inputs"]
target_col = result["targets"]
predicted_col = result["predicted"]

# One record per example: the input, the reference target, and the list of
# generated candidates.
df_list = []
rows = zip(input_col, target_col, predicted_col)
for inputs, targets, predicted in tqdm(rows, total=len(result), position=0):
    df_list.append({
        "input": inputs,
        "target": targets,
        "predicted": predicted,
    })

# Build the frame in one go and persist it as JSON.
df = pd.DataFrame(df_list)
df.to_json("/content/predicted.json")
df.head()

In [None]:
# Reload the predictions file saved above. The path must match the one passed
# to `df.to_json`, otherwise this cell fails on a fresh runtime where no other
# predictions file exists.
df = pd.read_json("/content/predicted.json")
df.head()