In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the pretrained T5-small tokenizer and seq2seq model.
# model_max_length is passed explicitly (same 512-token limit that is applied
# by default today) so that `truncation=True` does not depend on the
# deprecated implicit value the library warns about.
tokenizer = AutoTokenizer.from_pretrained("t5-small", model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

  from .autonotebook import tqdm as notebook_tqdm
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
import pandas as pd

def linearize1(table, highlighted_cells, value_sep=" : ", row_sep=" ; ", return_text=True, includes_header=True,):
    """Flatten a table, or a selection of its cells, into a sequence of values.

    Args:
        table: Any input accepted by ``pd.DataFrame``. Cells that are lists
            are collapsed into a single string joined by ``" , "``.
        highlighted_cells: Iterable of ``(row, column)`` positional index
            pairs to emit, in the given order. ``None`` emits every cell in
            row-major order.
        value_sep: Separator placed between a column label and a cell value.
        row_sep: Separator placed between emitted cells when joining to text.
        return_text: If true, return one joined string; otherwise return the
            list of per-cell values.
        includes_header: If true, each cell is rendered as
            ``<column label><value_sep><value>``; otherwise the bare cell
            value is emitted.

    Returns:
        A string when ``return_text`` is true, else a list with one entry per
        emitted cell.
    """
    def _join_list(cell):
        return " , ".join(cell) if isinstance(cell, list) else cell

    frame = pd.DataFrame(table)
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    # whichever the installed pandas provides.
    elementwise = frame.map if hasattr(pd.DataFrame, "map") else frame.applymap
    frame = elementwise(_join_list)

    if highlighted_cells is None:
        cells = (
            (row, col)
            for row in range(len(frame))
            for col in range(len(frame.columns))
        )
    else:
        cells = highlighted_cells

    values = []
    for row, col in cells:
        cell = frame.iloc[row, col]
        if includes_header:
            # str() keeps non-string column labels (e.g. the default integer
            # labels) and non-string cells from raising TypeError on "+".
            values.append(str(frame.columns[col]) + value_sep + str(cell))
        else:
            values.append(cell)

    if return_text:
        # Bare cell values may be non-strings; join needs strings.
        return row_sep.join(map(str, values))
    return values

In [4]:
import jsonlines
import json

# Linearized table inputs ("predictions") and their target sentences
# ("references"), index-aligned.
predictions = []
references = []

# Iterate the reader lazily so the whole file is never held in memory at once.
with jsonlines.open("../data/totto/train.jsonl") as reader:
    for jobj in reader:
        # "table" is stored as a JSON-encoded string, hence the extra json.loads.
        predictions.append(linearize1(json.loads(jobj["table"]), jobj["highlighted_cells"]))
        # Only the first annotation's final sentence is kept as the reference.
        references.append(jobj["sentence_annotations"][0]["final_sentence"])

In [5]:
from datasets import Dataset

# Wrap the two parallel lists in a Dataset with one column of linearized
# tables ("predictions") and one of target sentences ("references").
dataset = Dataset.from_dict({"predictions": predictions, "references": references})

In [6]:
def tokenize(batch):
    """Tokenize a batch of linearized tables and their target sentences.

    Args:
        batch: Mapping with a "predictions" list (source texts) and a
            "references" list (target texts) of equal length.

    Returns:
        The tokenizer's encoding of the sources, plus a "labels" entry holding
        the token ids of the targets with padding positions set to -100.
    """
    # The targets go through `text_target` so they become "labels". Passed as
    # the second positional argument they would be treated as `text_pair` and
    # concatenated into the encoder input instead.
    encoded = tokenizer(
        batch["predictions"],
        text_target=batch["references"],
        truncation=True,
        padding=True,
    )
    # -100 is the label value that compute_metrics maps back to the pad token;
    # using it for padding keeps padded positions out of the training loss.
    pad_id = tokenizer.pad_token_id
    encoded["labels"] = [
        [token if token != pad_id else -100 for token in label]
        for label in encoded["labels"]
    ]
    return encoded

In [7]:
# Apply `tokenize` in batches of 32 rows and drop the raw text columns so only
# the tokenizer's output columns remain.
tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=32, remove_columns=["predictions", "references"])

100%|██████████| 3774/3774 [00:30<00:00, 122.19ba/s]


In [8]:
# Sample 1% of the examples for training and a further 1% for evaluation.
# The fixed seed makes the shuffled split reproducible across kernel restarts.
datasets = tokenized_dataset.train_test_split(train_size=0.01, test_size=0.01, seed=42)

In [9]:
# Pull the two splits out of the split dictionary under shorter names.
train_dataset, test_dataset = datasets["train"], datasets["test"]

In [10]:
import evaluate
import numpy as np
# BLEU metric object; used by compute_metrics and by the final scoring cell.
bleu = evaluate.load("bleu")


def lmap(f, x):
    """Apply ``f`` to every item of ``x`` and return the results as a list."""
    return [f(item) for item in x]

def compute_metrics(eval_predictions):
    """Decode predicted and reference token ids and score them with BLEU.

    Args:
        eval_predictions: Object exposing ``predictions`` and ``label_ids``
            array-likes of token ids. ``predictions`` may also be a tuple
            whose first element is the prediction array, or a 3-D array of
            per-token scores, in which case the arg-max id is decoded.

    Returns:
        The result of ``bleu.compute`` for the decoded, whitespace-stripped
        predictions against the decoded references.
    """
    pad_id = tokenizer.pad_token_id

    def to_text(token_ids):
        # -100 marks ignored positions and is not a vocabulary id; swap it for
        # the pad token on a copy so the caller's array is left untouched.
        ids = np.asarray(token_ids)
        ids = np.where(ids == -100, pad_id, ids)
        decoded = tokenizer.batch_decode(ids, skip_special_tokens=True)
        return [text.strip() for text in decoded]

    pred_ids = eval_predictions.predictions
    if isinstance(pred_ids, tuple):
        # NOTE(review): assumes the first tuple element holds the predictions
        # (logits or ids) and the rest are auxiliary model outputs - confirm.
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)
    if pred_ids.ndim == 3:
        # Per-token scores over the vocabulary: reduce to the most likely id.
        pred_ids = pred_ids.argmax(axis=-1)

    pred_str = to_text(pred_ids)
    label_str = to_text(eval_predictions.label_ids)
    # `compute` takes its inputs as keyword arguments.
    return bleu.compute(predictions=pred_str, references=label_str)

In [11]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# predict_with_generate makes evaluation run generation, so compute_metrics
# receives token ids it can decode rather than raw model outputs.
args = Seq2SeqTrainingArguments(output_dir="../temp/totto", predict_with_generate=True)

# Seq2seq-aware collator: pads the inputs and also pads "labels" with -100,
# which a plain padding collator does not do.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

NVIDIA GeForce RTX 3090 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA GeForce RTX 3090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

***** Running training *****
  Num examples = 1207
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 114
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mclapika[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


: 

: 

In [None]:
import evaluate

# BLEU of the linearized table strings themselves against the reference
# sentences, over every example loaded from the training file. No model
# output is involved here.
# NOTE(review): presumably meant as a copy-the-input baseline - confirm.
results = bleu.compute(predictions=predictions, references=references)