In [1]:
import os
import gc
import random
from pathlib import Path

import datasets
import nltk
import numpy as np
import pandas as pd
import torch
import transformers

from datasets import Dataset, concatenate_datasets
from evaluate import combine, load
from functional import seq
from huggingface_hub import notebook_login
from IPython.display import HTML, display
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorForSeq2Seq, GenerationConfig,
                          Seq2SeqTrainer, Seq2SeqTrainingArguments)

from funcutils import get

os.environ["TOKENIZERS_PARALLELISM"] = "true"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
assert torch.cuda.is_available()

NUM_TRAIN_EPOCHS = 5
IS_MULTI_CORPUS = True
TASK = 'mt' # 'd2s' or 's2d' or 'mt' pull from argv
MODEL_CKPNT = "t5-base" # t5-small or t5-base
NATURAL_LANGUAGE = "nl"
STRUCTURED_DATA = "sd"
MULTI_CORP = '-multicorp' if IS_MULTI_CORPUS else ""
LR = 2.0e-4
TRAIN_CHKPNT_NAME = f"models/{MODEL_CKPNT}-finetuned-webnlg-{TASK}-{LR:.1e}{MULTI_CORP}"

TARGET = NATURAL_LANGUAGE if TASK == 'd2s' else STRUCTURED_DATA 
INPUT = STRUCTURED_DATA if TASK == 'd2s' else NATURAL_LANGUAGE 
TRAIN_CHKPNT_NAME

'models/t5-base-finetuned-webnlg-mt-2.0e-04-multicorp'

In [2]:
assert TARGET != INPUT
del NATURAL_LANGUAGE
del STRUCTURED_DATA

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPNT)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
max_input_length = 256
max_target_length = 256
tokenize = lambda x: tokenizer(x, max_length = max_input_length, truncation=True, padding=True)
tokenize

<function __main__.<lambda>(x)>

In [5]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CKPNT)
model = model.to(device)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [6]:
generation_config = GenerationConfig.from_pretrained(MODEL_CKPNT)
generation_config.min_length = 5
generation_config.num_beams = 4
generation_config.max_length = 2048
generation_config.early_stopping = True
generation_config.no_repeat_ngram_size = 2
generation_config.temperature = .9

In [7]:
batch_size = 64 if MODEL_CKPNT == "t5-small" else 16
# START: ADAPTED FROM https://huggingface.co/docs/transformers/tasks/summarization
args = Seq2SeqTrainingArguments(
    TRAIN_CHKPNT_NAME,
    eval_steps=1500,
    evaluation_strategy = "steps",
    learning_rate=LR,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,
    gradient_accumulation_steps=2 if MODEL_CKPNT != 't5-small' else 1, # so we have an effective batch size of 32
    weight_decay=0.01,
    save_total_limit=5,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    save_steps=600,
    generation_config=generation_config,
    generation_max_length=200,
)
# END: ADAPTED FROM https://huggingface.co/docs/transformers/tasks/summarization

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = combine([
    load("rouge"),
])
metric

<evaluate.module.CombinedEvaluations at 0x7f803ed70ca0>

In [9]:
# START: COPIED FROM https://huggingface.co/docs/transformers/tasks/summarization
def compute_metrics(eval_pred):
    # monitor memory and force gc. probably slows us down, probably 
    torchmem = torch.cuda.memory_allocated()
    torchcap = torch.cuda.get_device_properties(0).total_memory

    print(f"torch has allocated {torchmem} of {torchcap}")

    predictions, labels = eval_pred
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return result
# END: COPIED FROM https://huggingface.co/docs/transformers/tasks/summarization

In [10]:
df = pd.read_pickle("~/repos/nlgs-research/pipeline/normalized_data/webnlg_clean.pkl")

if IS_MULTI_CORPUS:
    df = pd.read_pickle("~/repos/nlgs-research/pipeline/normalized_data/webnlg_wikibio_joint.pkl")
df

Unnamed: 0,subset,category,index,sd,nl
4341,train,SportsTeam,2722,Azerbaijan Premier League|champions|Qarabag FK,[The name of the championship football team in...
3299,train,WikBio,393200000,Dov Sternberg|nationality|American;Dov Sternbe...,[Dov Sternberg is an American karateka.]
9142,train,Politician,7523,Abraham A. Ribicoff|successor|John N. Dempsey;...,[Abraham A Ribicoff was born in the U.S. and w...
1157,dev,City,1157,"Albuquerque, New Mexico|area code|505, 575; Al...","[Albuquerque, in New Mexico, has a total are o..."
1257,dev,Monument,1257,"Adams County, Pennsylvania|has to its west|Fra...",[The 11th Mississippi Infantry Monument is a C...
...,...,...,...,...,...
4404,train,SportsTeam,2785,Massimo Drago|club|S.S.D. Potenza Calcio,[Massimo Drago played for S.S.D. Potenza Calci...
1176,dev,City,1176,"United States|demonym|Americans; Albany, Georg...",[The people inhabiting the United States are k...
9363,train,SportsTeam,7744,A.D. Isidro Metapan|ground|Estadio Jorge Caler...,[A.D. (Asociacion Deportiva) Isidro Metapan pl...
2167,train,WikBio,148200000,David Cooke|occupation|rugby union internation...,[David Cooke is a former a rugby union interna...


 we must invent `seed_number` since d2s can output multiple sentences for the
 same data input. So the seed will be a generation parameter, in the case that
 we are working in a deterministic environment, so generation can vary as
 desired. This computes a cartesian product.

In [11]:
cartesian_sd_nl = []
for (i, subset, cat, indx, sd, nl) in df.itertuples():
    for j, nl_option in enumerate(nl):
        pairing = dict(
            record_idx=i,
            seed_number=j,
            subset=subset,
            category=cat,
            split_index=indx,
            sd=sd,
            nl=nl_option,
            task=TASK if TASK != 'mt' else 's2d' 
        )
        cartesian_sd_nl.append(pairing)
        if TASK == "mt":
            reverse_pair = pairing.copy()
            reverse_pair['sd'] = nl_option
            reverse_pair['nl'] = sd
            reverse_pair['task'] = 'd2s'
            cartesian_sd_nl.append(reverse_pair)

# calling this "flattened" because it no longer has nested records
has_not_run = True
flt = pd.DataFrame(cartesian_sd_nl)
flt

Unnamed: 0,record_idx,seed_number,subset,category,split_index,sd,nl,task
0,4341,0,train,SportsTeam,2722,Azerbaijan Premier League|champions|Qarabag FK,The name of the championship football team in ...,s2d
1,4341,0,train,SportsTeam,2722,The name of the championship football team in ...,Azerbaijan Premier League|champions|Qarabag FK,d2s
2,4341,1,train,SportsTeam,2722,Azerbaijan Premier League|champions|Qarabag FK,Qarabag FK are the champions of the Azerbaijan...,s2d
3,4341,1,train,SportsTeam,2722,Qarabag FK are the champions of the Azerbaijan...,Azerbaijan Premier League|champions|Qarabag FK,d2s
4,4341,2,train,SportsTeam,2722,Azerbaijan Premier League|champions|Qarabag FK,The champions of the Azerbaijan Premier League...,s2d
...,...,...,...,...,...,...,...,...
94865,10126,0,train,Artist,8507,"Alfredo Zitarrosa, born in Uruguay, is a music...",Alfredo Zitarrosa|record label|RCA Records; Al...,d2s
94866,10126,1,train,Artist,8507,Alfredo Zitarrosa|record label|RCA Records; Al...,Singer Alfredo Zitarrosa is associated with Ta...,s2d
94867,10126,1,train,Artist,8507,Singer Alfredo Zitarrosa is associated with Ta...,Alfredo Zitarrosa|record label|RCA Records; Al...,d2s
94868,10126,2,train,Artist,8507,Alfredo Zitarrosa|record label|RCA Records; Al...,"Alfredo Zitarrosa, born in Uruguay, plays Taqu...",s2d


In [12]:
# prepend the seed number. This should be rt of the prompt hereafter for `d2s`
# tasks. So, prompting with two different numbers should never generate the same
# output.

if (TASK == "mt") and has_not_run:
    has_not_run = False
    flt['sd'] = flt.task + flt.seed_number.map(lambda x: " " + str(x) + ": ") + flt.sd

    # allow the model to code switch between corpora
    if IS_MULTI_CORPUS:
        flt['sd'] = flt.category.map(lambda x: 'wb' if x == 'WikiBio' else "") + flt.sd
flt

Unnamed: 0,record_idx,seed_number,subset,category,split_index,sd,nl,task
0,4341,0,train,SportsTeam,2722,s2d 0: Azerbaijan Premier League|champions|Qar...,The name of the championship football team in ...,s2d
1,4341,0,train,SportsTeam,2722,d2s 0: The name of the championship football t...,Azerbaijan Premier League|champions|Qarabag FK,d2s
2,4341,1,train,SportsTeam,2722,s2d 1: Azerbaijan Premier League|champions|Qar...,Qarabag FK are the champions of the Azerbaijan...,s2d
3,4341,1,train,SportsTeam,2722,d2s 1: Qarabag FK are the champions of the Aze...,Azerbaijan Premier League|champions|Qarabag FK,d2s
4,4341,2,train,SportsTeam,2722,s2d 2: Azerbaijan Premier League|champions|Qar...,The champions of the Azerbaijan Premier League...,s2d
...,...,...,...,...,...,...,...,...
94865,10126,0,train,Artist,8507,"d2s 0: Alfredo Zitarrosa, born in Uruguay, is ...",Alfredo Zitarrosa|record label|RCA Records; Al...,d2s
94866,10126,1,train,Artist,8507,s2d 1: Alfredo Zitarrosa|record label|RCA Reco...,Singer Alfredo Zitarrosa is associated with Ta...,s2d
94867,10126,1,train,Artist,8507,d2s 1: Singer Alfredo Zitarrosa is associated ...,Alfredo Zitarrosa|record label|RCA Records; Al...,d2s
94868,10126,2,train,Artist,8507,s2d 2: Alfredo Zitarrosa|record label|RCA Reco...,"Alfredo Zitarrosa, born in Uruguay, plays Taqu...",s2d


In [13]:
tokenized = tokenize(list(flt[INPUT].values))

 !!Heads-up!! The following fields comprise the "interface" of the model,
 despite the fact the documentation doesn't make this obvious. Without these
 particular names, ['input_ids', 'attention_mask', 'labels'],
 the model will not train and provide cryptic error messges.

In [14]:
flt['input_ids'] = tokenized['input_ids']
flt['attention_mask'] = tokenized['attention_mask']
flt['labels'] = flt[TARGET].map(lambda x: tokenize(x)['input_ids'])
flt['input_ids'].map(len)

0        256
1        256
2        256
3        256
4        256
        ... 
94865    256
94866    256
94867    256
94868    256
94869    256
Name: input_ids, Length: 94870, dtype: int64

In [15]:
flt

Unnamed: 0,record_idx,seed_number,subset,category,split_index,sd,nl,task,input_ids,attention_mask,labels
0,4341,0,train,SportsTeam,2722,s2d 0: Azerbaijan Premier League|champions|Qar...,The name of the championship football team in ...,s2d,"[37, 564, 13, 8, 10183, 3370, 372, 16, 8, 71, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 7, 357, 26, 3, 632, 10, 71, 2558, 9441, 70..."
1,4341,0,train,SportsTeam,2722,d2s 0: The name of the championship football t...,Azerbaijan Premier League|champions|Qarabag FK,d2s,"[71, 2558, 9441, 7066, 6552, 3815, 9175, 17788...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 26, 357, 7, 3, 632, 10, 37, 564, 13, 8, 10..."
2,4341,1,train,SportsTeam,2722,s2d 1: Azerbaijan Premier League|champions|Qar...,Qarabag FK are the champions of the Azerbaijan...,s2d,"[1593, 2551, 7893, 377, 439, 33, 8, 6336, 7, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 7, 357, 26, 209, 10, 71, 2558, 9441, 7066,..."
3,4341,1,train,SportsTeam,2722,d2s 1: Qarabag FK are the champions of the Aze...,Azerbaijan Premier League|champions|Qarabag FK,d2s,"[71, 2558, 9441, 7066, 6552, 3815, 9175, 17788...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 26, 357, 7, 209, 10, 1593, 2551, 7893, 377..."
4,4341,2,train,SportsTeam,2722,s2d 2: Azerbaijan Premier League|champions|Qar...,The champions of the Azerbaijan Premier League...,s2d,"[37, 6336, 7, 13, 8, 71, 2558, 9441, 7066, 655...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 7, 357, 26, 204, 10, 71, 2558, 9441, 7066,..."
...,...,...,...,...,...,...,...,...,...,...,...
94865,10126,0,train,Artist,8507,"d2s 0: Alfredo Zitarrosa, born in Uruguay, is ...",Alfredo Zitarrosa|record label|RCA Records; Al...,d2s,"[19850, 32, 3969, 2046, 1859, 9, 9175, 60, 762...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 26, 357, 7, 3, 632, 10, 19850, 32, 3969, 2..."
94866,10126,1,train,Artist,8507,s2d 1: Alfredo Zitarrosa|record label|RCA Reco...,Singer Alfredo Zitarrosa is associated with Ta...,s2d,"[24366, 19850, 32, 3969, 2046, 1859, 9, 19, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 7, 357, 26, 209, 10, 19850, 32, 3969, 2046..."
94867,10126,1,train,Artist,8507,d2s 1: Singer Alfredo Zitarrosa is associated ...,Alfredo Zitarrosa|record label|RCA Records; Al...,d2s,"[19850, 32, 3969, 2046, 1859, 9, 9175, 60, 762...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 26, 357, 7, 209, 10, 24366, 19850, 32, 396..."
94868,10126,2,train,Artist,8507,s2d 2: Alfredo Zitarrosa|record label|RCA Reco...,"Alfredo Zitarrosa, born in Uruguay, plays Taqu...",s2d,"[19850, 32, 3969, 2046, 1859, 9, 6, 2170, 16, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 7, 357, 26, 204, 10, 19850, 32, 3969, 2046..."


In [16]:
# this will keep only the needed fields in memory on the GPU
def pd_to_dataset(df: pd.DataFrame, split='train') -> Dataset:
    print(df)
    d = df[df.subset== split][['input_ids','attention_mask','labels']]
    return Dataset.from_pandas(d)
        
# get_ds alias should bake in the desired argument. Makes you wish python
# supported currying
get_ds = lambda x: pd_to_dataset(flt, x)
tds = get_ds('train')
eds = get_ds('dev')
tds

       record_idx  seed_number subset    category  split_index  \
0            4341            0  train  SportsTeam         2722   
1            4341            0  train  SportsTeam         2722   
2            4341            1  train  SportsTeam         2722   
3            4341            1  train  SportsTeam         2722   
4            4341            2  train  SportsTeam         2722   
...           ...          ...    ...         ...          ...   
94865       10126            0  train      Artist         8507   
94866       10126            1  train      Artist         8507   
94867       10126            1  train      Artist         8507   
94868       10126            2  train      Artist         8507   
94869       10126            2  train      Artist         8507   

                                                      sd  \
0      s2d 0: Azerbaijan Premier League|champions|Qar...   
1      d2s 0: The name of the championship football t...   
2      s2d 1: Azerbaijan Pr

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', '__index_level_0__'],
    num_rows: 77790
})

In [17]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tds,
    eval_dataset=eds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [18]:
# we must try-catch because resume_from_checkpoint throws a value error (for
# some reason instead of raising a warning) if training did not begin first.
try:
    trainer.train(resume_from_checkpoint=True)
except ValueError as e:
    print(e)
    trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


No valid checkpoint found in output directory (models/t5-base-finetuned-webnlg-mt-2.0e-04-multicorp)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1500,0.5728,0.328671,0.827012,0.64889,0.697634,0.723099,44.775834
3000,0.47,0.291974,0.839568,0.668991,0.718082,0.743445,44.18443
4500,0.4476,0.273844,0.846924,0.680602,0.733458,0.756057,44.451344
6000,0.4157,0.280615,0.848642,0.683357,0.736946,0.760404,44.544486




torch has allocated 2733953536 of 25447170048
torch has allocated 2733953536 of 25447170048
torch has allocated 2733196800 of 25447170048
torch has allocated 2732511744 of 25447170048


In [None]:
if False:
    trainer.push_to_hub()

In [None]:
try:
    del tds
    del eds
    del tds
except Exception as e:
    print(e)

In [None]:
tds = get_ds('test')
# debug = Dataset.from_dict(tds[0:2])
predictions = trainer.predict(tds)
predictions

In [None]:
flat_keep_positive = lambda x: [e for e in x if e > 1]
pred_df = pd.DataFrame(columns=['pred_ids'], data=pd.Series(list(predictions.predictions)))
decoded = pred_df.pred_ids.map(flat_keep_positive).map(tokenizer.decode)
pred_df['decoded'] = decoded
pred_df['subset'] = 'test'
pred_df

In [None]:
test_set = flt[flt.subset == 'test'].copy()
test_set['pred_ids'] = list(pred_df['pred_ids'].values)
test_set['decoded'] = list(pred_df['decoded'].values)
test_set

In [None]:
save_fname = f"~/repos/nlgs-research/pipeline/predictions/{TASK}-{MODEL_CKPNT}-{NUM_TRAIN_EPOCHS}{MULTI_CORP}.pkl"
test_set.to_pickle(save_fname)
save_fname

 ## Sanity Checks

In [None]:
def text_to_prediction_single(text):
    tensors = tokenizer("<pad>" + text + "</s>", return_tensors='pt').to(device)['input_ids']
    generation = trainer.model.generate(tensors,
        early_stopping=True,
        num_beams=5,
        max_new_tokens=1024,
        temperature=.9,
    ) 
    return tokenizer.decode(generation[0], skip_special_tokens=True)

t = "The leader of Aarhus is Jacob Bundsgaard."
text_to_prediction_single(t)

In [None]:
print("\n".join(map(tokenizer.decode,
                np.where(predictions.predictions != -100, predictions.predictions, tokenizer.pad_token_id)
                )))

In [None]:
if TASK == "mt":
    print("\n".join(map(text_to_prediction_single, [
        'd2s 0: Aarhus|leader name|Jacob Bundsgaard',
        'd2s 1: Aarhus|leader name|Jacob Bundsgaard',
        "d2s 0: United States|leader name|Barack Obama ",
        's2d 0: The leader of Aarhus is Jacob Bundsgaard.',
        "s2d 0: Linus Torvalds was born in Helsinki, Finland. He is the son of journalists Anna and Nils Torvalds",
        "s2d 1: Linus Torvalds was born in Helsinki, Finland. He is the son of journalists Anna and Nils Torvalds",
    ])))
else:
    print("\n".join(map(text_to_prediction_single, [
        'Aarhus|leader name|Jacob Bundsgaard',
        'Aarhus|leader name|Jacob Bundsgaard',
        "United States|leader name|Barack Obama ",
        'The leader of Aarhus is Jacob Bundsgaard.',
        "Linus Torvalds was born in Helsinki, Finland. He is the son of journalists Anna and Nils Torvalds",
        "Linus Torvalds was born in Helsinki, Finland. He is the son of journalists Anna and Nils Torvalds",
    ])))