In [1]:
import os
import gc
import random
from pathlib import Path

import datasets
import nltk
import numpy as np
import pandas as pd
import torch
import transformers

from datasets import Dataset, concatenate_datasets
from evaluate import combine, load
from functional import seq
from huggingface_hub import notebook_login
from IPython.display import HTML, display
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorForSeq2Seq, GenerationConfig,
                          Seq2SeqTrainer, Seq2SeqTrainingArguments)

from funcutils import get

os.environ["TOKENIZERS_PARALLELISM"] = "true"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
assert torch.cuda.is_available()

NUM_TRAIN_EPOCHS = 5
TASK = 'd2s' # 'd2s' or 's2d' or 'mt' pull from argv
MODEL_CKPNT = "t5-base" # t5-small or t5-base
NATURAL_LANGUAGE = "nl"
STRUCTURED_DATA = "sd"
LR = 2.0e-4
TRAIN_CHKPNT_NAME = f"models/{MODEL_CKPNT}-finetuned-webnlg-{TASK}-{LR:.1e}"

TARGET = NATURAL_LANGUAGE if TASK == 'd2s' else STRUCTURED_DATA 
INPUT = STRUCTURED_DATA if TASK == 'd2s' else NATURAL_LANGUAGE 

In [2]:
assert TARGET != INPUT
del NATURAL_LANGUAGE
del STRUCTURED_DATA

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPNT)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
max_input_length = 256
max_target_length = 256
tokenize = lambda x: tokenizer(x, max_length = max_input_length, truncation=True, padding=True)
tokenize

<function __main__.<lambda>(x)>

In [5]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CKPNT)
model = model.to(device)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [6]:
generation_config = GenerationConfig.from_pretrained(MODEL_CKPNT)
generation_config.min_length = 5
generation_config.num_beams = 4
generation_config.max_length = 2048
generation_config.early_stopping = True
generation_config.no_repeat_ngram_size = 2
generation_config.temperature = .9

In [7]:
batch_size = 64 if MODEL_CKPNT == "t5-small" else 16
# START: ADAPTED FROM https://huggingface.co/docs/transformers/tasks/summarization
args = Seq2SeqTrainingArguments(
    TRAIN_CHKPNT_NAME,
    eval_steps=1500,
    evaluation_strategy = "steps",
    learning_rate=LR,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    save_steps=600,
    generation_config=generation_config,
    generation_max_length=200,
)
# END: ADAPTED FROM https://huggingface.co/docs/transformers/tasks/summarization

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = combine([
    load("rouge"),
])
metric

<evaluate.module.CombinedEvaluations at 0x7fbb202602e0>

In [9]:
# START: COPIED FROM https://huggingface.co/docs/transformers/tasks/summarization
def compute_metrics(eval_pred):
    # monitor memory and force gc. probably slows us down, probably 
    torchmem = torch.cuda.memory_allocated()
    torchcap = torch.cuda.get_device_properties(0).total_memory

    print(f"torch has allocated {torchmem} of {torchcap}")

    predictions, labels = eval_pred
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return result
# END: COPIED FROM https://huggingface.co/docs/transformers/tasks/summarization

In [10]:
df = pd.read_pickle("~/repos/nlgs-research/pipeline/normalized_data/webnlg_clean.pkl")
df

Unnamed: 0,subset,category,index,sd,nl
0,dev,Airport,0,Aarhus|leader name|Jacob Bundsgaard,[The leader of Aarhus is Jacob Bundsgaard.]
1,dev,Airport,1,Aarhus Airport|runway length|2702.0,"[Aarhus Airport's runway length is 2702.0., Th..."
2,dev,Airport,2,Adirondack Regional Airport|elevation above th...,[Adirondack Regional Airport is 507 metres abo...
3,dev,Airport,3,Adirondack Regional Airport|location|Harrietst...,[Adirondack Regional airport is located at Har...
4,dev,Airport,4,Adolfo Suarez Madrid-Barajas Airport|location|...,[Adolfo Suarez Madrid-Barajas Airport is found...
...,...,...,...,...,...
16090,test,University,1595,Acharya Institute of Technology|was given the ...,[The Acharya Institute of Technology in Karnat...
16091,test,University,1596,Romania|ethnic group|Germans of Romania; Roman...,[The Germans of Romania are one of the ethnic ...
16092,test,University,1597,School of Business and Social Sciences at the ...,[The School of Business and Social Sciences at...
16093,test,University,1598,School of Business and Social Sciences at the ...,[Denmark is led by the Monarchy of Demark and ...


 we must invent `seed_number` since d2s can output multiple sentences for the
 same data input. So the seed will be a generation parameter, in the case that
 we are working in a deterministic environment, so generation can vary as
 desired. This computes a cartesian product.

In [11]:
cartesian_sd_nl = []
for (i, subset, cat, indx, sd, nl) in df.itertuples():
    for j, nl_option in enumerate(nl):
        pairing = dict(
            record_idx=i,
            seed_number=j,
            subset=subset,
            category=cat,
            split_index=indx,
            sd=sd,
            nl=nl_option,
            task=TASK if TASK != 'mt' else 's2d' 
        )
        cartesian_sd_nl.append(pairing)
        if TASK == "mt":
            reverse_pair = pairing.copy()
            reverse_pair['sd'] = nl_option
            reverse_pair['nl'] = sd
            reverse_pair['task'] = 'd2s'
            cartesian_sd_nl.append(reverse_pair)

# calling this "flattened" because it no longer has nested records
has_not_run = True
flt = pd.DataFrame(cartesian_sd_nl)
flt

Unnamed: 0,record_idx,seed_number,subset,category,split_index,sd,nl,task
0,0,0,dev,Airport,0,Aarhus|leader name|Jacob Bundsgaard,The leader of Aarhus is Jacob Bundsgaard.,d2s
1,1,0,dev,Airport,1,Aarhus Airport|runway length|2702.0,Aarhus Airport's runway length is 2702.0.,d2s
2,1,1,dev,Airport,1,Aarhus Airport|runway length|2702.0,The Aarhus Airport has a runway length of 2702.0.,d2s
3,2,0,dev,Airport,2,Adirondack Regional Airport|elevation above th...,Adirondack Regional Airport is 507 metres abov...,d2s
4,3,0,dev,Airport,3,Adirondack Regional Airport|location|Harrietst...,Adirondack Regional airport is located at Harr...,d2s
...,...,...,...,...,...,...,...,...
42887,16092,1,test,University,1597,School of Business and Social Sciences at the ...,"Established in 1928, the School of Business an...",d2s
42888,16093,0,test,University,1598,School of Business and Social Sciences at the ...,Denmark is led by the Monarchy of Demark and t...,d2s
42889,16093,1,test,University,1598,School of Business and Social Sciences at the ...,The School of Business and Social Sciences at ...,d2s
42890,16093,2,test,University,1598,School of Business and Social Sciences at the ...,The School of Business and Social Sciences at ...,d2s


In [12]:
# prepend the seed number. This should be rt of the prompt hereafter for `d2s`
# tasks. So, prompting with two different numbers should never generate the same
# output.

if (TASK == "mt") and has_not_run:
    has_not_run = False
    flt['sd'] = flt.task + flt.seed_number.map(lambda x: " " + str(x) + ": ") + flt.sd
flt

Unnamed: 0,record_idx,seed_number,subset,category,split_index,sd,nl,task
0,0,0,dev,Airport,0,Aarhus|leader name|Jacob Bundsgaard,The leader of Aarhus is Jacob Bundsgaard.,d2s
1,1,0,dev,Airport,1,Aarhus Airport|runway length|2702.0,Aarhus Airport's runway length is 2702.0.,d2s
2,1,1,dev,Airport,1,Aarhus Airport|runway length|2702.0,The Aarhus Airport has a runway length of 2702.0.,d2s
3,2,0,dev,Airport,2,Adirondack Regional Airport|elevation above th...,Adirondack Regional Airport is 507 metres abov...,d2s
4,3,0,dev,Airport,3,Adirondack Regional Airport|location|Harrietst...,Adirondack Regional airport is located at Harr...,d2s
...,...,...,...,...,...,...,...,...
42887,16092,1,test,University,1597,School of Business and Social Sciences at the ...,"Established in 1928, the School of Business an...",d2s
42888,16093,0,test,University,1598,School of Business and Social Sciences at the ...,Denmark is led by the Monarchy of Demark and t...,d2s
42889,16093,1,test,University,1598,School of Business and Social Sciences at the ...,The School of Business and Social Sciences at ...,d2s
42890,16093,2,test,University,1598,School of Business and Social Sciences at the ...,The School of Business and Social Sciences at ...,d2s


In [13]:
tokenized = tokenize(list(flt[INPUT].values))

 !!Heads-up!! The following fields comprise the "interface" of the model,
 despite the fact the documentation doesn't make this obvious. Without these
 particular names, ['input_ids', 'attention_mask', 'labels'],
 the model will not train and provide cryptic error messges.

In [14]:
flt['input_ids'] = tokenized['input_ids']
flt['attention_mask'] = tokenized['attention_mask']
flt['labels'] = flt[TARGET].map(lambda x: tokenize(x)['input_ids'])
flt['input_ids'].map(len)

0        184
1        184
2        184
3        184
4        184
        ... 
42887    184
42888    184
42889    184
42890    184
42891    184
Name: input_ids, Length: 42892, dtype: int64

In [15]:
# this will keep only the needed fields in memory on the GPU
def pd_to_dataset(df: pd.DataFrame, split='train') -> Dataset:
    d = df[df.subset== split ][['input_ids','attention_mask','labels']]
    return Dataset.from_pandas(d)
        
# get_ds alias should bake in the desired argument. Makes you wish python
# supported currying
get_ds = lambda x: pd_to_dataset(flt, x)
tds = get_ds('train')
eds = get_ds('dev')

In [16]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tds,
    eval_dataset=eds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [17]:
# we must try-catch because resume_from_checkpoint throws a value error (for
# some reason instead of raising a warning) if training did not begin first.
try:
    trainer.train(resume_from_checkpoint=True)
except ValueError as e:
    print(e)
    trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


No valid checkpoint found in output directory (models/t5-base-finetuned-webnlg-d2s-2.0e-04)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1500,0.7533,0.61853,0.770957,0.540883,0.621237,0.667477,30.652224
3000,0.6245,0.573596,0.779671,0.55071,0.626526,0.672554,31.495829
4500,0.5848,0.55276,0.781921,0.553201,0.630053,0.67375,30.927016
6000,0.5496,0.53459,0.785272,0.560538,0.637784,0.683341,31.05607
7500,0.693,0.693579,0.763529,0.528014,0.612542,0.661772,30.55051
9000,3.1635,2.152415,0.29312,0.178663,0.26085,0.263785,124.71177
10500,3.1442,2.118665,0.283593,0.172058,0.252139,0.254687,128.824606




torch has allocated 2718500352 of 25447170048
torch has allocated 2718500352 of 25447170048
torch has allocated 2718500352 of 25447170048
torch has allocated 2718500352 of 25447170048
torch has allocated 2718500352 of 25447170048
torch has allocated 2721875456 of 25447170048
torch has allocated 2721875456 of 25447170048


In [18]:
try:
    del tds
    del eds
    del tds
except Exception as e:
    print(e)

name 'tds' is not defined


In [19]:
tds = get_ds('test')
# debug = Dataset.from_dict(tds[0:2])
predictions = trainer.predict(tds)
predictions

torch has allocated 2721873920 of 25447170048


PredictionOutput(predictions=array([[   0,  891,   23, ..., -100, -100, -100],
       [   0,  891,   23, ..., -100, -100, -100],
       [   0, 1980,   32, ..., -100, -100, -100],
       ...,
       [   0,   37, 1121, ..., 1069, 1069, 1069],
       [   0,   37, 1121, ..., 1069, 1069, 1069],
       [   0,   37, 1121, ..., 1069, 1069, 1069]]), label_ids=array([[  891,    23, 14205, ...,  -100,  -100,  -100],
       [  891,    23, 14205, ...,  -100,  -100,  -100],
       [ 1980,    32,    40, ...,  -100,  -100,  -100],
       ...,
       [   37,  1121,    13, ...,  -100,  -100,  -100],
       [   37,  1121,    13, ...,  -100,  -100,  -100],
       [   37,  1121,    13, ...,  -100,  -100,  -100]]), metrics={'test_loss': 2.118274688720703, 'test_rouge1': 0.29694476847401674, 'test_rouge2': 0.18534019546230235, 'test_rougeL': 0.2666047535055997, 'test_rougeLsum': 0.26999764379145086, 'test_gen_len': 125.05255681818181, 'test_runtime': 989.5563, 'test_samples_per_second': 4.269, 'test_steps_pe

In [20]:
flat_keep_positive = lambda x: [e for e in x if e > 1]
pred_df = pd.DataFrame(columns=['pred_ids'], data=pd.Series(list(predictions.predictions)))
decoded = pred_df.pred_ids.map(flat_keep_positive).map(tokenizer.decode)
pred_df['decoded'] = decoded
pred_df['subset'] = 'test'
pred_df

Unnamed: 0,pred_ids,decoded,subset
0,"[0, 891, 23, 14205, 7676, 5735, 4657, 8, 690, ...",Abilene Regional Airport serves the city of Ab...,test
1,"[0, 891, 23, 14205, 7676, 5735, 4657, 8, 690, ...",Abilene Regional Airport serves the city of Ab...,test
2,"[0, 1980, 32, 40, 89, 32, 1923, 9, 2638, 12033...",Adolfo Suarez Madrid-Barajas Airport is locate...,test
3,"[0, 1980, 32, 40, 89, 32, 1923, 9, 2638, 12033...",Adolfo Suarez Madrid-Barajas Airport is locate...,test
4,"[0, 1980, 32, 40, 89, 32, 1923, 9, 2638, 12033...",Adolfo Suarez Madrid-Barajas Airport is locate...,test
...,...,...,...
4219,"[0, 37, 1121, 13, 1769, 11, 2730, 9226, 636, 1...",The School of Business and Social Sciences Uni...,test
4220,"[0, 37, 1121, 13, 1769, 11, 2730, 9226, 1121, ...",The School of Business and Social Sciences Sch...,test
4221,"[0, 37, 1121, 13, 1769, 11, 2730, 9226, 1121, ...",The School of Business and Social Sciences Sch...,test
4222,"[0, 37, 1121, 13, 1769, 11, 2730, 9226, 1121, ...",The School of Business and Social Sciences Sch...,test


In [21]:
test_set = flt[flt.subset == 'test'].copy()
test_set['pred_ids'] = list(pred_df['pred_ids'].values)
test_set['decoded'] = list(pred_df['decoded'].values)
test_set

Unnamed: 0,record_idx,seed_number,subset,category,split_index,sd,nl,task,input_ids,attention_mask,labels,pred_ids,decoded
38668,14495,0,test,Airport,0,"Abilene Regional Airport|city served|Abilene, ...","Abilene, Texas is served by the Abilene region...",d2s,"[891, 23, 14205, 7676, 5735, 9175, 6726, 2098,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[891, 23, 14205, 6, 2514, 19, 2098, 57, 8, 891...","[0, 891, 23, 14205, 7676, 5735, 4657, 8, 690, ...",Abilene Regional Airport serves the city of Ab...
38669,14495,1,test,Airport,0,"Abilene Regional Airport|city served|Abilene, ...",Abilene Regional Airport serves the city of Ab...,d2s,"[891, 23, 14205, 7676, 5735, 9175, 6726, 2098,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[891, 23, 14205, 7676, 5735, 4657, 8, 690, 13,...","[0, 891, 23, 14205, 7676, 5735, 4657, 8, 690, ...",Abilene Regional Airport serves the city of Ab...
38670,14496,0,test,Airport,1,Adolfo Suarez Madrid-Barajas Airport|location|...,Adolfo Suarez Madrid-Barajas Airport can be fo...,d2s,"[1980, 32, 40, 89, 32, 1923, 9, 2638, 12033, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1980, 32, 40, 89, 32, 1923, 9, 2638, 12033, 1...","[0, 1980, 32, 40, 89, 32, 1923, 9, 2638, 12033...",Adolfo Suarez Madrid-Barajas Airport is locate...
38671,14496,1,test,Airport,1,Adolfo Suarez Madrid-Barajas Airport|location|...,Adolfo Suarez Madrid-Barajas airport is locate...,d2s,"[1980, 32, 40, 89, 32, 1923, 9, 2638, 12033, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1980, 32, 40, 89, 32, 1923, 9, 2638, 12033, 1...","[0, 1980, 32, 40, 89, 32, 1923, 9, 2638, 12033...",Adolfo Suarez Madrid-Barajas Airport is locate...
38672,14496,2,test,Airport,1,Adolfo Suarez Madrid-Barajas Airport|location|...,Adolfo Suarez Madrid-Barajas Airport is locate...,d2s,"[1980, 32, 40, 89, 32, 1923, 9, 2638, 12033, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1980, 32, 40, 89, 32, 1923, 9, 2638, 12033, 1...","[0, 1980, 32, 40, 89, 32, 1923, 9, 2638, 12033...",Adolfo Suarez Madrid-Barajas Airport is locate...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42887,16092,1,test,University,1597,School of Business and Social Sciences at the ...,"Established in 1928, the School of Business an...",d2s,"[1121, 13, 1769, 11, 2730, 9226, 44, 8, 71, 29...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[25275, 16, 29004, 6, 8, 1121, 13, 1769, 11, 2...","[0, 37, 1121, 13, 1769, 11, 2730, 9226, 636, 1...",The School of Business and Social Sciences Uni...
42888,16093,0,test,University,1598,School of Business and Social Sciences at the ...,Denmark is led by the Monarchy of Demark and t...,d2s,"[1121, 13, 1769, 11, 2730, 9226, 44, 8, 71, 29...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[18001, 19, 2237, 57, 8, 2963, 7064, 63, 13, 3...","[0, 37, 1121, 13, 1769, 11, 2730, 9226, 1121, ...",The School of Business and Social Sciences Sch...
42889,16093,1,test,University,1598,School of Business and Social Sciences at the ...,The School of Business and Social Sciences at ...,d2s,"[1121, 13, 1769, 11, 2730, 9226, 44, 8, 71, 29...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[37, 1121, 13, 1769, 11, 2730, 9226, 44, 8, 71...","[0, 37, 1121, 13, 1769, 11, 2730, 9226, 1121, ...",The School of Business and Social Sciences Sch...
42890,16093,2,test,University,1598,School of Business and Social Sciences at the ...,The School of Business and Social Sciences at ...,d2s,"[1121, 13, 1769, 11, 2730, 9226, 44, 8, 71, 29...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[37, 1121, 13, 1769, 11, 2730, 9226, 44, 8, 71...","[0, 37, 1121, 13, 1769, 11, 2730, 9226, 1121, ...",The School of Business and Social Sciences Sch...


In [22]:
save_fname = f"~/repos/nlgs-research/pipeline/predictions/{TASK}-{MODEL_CKPNT}-{NUM_TRAIN_EPOCHS}.pkl"
test_set.to_pickle(save_fname)
save_fname

'~/repos/nlgs-research/pipeline/predictions/d2s-t5-base-5.pkl'

 ## Sanity Checks

In [23]:
def text_to_prediction_single(text):
    tensors = tokenizer("<pad>" + text + "</s>", return_tensors='pt').to(device)['input_ids']
    generation = trainer.model.generate(tensors,
        early_stopping=True,
        num_beams=5,
        max_new_tokens=1024,
        temperature=.9,
    ) 
    return tokenizer.decode(generation[0], skip_special_tokens=True)

t = "The leader of Aarhus is Jacob Bundsgaard."
text_to_prediction_single(t)

'Jacob Bundsgaard is the leader of Aarhus.'

In [24]:
print("\n".join(map(tokenizer.decode,
                np.where(predictions.predictions != -100, predictions.predictions, tokenizer.pad_token_id)
                )))

<pad> Abilene Regional Airport serves the city of Abilene Texas.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<pad> Abile

In [25]:
if TASK == "mt":
    print("\n".join(map(text_to_prediction_single, [
        'd2s 0: Aarhus|leader name|Jacob Bundsgaard',
        'd2s 1: Aarhus|leader name|Jacob Bundsgaard',
        "d2s 0: United States|leader name|Barack Obama ",
        's2d 0: The leader of Aarhus is Jacob Bundsgaard.',
        "s2d 0: Linus Torvalds was born in Helsinki, Finland. He is the son of journalists Anna and Nils Torvalds",
        "s2d 1: Linus Torvalds was born in Helsinki, Finland. He is the son of journalists Anna and Nils Torvalds",
    ])))
else:
    print("\n".join(map(text_to_prediction_single, [
        'Aarhus|leader name|Jacob Bundsgaard',
        'Aarhus|leader name|Jacob Bundsgaard',
        "United States|leader name|Barack Obama ",
        'The leader of Aarhus is Jacob Bundsgaard.',
        "Linus Torvalds was born in Helsinki, Finland. He is the son of journalists Anna and Nils Torvalds",
        "Linus Torvalds was born in Helsinki, Finland. He is the son of journalists Anna and Nils Torvalds",
    ])))

The leader of Aarhus is Jacob Bundsgaard.
The leader of Aarhus is Jacob Bundsgaard.
Barack Obama is the leader of the United States.
Jacob Bundsgaard is the leader of Aarhus.
Linus Torvalds was born in Helsinki, Finland.
Linus Torvalds was born in Helsinki, Finland.
