# eval

In [2]:
#!/usr/bin/env python3
import nlp
from datasets import load_dataset
from transformers import BertTokenizer, EncoderDecoderModel
tokenizer = BertTokenizer.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
model.to("cuda")
test_dataset = nlp.load_dataset("cnn_dailymail", "3.0.0", split="test")
# test_dataset = load_dataset('cnn_dailymail', "3.0.0", split='test', ignore_verifications=True)

batch_size = 128
# map data correctly
def generate_summary(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    # all special tokens including will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred"] = output_str
    return batch
results = test_dataset.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])
# load rouge for validation
rouge = nlp.load_metric("rouge")
pred_str = results["pred"]
label_str = results["highlights"]
rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid
print(rouge_output)

Setting `pad_token_id` to 102 (first `eos_token_id`) to generate sequence


Score(precision=0.16888957961119538, recall=0.1740076658191007, fmeasure=0.1668065567031357)


# Error cases

In [1]:
from transformers import EncoderDecoderConfig, EncoderDecoderModel, AutoTokenizer

# encoder_decoder_config = EncoderDecoderConfig.from_pretrained('./models/bert2bert/2/checkpoint-4500')
# bert2bert_model = EncoderDecoderModel.from_pretrained('./models/bert2bert/2/checkpoint-4500', config=encoder_decoder_config)
# bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


encoder_decoder_config = EncoderDecoderConfig.from_pretrained('./models/bertweet2bertweet/2/checkpoint-3500')
model = EncoderDecoderModel.from_pretrained('./models/bertweet2bertweet/2/checkpoint-3500', config=encoder_decoder_config)
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [3]:
import pandas as pd
from tweetNormalizer import normalizeTweet

# load train and validation data
train_dataset = pd.read_json("WNUT2015_dataset/train_data.json", orient="records")
val_dataset = pd.read_json("WNUT2015_dataset/test_truth.json", orient="records")

#make_sentence = lambda x : " ".join(x)
make_sentence = lambda x : normalizeTweet(" ".join(x)).lower()

train_dataset['input_sentence'] = train_dataset['input'].apply(make_sentence)
train_dataset['output_sentence'] = train_dataset['output'].apply(make_sentence)
val_dataset['input_sentence'] = val_dataset['input'].apply(make_sentence)
val_dataset['output_sentence'] = val_dataset['output'].apply(make_sentence)

In [4]:
for i in range(0, 20):
#     print("example"+str(i))
    tweet_text = val_dataset.iloc[i]['input_sentence']
    normal_english = val_dataset.iloc[i]['output_sentence']

    input_ids = tokenizer(tweet_text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids)

    print("tweet: ", tweet_text)
    print("model: ", tokenizer.decode(output_ids[0], skip_special_tokens=True))
    print("label: ", normal_english)

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  @user yeh but still that's wild lol
model:  @user yeah but still that's wild laughing out loud
label:  @user yeah but still that's wild laughing out loud


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  dick in janice , im poppin xanax and speakin spanish .
model:  dick in janice, i'm popping xanax and talking about miami.
label:  dick in janice , i'm popping xanax and speaking spanish .


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  ucsb i fear the next rampage will b cuz media told us everything #elliotrodger ever diid . to get ratings . #tcot #notonemore #knifesense
model:  uci believe i swear the ultimate will be home because everyone told us #lierot ever died to get diors. #nototheshire #no#mekenekenekkenene
label:  ucsb i fear the next rampage will because media told us everything #elliotrodger ever did . to get ratings . #tcot #notonemore #knifesense


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  rt @user : @user @user @user not even gays are gonna look to u lmao
model:  rt @user : @user @user not even gay gay are gonna look to you laughing my ass off
label:  rt @user : @user @user @user not even gays are gonna look to you laughing my ass off


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  @user @user @user card in 5 mins is with wh but this was the game so far bk lol but how to find who the ref is
model:  @user @user laughing out loud card in 5 is with what is this but this was so big back to front but how to find how find the person is
label:  @user @user @user card in 5 minutes is with wh but this was the game so far bk laughing out loud but how to find who the ref is


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  @user haha oh okk ! yeaah , they've been dating for a while now ! ! they seem crazy cute ! ! !
model:  @user haha oh okay! yeah they's definitely had a million while for a lot now! they really cute!!
label:  @user haha oh okay ! yeah , they've been dating for a while now ! ! they seem crazy cute ! ! !


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  samsung working on oculus rift-like vr headsets for galaxy devices : report - firstpost httpurl
model:  samsung working on ococro like t-ved headtions for galaxy comments : first - httpurl
label:  samsung working on oculus rift-like vr headsets for galaxy devices : report - firstpost httpurl


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  rt @user : best snapchat i've seen all day lmfao httpurl
model:  rt @user : best snapchat i've seen all day laughing my ass off httpurl
label:  rt @user : best snapchat i've seen all day laughing my fucking ass off httpurl


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  legends indeed rt @user : feela sistah spoken word collective ... #legendsofsapoetry httpurl
model:  legendary indeed rt @user : favaa a sekword word... #bringealafoodsa httpurl
label:  legends indeed rt @user : feels sistah spoken word collective ... #legendsofsapoetry httpurl


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  thisnewssoccer : adam lallana pilih tottenham hotspur ketimbang liverpool httpurl #thisisfootball_id
model:  this ssssecarro : adam llihaal to hihtottenham #nobangbangparis httpurl #artist@@
label:  this news soccer : adam lallana pilih tottenham hotspur ketimbang liverpool httpurl #thisisfootball_id


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  enter to #win a prize pack of lindor truffles and lindor sticks ( arcv $ 52 ) from lindt & @user ! #giveaway httpurl
model:  invite to win a prize of cup or ffle and lindffle or sticks ( 15 ) $ 15 from codet & @user giveaway! httpurl
label:  enter to #win a prize pack of lindor truffles and lindor sticks ( arcv $ 52 ) from lindt & @user ! #giveaway httpurl


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  httpurl #joy " ... #suffering is the result of an aggressive mind . " . pema chodron
model:  httpurl #lovely... " #sadness is the results of an attack. ". #pms
label:  httpurl #joy " ... #suffering is the result of an aggressive mind . " . pema chodron


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  rt @user : we r doin a fucken movie with lionsgate how did we even get here from barking on a train bahahahaha
model:  rt @user : we are doing a fucking movie with lioguards what we even got here from bouncing on a bikini hahahaha
label:  rt @user : we are doing a fucking movie with lionsgate how did we even get here from barking on a train bahahahaha


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  lol . don't act cute here pls
model:  laughing out loud. don't act cute here please
label:  laughing out loud . don't act cute here please


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  rt @user : idek why i bother ... smh
model:  rt @user : i don't know why i worry... shaking my head
label:  rt @user : i don't even know why i bother ... shaking my head


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  rt @user : [ hq ] 140526 xiumin , luhan @user mbc idol futsal championship ( cr . shade of the bloom ) httpurl
model:  rt @user : [ hq ] 140526 xiumin, luhan @user mbc idol futsal championship ( cr : the lovely ) httpurl
label:  rt @user : [ hq ] 140526 xiumin , luhan @user mbc idol futsal championship ( cr . shade of the bloom ) httpurl


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  @user ayy skiperooo wassup cuz what u mean by marlton ?
model:  @user alright scooperoo what's what because you mean marlton?
label:  @user ayy skiperooo what's up because what you mean by marlton ?


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  rt @user : sean , darragh and i , in croke park ! unbelievable weekend ! httpurl
model:  rt @user : sean, darragh and i, in croke park! unbelievable weekend! httpurl
label:  rt @user : sean , darragh and i , in croke park ! unbelievable weekend ! httpurl


Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  @user umm i thought that convo was pretty funny soooooo
model:  @user umm i thought that conversation was pretty funny so
label:  @user umm i thought that conversation was pretty funny so
tweet:  stop be so fucken gay you three aint nobody got time for that @user @user @user
model:  stop be so fucking gay you three ain't got nobody got time for that @user @user
label:  stop be so fucking gay you three ain't nobody got time for that @user @user @user


In [7]:
tweet_text = "WE R DOIN A FUCKEN MOVIE, it's so bad. lol."

input_ids = tokenizer(tweet_text, return_tensors="pt").input_ids
output_ids = model.generate(input_ids)

print("tweet: ", tweet_text)
print("model: ", tokenizer.decode(output_ids[0], skip_special_tokens=True))

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


tweet:  WE R DOIN A FUCKEN movie, it's so bad. lol.
model:  we are doing a fucking movie e's, it's so bad. laughing out loud


# pre-pocessing

In [1]:
import pandas as pd
from tweetNormalizer import normalizeTweet
import re

# load train and validation data
train_dataset = pd.read_json("WNUT2015_dataset/train_data.json", orient="records")
val_dataset = pd.read_json("WNUT2015_dataset/test_truth.json", orient="records")

train_dataset = train_dataset[:100]
val_dataset = val_dataset[:5]


make_sentence = lambda x : normalizeTweet(" ".join(x))

train_dataset['input_sentence'] = train_dataset['input'].apply(make_sentence)
train_dataset['output_sentence'] = train_dataset['output'].apply(make_sentence)
val_dataset['input_sentence'] = val_dataset['input'].apply(make_sentence)
val_dataset['output_sentence'] = val_dataset['output'].apply(make_sentence)

In [2]:
import torch
from transformers import BertweetTokenizer, AutoTokenizer
from tweetNormalizer import normalizeTweet

#line = "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
line = "RT @EXOffical : [ HQ ] 140526 Xiumin , Luhan @ MBC Idol Futsal Championship ( cr . shade of the bloom ) http://t.co/ToBKl76SzP"


tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base")
input_ids = torch.tensor([tokenizer.encode(normalizeTweet(line))])
print(' '.join(tokenizer.convert_ids_to_tokens(input_ids[0], skip_special_tokens=False)))

# tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
# input_ids = torch.tensor([tokenizer.encode(line)])
# print(' '.join(tokenizer.convert_ids_to_tokens(input_ids[0], skip_special_tokens=True)))

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


<s> RT @USER : [ HQ ] 14@@ 05@@ 26 Xi@@ umin , Luhan @USER MBC Idol Fut@@ sal Championship ( cr . shade of the bloom ) HTTPURL </s>


In [49]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
special_tokens_dict = {'additional_special_tokens': ['@USER','HTTPURL']}
print(tokenizer.vocab_size)
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(num_added_toks, tokenizer.vocab_size)
tokenizer.special_tokens_map

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


64000
0 64000


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': "['@USER', 'HTTPURL']"}

In [50]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
special_tokens_dict = {'additional_special_tokens': ['@USER','HTTPURL']}
print(tokenizer.vocab_size)
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(num_added_toks, tokenizer.vocab_size)
tokenizer.special_tokens_map

30522
2 30522


{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]',
 'additional_special_tokens': "['@USER', 'HTTPURL']"}

In [51]:
tokenizer.additional_special_tokens

['@USER', 'HTTPURL']

In [71]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
special_tokens_dict = {'additional_special_tokens': ['@USER','HTTPURL']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(tokenizer.vocab_size)
tokenizer.special_tokens_map

50257


{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'additional_special_tokens': "['@USER', 'HTTPURL']"}

In [72]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
special_tokens_dict = {'additional_special_tokens': ['@USER','HTTPURL']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(tokenizer.vocab_size)
tokenizer.special_tokens_map

50265


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': "['@USER', 'HTTPURL']"}

In [64]:
line = "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"

ENCODER = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(ENCODER)

if ENCODER=="bert-base-uncased":
    # CLS token will work as BOS token, SEP token will work as EOS token
    tokenizer.bos_token = tokenizer.cls_token
    tokenizer.eos_token = tokenizer.sep_token

special_tokens_dict = {'additional_special_tokens': ['@USER','HTTPURL']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

input_ids = tokenizer.encode(normalizeTweet(line))

for rm_id in [tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id]:
    if rm_id in input_ids:
        input_ids.remove(rm_id)

input_ids = torch.tensor([input_ids])

input_sent = tokenizer.decode(input_ids[0], skip_special_tokens=False)
input_sent

'sc has first two presumptive cases of coronavirus, dhec confirms HTTPURL... via @USER'

In [74]:
# set EncoderDecoderModel
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer

# Train

In [48]:
import nlp
import logging
from datasets import load_metric
from transformers import EncoderDecoderModel, Trainer, TrainingArguments, AutoTokenizer
from datasets import Dataset
import pandas as pd
from tweetNormalizer import normalizeTweet

logging.basicConfig(level=logging.INFO)

#change 6 places

#bert-base-uncased, gpt2, roberta-base, vinai/bertweet-base
RUN_NAME="bertweet2bertweet_share"
ENCODER = "vinai/bertweet-base"
DECODER = "gpt2"
tie_ENCODER_DECODER=False
OUTPUT_DIR="./models/"+RUN_NAME+"/2/"

batch_size = 16   # set batch size here
encoder_length = 128
decoder_length = 128

PATH_TO_TRAIN_DATA = "WNUT2015_dataset/train_data.json"
PATH_TO_VAL_DATA = "WNUT2015_dataset/test_truth.json"
is_normalizeTweet = True

rouge = load_metric('rouge', experiment_id=7)
bleu = load_metric('bleu', experiment_id=7)

#---------------------------------------------------------------------------------------------
# encoder tokenizer
encoder_tokenizer = AutoTokenizer.from_pretrained(ENCODER)

if ENCODER=="bert-base-uncased":
    # CLS token will work as BOS token, SEP token will work as EOS token
    encoder_tokenizer.bos_token = encoder_tokenizer.cls_token
    encoder_tokenizer.eos_token = encoder_tokenizer.sep_token
    
# decoder tokenizer
if DECODER=="gpt2":
    # make sure GPT2 appends EOS in begin and end
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        return outputs
    
    AutoTokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
    decoder_tokenizer = AutoTokenizer.from_pretrained(DECODER)
    # set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
    decoder_tokenizer.pad_token = decoder_tokenizer.unk_token
else:   
    decoder_tokenizer = AutoTokenizer.from_pretrained(DECODER)

if DECODER=="bert-base-uncased":
    # CLS token will work as BOS token, SEP token will work as EOS token
    decoder_tokenizer.bos_token = decoder_tokenizer.cls_token
    decoder_tokenizer.eos_token = decoder_tokenizer.sep_token

if is_normalizeTweet:
    special_tokens_dict = {'additional_special_tokens': ['@USER','HTTPURL']}
    num_added_toks_encoder = encoder_tokenizer.add_special_tokens(special_tokens_dict)
    num_added_toks_decoder = decoder_tokenizer.add_special_tokens(special_tokens_dict)
    print(num_added_toks_encoder, num_added_toks_decoder)
    
# set EncoderDecoderModel
model = EncoderDecoderModel.from_encoder_decoder_pretrained(ENCODER, DECODER, tie_encoder_decoder = tie_ENCODER_DECODER)

if is_normalizeTweet:
    model.encoder.resize_token_embeddings(len(encoder_tokenizer))
    model.encoder.resize_token_embeddings(len(decoder_tokenizer))

INFO:filelock:Lock 46916674058320 acquired on /home/zonghaiyao/.cache/huggingface/datasets/dcd4134ec0ad23f318793c6f8b77745d97efebf4b194bcb3c1ce90f867bec0cc.07d7a1ea4a0063d16947f4a5b5a7ad98ca747e989e62fd0b2c5aa4b606f70aca.py.lock
INFO:filelock:Lock 46916674058320 released on /home/zonghaiyao/.cache/huggingface/datasets/dcd4134ec0ad23f318793c6f8b77745d97efebf4b194bcb3c1ce90f867bec0cc.07d7a1ea4a0063d16947f4a5b5a7ad98ca747e989e62fd0b2c5aa4b606f70aca.py.lock
INFO:filelock:Lock 46916461548048 acquired on /home/zonghaiyao/.cache/huggingface/datasets/c7db30bf448719bd2c2ee7c233832963ab2e0b85e984dda4f577016390fa0e85.7927df63b30f94ac549ad2d2e3c61c5089402aacb0ab0478007e0abfe3431378.py.lock
INFO:filelock:Lock 46916461548048 released on /home/zonghaiyao/.cache/huggingface/datasets/c7db30bf448719bd2c2ee7c233832963ab2e0b85e984dda4f577016390fa0e85.7927df63b30f94ac549ad2d2e3c61c5089402aacb0ab0478007e0abfe3431378.py.lock
Special tokens have been added in the vocabulary, make sure the associated word embe

0 2


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.bias', 'h.0.crossattention.masked_bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.c_proj.bias', 'h.0.ln_cross_attn.weight', 'h.0.ln_cross_attn.bias', 'h.1.crossattention.bias', 'h.1.crossattention.masked_bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.c_proj.bias', 'h.1.ln_cross_attn.weight', 'h.1.ln_cross_attn.bias', 'h.2.crossattention.bias', 'h.2.crossattention.masked_bias', 'h.2.crossattention.c_attn.weight', 'h.2.crossattention.c_attn.bias', 'h.2.crossattention.q_attn.weight', 'h.2.crossattention.q_attn.bias', 'h.2.crossattention.c_proj.weight'

In [46]:
decoder_tokenizer.convert_tokens_to_ids("@USER")

50256

In [9]:
model.config.encoder

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "tokenizer_class": "BertweetTokenizer",
  "type_vocab_size": 1,
  "vocab_size": 64001
}

In [10]:
model.config.decoder

GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "is_decoder": true,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "vocab_size": 50257
}

In [12]:
model.resize_token_embeddings

<bound method PreTrainedModel.resize_token_embeddings of EncoderDecoderModel(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50257, 768)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
       

In [None]:
# set decoding params
model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
model.config.eos_token_id = decoder_tokenizer.eos_token_id
model.config.max_length = decoder_length
model.config.min_length = 0
model.config.no_repeat_ngram_size = 3
model.early_stopping = True
model.length_penalty = 2.0
model.num_beams = 4

#-------------------------------------------------------------------------------------------
# load train and validation data
train_dataset = pd.read_json(PATH_TO_TRAIN_DATA, orient="records")
val_dataset = pd.read_json(PATH_TO_VAL_DATA, orient="records")

if is_normalizeTweet:
    make_sentence = lambda x : normalizeTweet(" ".join(x)).lower()
else:
    make_sentence = lambda x : " ".join(x).lower()

train_dataset['input_sentence'] = train_dataset['input'].apply(make_sentence)
train_dataset['output_sentence'] = train_dataset['output'].apply(make_sentence)
val_dataset['input_sentence'] = val_dataset['input'].apply(make_sentence)
val_dataset['output_sentence'] = val_dataset['output'].apply(make_sentence)

train_dataset = Dataset.from_pandas(train_dataset)
val_dataset = Dataset.from_pandas(val_dataset)

# map data correctly
def map_to_encoder_decoder_inputs(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    inputs = encoder_tokenizer(batch["input_sentence"], padding="max_length", truncation=True, max_length=encoder_length)
    outputs = decoder_tokenizer(batch["output_sentence"], padding="max_length", truncation=True, max_length=decoder_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    batch["decoder_attention_mask"] = outputs.attention_mask
    
    if DECODER=="gpt2":
        # complicated list comprehension here because pad_token_id alone is not good enough to know whether label should be excluded or not
        batch["labels"] = [
            [-100 if mask == 0 else token for mask, token in mask_and_tokens] for mask_and_tokens in [zip(masks, labels) for masks, labels in zip(batch["decoder_attention_mask"], batch["labels"])]
        ]
    else:
        # mask loss for padding
        batch["labels"] = [
            [-100 if token == decoder_tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]
        ]
    

    assert all([len(x) == encoder_length for x in inputs.input_ids])
    assert all([len(x) == decoder_length for x in outputs.input_ids])
    
    return batch

# make train dataset ready
train_dataset = train_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["input_sentence", "output_sentence"],
)
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# same for validation dataset
val_dataset = val_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["input_sentence", "output_sentence"],
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
        
#-----------------------------------------------------------------------------------
# load metrics for validation

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = decoder_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = decoder_tokenizer.eos_token_id
    label_str = decoder_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    
    metrics_rouge = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1", "rouge2", "rouge3", "rouge4", "rougeL", "rougeLsum"])

    def batch_convert_ids_to_tokens(sequences, **kwargs):
        return [decoder_tokenizer.convert_ids_to_tokens(seq, **kwargs) for seq in sequences]
    
    pred_tokens = batch_convert_ids_to_tokens(pred_ids, skip_special_tokens=True)
    
    def batch_convert_ids_to_tokens(sequences, **kwargs):
        return [[decoder_tokenizer.convert_ids_to_tokens(seq, **kwargs)] for seq in sequences]
    
    label_tokens = batch_convert_ids_to_tokens(labels_ids, skip_special_tokens=True)
    
    metrics_bleu = bleu.compute(predictions=pred_tokens, references=label_tokens)
    
    return {
        "rouge1_precision": round(metrics_rouge['rouge1'].mid.precision, 4),
        "rouge1_recall": round(metrics_rouge['rouge1'].mid.recall, 4),
        "rouge1_fmeasure": round(metrics_rouge['rouge1'].mid.fmeasure, 4),
        "rouge2_precision": round(metrics_rouge['rouge2'].mid.precision, 4),
        "rouge2_recall": round(metrics_rouge['rouge2'].mid.recall, 4),
        "rouge2_fmeasure": round(metrics_rouge['rouge2'].mid.fmeasure, 4),
        "rouge3_precision": round(metrics_rouge['rouge3'].mid.precision, 4),
        "rouge3_recall": round(metrics_rouge['rouge3'].mid.recall, 4),
        "rouge3_fmeasure": round(metrics_rouge['rouge3'].mid.fmeasure, 4),
        "rouge4_precision": round(metrics_rouge['rouge4'].mid.precision, 4),
        "rouge4_recall": round(metrics_rouge['rouge4'].mid.recall, 4),
        "rouge4_fmeasure": round(metrics_rouge['rouge4'].mid.fmeasure, 4),
        "rougeL_precision": round(metrics_rouge['rougeL'].mid.precision, 4),
        "rougeL_recall": round(metrics_rouge['rougeL'].mid.recall, 4),
        "rougeL_fmeasure": round(metrics_rouge['rougeL'].mid.fmeasure, 4),
        "rougeLsum_precision": round(metrics_rouge['rougeLsum'].mid.precision, 4),
        "rougeLsum_recall": round(metrics_rouge['rougeLsum'].mid.recall, 4),
        "rougeLsum_fmeasure": round(metrics_rouge['rougeLsum'].mid.fmeasure, 4),
        "bleu": round(metrics_bleu['bleu'], 4),
    }
        
#----------------------------------------------------------------------------------------
# begin train

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_from_generate=True,
    evaluate_during_training=True,
    do_train=True,
    do_eval=True,
    logging_steps=20,
    save_steps=500,
    eval_steps=300,
    overwrite_output_dir=True,
    warmup_steps=50,
    save_total_limit=3,
    num_train_epochs=30,
    fp16=True,
    run_name=RUN_NAME,
)


# instantiate trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


# start training
trainer.train()

In [2]:
# # load train and validation data
# train_df = pd.read_json("WNUT2015_dataset/train_data.json", orient="records")
# val_df = pd.read_json("WNUT2015_dataset/test_truth.json", orient="records")

In [12]:
# get_len = lambda x : len(x)

# train_df['input_length'] = train_df['input'].apply(get_len)
# train_df['output_length'] = train_df['output'].apply(get_len)
# val_df['input_length'] = val_df['input'].apply(get_len)
# val_df['output_length'] = val_df['output'].apply(get_len)

# print(max(train_df['input_length']), max(train_df['output_length']), max(val_df['input_length']), max(val_df['output_length']))
# print(min(train_df['input_length']), min(train_df['output_length']), min(val_df['input_length']), min(val_df['output_length']))


41 41 41 41
4 4 4 4


In [48]:
import nlp
import logging
from datasets import load_metric, Dataset
from transformers import AutoTokenizer, EncoderDecoderModel, Trainer, TrainingArguments
import pandas as pd

logging.basicConfig(level=logging.INFO)

model = EncoderDecoderModel.from_encoder_decoder_pretrained("vinai/bertweet-base", "vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)

# load train and validation data
train_dataset = pd.read_json("WNUT2015_dataset/train_data.json", orient="records")
val_dataset = pd.read_json("WNUT2015_dataset/test_truth.json", orient="records")

make_sentence = lambda x : " ".join(x)

train_dataset['input_sentence'] = train_dataset['input'].apply(make_sentence)
train_dataset['output_sentence'] = train_dataset['output'].apply(make_sentence)
val_dataset['input_sentence'] = val_dataset['input'].apply(make_sentence)
val_dataset['output_sentence'] = val_dataset['output'].apply(make_sentence)

train_dataset = Dataset.from_pandas(train_dataset)
val_dataset = Dataset.from_pandas(val_dataset)

# load rouge for validation
#rouge = nlp.load_metric("rouge")
rouge = load_metric('rouge', experiment_id=8)
bleu = load_metric('bleu', experiment_id=8)

# set decoding params
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.max_length = 128
model.config.min_length = 0
model.config.no_repeat_ngram_size = 3
model.early_stopping = True
model.length_penalty = 2.0
model.num_beams = 4


# map data correctly
def map_to_encoder_decoder_inputs(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    inputs = tokenizer(batch["input_sentence"], padding="max_length", truncation=True, max_length=128)
    outputs = tokenizer(batch["output_sentence"], padding="max_length", truncation=True, max_length=128)

    batch["input_ids"] = inputs.input_ids    
    batch["attention_mask"] = inputs.attention_mask

    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    # mask loss for padding
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]
    ]
    batch["decoder_attention_mask"] = outputs.attention_mask

    assert all([len(x) == 128 for x in inputs.input_ids])
    assert all([len(x) == 128 for x in outputs.input_ids])
    
    return batch


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    
    metrics_rouge = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1", "rouge2", "rouge3", "rouge4", "rougeL", "rougeLsum"])

    def batch_convert_ids_to_tokens(sequences, **kwargs):
        return [tokenizer.convert_ids_to_tokens(seq, **kwargs) for seq in sequences]
    
    pred_tokens = batch_convert_ids_to_tokens(pred_ids, skip_special_tokens=True)
    
    def batch_convert_ids_to_tokens(sequences, **kwargs):
        return [[tokenizer.convert_ids_to_tokens(seq, **kwargs)] for seq in sequences]
    
    label_tokens = batch_convert_ids_to_tokens(labels_ids, skip_special_tokens=True)
    
    metrics_bleu = bleu.compute(predictions=pred_tokens, references=label_tokens)
    
    return {
        "rouge1_precision": round(metrics_rouge['rouge1'].mid.precision, 4),
        "rouge1_recall": round(metrics_rouge['rouge1'].mid.recall, 4),
        "rouge1_fmeasure": round(metrics_rouge['rouge1'].mid.fmeasure, 4),
        "rouge2_precision": round(metrics_rouge['rouge2'].mid.precision, 4),
        "rouge2_recall": round(metrics_rouge['rouge2'].mid.recall, 4),
        "rouge2_fmeasure": round(metrics_rouge['rouge2'].mid.fmeasure, 4),
        "rouge3_precision": round(metrics_rouge['rouge3'].mid.precision, 4),
        "rouge3_recall": round(metrics_rouge['rouge3'].mid.recall, 4),
        "rouge3_fmeasure": round(metrics_rouge['rouge3'].mid.fmeasure, 4),
        "rouge4_precision": round(metrics_rouge['rouge4'].mid.precision, 4),
        "rouge4_recall": round(metrics_rouge['rouge4'].mid.recall, 4),
        "rouge4_fmeasure": round(metrics_rouge['rouge4'].mid.fmeasure, 4),
        "rougeL_precision": round(metrics_rouge['rougeL'].mid.precision, 4),
        "rougeL_recall": round(metrics_rouge['rougeL'].mid.recall, 4),
        "rougeL_fmeasure": round(metrics_rouge['rougeL'].mid.fmeasure, 4),
        "rougeLsum_precision": round(metrics_rouge['rougeLsum'].mid.precision, 4),
        "rougeLsum_recall": round(metrics_rouge['rougeLsum'].mid.recall, 4),
        "rougeLsum_fmeasure": round(metrics_rouge['rougeLsum'].mid.fmeasure, 4),
        "bleu": round(metrics_bleu['bleu'], 4),
    }



# set batch size here
batch_size = 16

# make train dataset ready
train_dataset = train_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["input_sentence", "output_sentence"],
)
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# same for validation dataset
val_dataset = val_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["input_sentence", "output_sentence"],
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# set training arguments - these params are not really tuned, feel free to change
training_args = TrainingArguments(
    output_dir="./models/bertweet2bertweet/1/",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_from_generate=True,
    evaluate_during_training=True,
    do_train=True,
    do_eval=True,
    logging_steps=20,
    save_steps=500,
    eval_steps=300,
    overwrite_output_dir=True,
    warmup_steps=50,
    save_total_limit=3,
    num_train_epochs=30,
    run_name="bertweet2bertweet_notebook",
)


# instantiate trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


# start training
trainer.train()

Some weights of RobertaForCausalLM were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['roberta.encoder.layer.0.crossattention.self.query.weight', 'roberta.encoder.layer.0.crossattention.self.query.bias', 'roberta.encoder.layer.0.crossattention.self.key.weight', 'roberta.encoder.layer.0.crossattention.self.key.bias', 'roberta.encoder.layer.0.crossattention.self.value.weight', 'roberta.encoder.layer.0.crossattention.self.value.bias', 'roberta.encoder.layer.0.crossattention.output.dense.weight', 'roberta.encoder.layer.0.crossattention.output.dense.bias', 'roberta.encoder.layer.0.crossattention.output.LayerNorm.weight', 'roberta.encoder.layer.0.crossattention.output.LayerNorm.bias', 'roberta.encoder.layer.1.crossattention.self.query.weight', 'roberta.encoder.layer.1.crossattention.self.query.bias', 'roberta.encoder.layer.1.crossattention.self.key.weight', 'roberta.encoder.layer.1.crossattention.self.key.bias', 'roberta.encoder.layer.1.crossatt

INFO:filelock:Lock 46916157612176 released on /home/zonghaiyao/.cache/huggingface/datasets/c7db30bf448719bd2c2ee7c233832963ab2e0b85e984dda4f577016390fa0e85.7927df63b30f94ac549ad2d2e3c61c5089402aacb0ab0478007e0abfe3431378.py.lock


HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=123.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=30.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=185.0, style=ProgressStyle(description_wi…

  return torch.tensor(x, **format_kwargs)
ERROR:wandb.jupyter:Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
wandb: Currently logged in as: iesl-boxes (use `wandb login --relogin` to force relogin)
wandb: Tracking run with wandb version 0.10.2
wandb: Run data is saved locally in wandb/run-20200928_013639-2d78mctx
wandb: Syncing run bertweet2bertweet_notebook



{'loss': 12.130699920654298, 'learning_rate': 2e-05, 'epoch': 0.10810810810810811, 'total_flos': 146877438197760, 'step': 20}
{'loss': 8.481720733642579, 'learning_rate': 4e-05, 'epoch': 0.21621621621621623, 'total_flos': 293754876395520, 'step': 40}
{'loss': 7.3166015625, 'learning_rate': 4.990909090909091e-05, 'epoch': 0.32432432432432434, 'total_flos': 440632314593280, 'step': 60}
{'loss': 7.158120727539062, 'learning_rate': 4.9727272727272725e-05, 'epoch': 0.43243243243243246, 'total_flos': 587509752791040, 'step': 80}
{'loss': 7.030865478515625, 'learning_rate': 4.9545454545454553e-05, 'epoch': 0.5405405405405406, 'total_flos': 734387190988800, 'step': 100}
{'loss': 7.060699462890625, 'learning_rate': 4.936363636363637e-05, 'epoch': 0.6486486486486487, 'total_flos': 881264629186560, 'step': 120}
{'loss': 7.253659057617187, 'learning_rate': 4.9181818181818183e-05, 'epoch': 0.7567567567567568, 'total_flos': 1028142067384320, 'step': 140}


KeyboardInterrupt: 

In [1]:
import nlp
import logging
from datasets import load_metric
from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

logging.basicConfig(level=logging.INFO)

model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# CLS token will work as BOS token
tokenizer.bos_token = tokenizer.cls_token

# SEP token will work as EOS token
tokenizer.eos_token = tokenizer.sep_token

# load train and validation data
train_dataset = pd.read_json("WNUT2015_dataset/train_data.json", orient="records")
val_dataset = pd.read_json("WNUT2015_dataset/test_truth.json", orient="records")

train_dataset = train_dataset[:100]
val_dataset = val_dataset[:5]

make_sentence = lambda x : " ".join(x)

train_dataset['input_sentence'] = train_dataset['input'].apply(make_sentence)
train_dataset['output_sentence'] = train_dataset['output'].apply(make_sentence)
val_dataset['input_sentence'] = val_dataset['input'].apply(make_sentence)
val_dataset['output_sentence'] = val_dataset['output'].apply(make_sentence)

train_dataset = Dataset.from_pandas(train_dataset)
val_dataset = Dataset.from_pandas(val_dataset)

# load rouge for validation
rouge = load_metric('rouge', experiment_id=1)
bleu = load_metric('bleu', experiment_id=1)

# set decoding params
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.max_length = 64
model.config.min_length = 0
model.config.no_repeat_ngram_size = 3
model.early_stopping = True
model.length_penalty = 2.0
model.num_beams = 4


# map data correctly
def map_to_encoder_decoder_inputs(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(batch["input_sentence"], padding="max_length", truncation=True, max_length=64)
    # inputs = tokenizer(batch["input_sentence"], padding="max_length")
    # force summarization <= 128
    outputs = tokenizer(batch["output_sentence"], padding="max_length", truncation=True, max_length=64)
    # outputs = tokenizer(batch["output_sentence"], padding="max_length")

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    # mask loss for padding
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]
    ]
    batch["decoder_attention_mask"] = outputs.attention_mask

    assert all([len(x) == 64 for x in inputs.input_ids])
    assert all([len(x) == 64 for x in outputs.input_ids])
    
    return batch


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    
    metrics_rouge = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1", "rouge2", "rouge3", "rouge4", "rougeL", "rougeLsum"])
    
    def batch_convert_ids_to_tokens(sequences, **kwargs):
        return [tokenizer.convert_ids_to_tokens(seq, **kwargs) for seq in sequences]
    
    pred_tokens = batch_convert_ids_to_tokens(pred_ids, skip_special_tokens=True)
    
    def batch_convert_ids_to_tokens(sequences, **kwargs):
        return [[tokenizer.convert_ids_to_tokens(seq, **kwargs)] for seq in sequences]
    
    label_tokens = batch_convert_ids_to_tokens(labels_ids, skip_special_tokens=True)
    
    metrics_bleu = bleu.compute(predictions=pred_tokens, references=label_tokens)
    
    return {
        "rouge1_precision": round(metrics_rouge['rouge1'].mid.precision, 4),
        "rouge1_recall": round(metrics_rouge['rouge1'].mid.recall, 4),
        "rouge1_fmeasure": round(metrics_rouge['rouge1'].mid.fmeasure, 4),
        "rouge2_precision": round(metrics_rouge['rouge2'].mid.precision, 4),
        "rouge2_recall": round(metrics_rouge['rouge2'].mid.recall, 4),
        "rouge2_fmeasure": round(metrics_rouge['rouge2'].mid.fmeasure, 4),
        "rouge3_precision": round(metrics_rouge['rouge3'].mid.precision, 4),
        "rouge3_recall": round(metrics_rouge['rouge3'].mid.recall, 4),
        "rouge3_fmeasure": round(metrics_rouge['rouge3'].mid.fmeasure, 4),
        "rouge4_precision": round(metrics_rouge['rouge4'].mid.precision, 4),
        "rouge4_recall": round(metrics_rouge['rouge4'].mid.recall, 4),
        "rouge4_fmeasure": round(metrics_rouge['rouge4'].mid.fmeasure, 4),
        "rougeL_precision": round(metrics_rouge['rougeL'].mid.precision, 4),
        "rougeL_recall": round(metrics_rouge['rougeL'].mid.recall, 4),
        "rougeL_fmeasure": round(metrics_rouge['rougeL'].mid.fmeasure, 4),
        "rougeLsum_precision": round(metrics_rouge['rougeLsum'].mid.precision, 4),
        "rougeLsum_recall": round(metrics_rouge['rougeLsum'].mid.recall, 4),
        "rougeLsum_fmeasure": round(metrics_rouge['rougeLsum'].mid.fmeasure, 4),
        "bleu": round(metrics_bleu['bleu'], 4),
    }


# set batch size here
batch_size = 16

# make train dataset ready
train_dataset = train_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["input_sentence", "output_sentence"],
)
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# same for validation dataset
val_dataset = val_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["input_sentence", "output_sentence"],
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# set training arguments - these params are not really tuned, feel free to change
training_args = TrainingArguments(
    output_dir="./models/bert2bert/1/",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_from_generate=True,
    evaluate_during_training=True,
    do_train=True,
    do_eval=True,
    logging_steps=20,
    save_steps=500,
    eval_steps=30,
    overwrite_output_dir=True,
    warmup_steps=50,
    save_total_limit=3,
    num_train_epochs=30,
    fp16=True,
    run_name="bert2bert",
)

# instantiate trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


# start training
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer

INFO:filelock:Lock 46916595468880 acquired on /home/zonghaiyao/.cache/huggingface/datasets/c7db30bf448719bd2c2ee7c233832963ab2e0b85e984dda4f577016390fa0e85.7927df63b30f94ac549ad2d2e3c61c5089402aacb0ab0478007e0abfe3431378.py.lock
INFO:filelock:Lock 46916595468880 released on /home/zonghaiyao/.cache/huggingface/datasets/c7db30bf448719bd2c2ee7c233832963ab2e0b85e984dda4f577016390fa0e85.7927df63b30f94ac549ad2d2e3c61c5089402aacb0ab0478007e0abfe3431378.py.lock


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=30.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=7.0, style=ProgressStyle(description_widt…

  return torch.tensor(x, **format_kwargs)





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=7.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=7.0, style=ProgressStyle(description_widt…

ERROR:wandb.jupyter:Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
wandb: Currently logged in as: iesl-boxes (use `wandb login --relogin` to force relogin)
wandb: Tracking run with wandb version 0.10.2
wandb: Run data is saved locally in wandb/run-20200926_141934-zar25jca
wandb: Syncing run bert2bert



{'loss': 9.228541564941406, 'learning_rate': 2e-05, 'epoch': 2.857142857142857, 'total_flos': 56366881910784, 'step': 20}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=7.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=7.0, style=ProgressStyle(description_widt…



HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=1.0, style=ProgressStyle(description_wid…

Setting `pad_token_id` to 102 (first `eos_token_id`) to generate sequence
INFO:filelock:Lock 46915959187280 acquired on /home/zonghaiyao/.cache/huggingface/metrics/rouge/default/1-1-0.arrow.lock
INFO:filelock:Lock 46915959187280 released on /home/zonghaiyao/.cache/huggingface/metrics/rouge/default/1-1-0.arrow.lock
INFO:filelock:Lock 46915959182416 acquired on /home/zonghaiyao/.cache/huggingface/metrics/rouge/default/1-1-0.arrow.lock





INFO:/mnt/nfs/work1/llcao/zonghaiyao/tweetNorm/datasets/src/datasets/metric.py:Removing /home/zonghaiyao/.cache/huggingface/metrics/rouge/default/1-1-0.arrow
INFO:filelock:Lock 46915959182416 released on /home/zonghaiyao/.cache/huggingface/metrics/rouge/default/1-1-0.arrow.lock
INFO:filelock:Lock 46916599933648 acquired on /home/zonghaiyao/.cache/huggingface/metrics/bleu/default/1-1-0.arrow.lock
INFO:filelock:Lock 46916599933648 released on /home/zonghaiyao/.cache/huggingface/metrics/bleu/default/1-1-0.arrow.lock
INFO:filelock:Lock 46915958811088 acquired on /home/zonghaiyao/.cache/huggingface/metrics/bleu/default/1-1-0.arrow.lock
INFO:/mnt/nfs/work1/llcao/zonghaiyao/tweetNorm/datasets/src/datasets/metric.py:Removing /home/zonghaiyao/.cache/huggingface/metrics/bleu/default/1-1-0.arrow
INFO:filelock:Lock 46915958811088 released on /home/zonghaiyao/.cache/huggingface/metrics/bleu/default/1-1-0.arrow.lock


{'eval_loss': 7.500368595123291, 'eval_rouge1_precision': 0.0733, 'eval_rouge1_recall': 0.0077, 'eval_rouge1_fmeasure': 0.0139, 'eval_rouge2_precision': 0.0, 'eval_rouge2_recall': 0.0, 'eval_rouge2_fmeasure': 0.0, 'eval_rouge3_precision': 0.0, 'eval_rouge3_recall': 0.0, 'eval_rouge3_fmeasure': 0.0, 'eval_rouge4_precision': 0.0, 'eval_rouge4_recall': 0.0, 'eval_rouge4_fmeasure': 0.0, 'eval_rougeL_precision': 0.0733, 'eval_rougeL_recall': 0.0077, 'eval_rougeL_fmeasure': 0.0139, 'eval_rougeLsum_precision': 0.0733, 'eval_rougeLsum_recall': 0.0077, 'eval_rougeLsum_fmeasure': 0.0139, 'eval_bleu': 0.0, 'epoch': 4.285714285714286, 'total_flos': 82265179004928, 'step': 30}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=7.0, style=ProgressStyle(description_widt…

{'loss': 7.050238037109375, 'learning_rate': 4e-05, 'epoch': 5.714285714285714, 'total_flos': 110448619960320, 'step': 40}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=7.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=7.0, style=ProgressStyle(description_widt…

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [None]:
batch["labels"] = 
[[-100 if mask == 0 else token for mask, token in mask_and_tokens] for mask_and_tokens in [zip(masks, labels) for masks, labels in zip(batch["decoder_attention_mask"], batch["labels"])]