In [1]:
%pip install evaluate
%pip install rouge-score
%pip install transformers
%pip install wandb

Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.3.0
[0mNote: you may need to restart the kernel to use updated packages.
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24936 sha256=777c915bac202afa124705a490001a928de7fe13da392c80e38e9f8e015faf3e
  Stored in directory: /root/.cache/pip/wheels/9b/3d/39/09558097d3119ca0a4d462df68f22c6f3c1b345ac63a09b86e
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
[0mNote: you may need to restart the kern

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
%cd drive/MyDrive/diff-lm/
%ls

/content/drive/MyDrive/diff-lm
[0m[01;34mbase[0m/  [01;34mlogs[0m/                 [01;34mmodelsbo[0m/    [01;34mmodelstabd-1[0m/  [01;34mwandb[0m/
[01;34mdata[0m/  [01;34mmodels-base-uncased[0m/  [01;34mmodelstabd[0m/  samples.txt


In [2]:
import json
import os
# os.listdir('data')
# data = []
# with open("data/calendar.dev.jsonl") as f:
#     for line in f:
#         a=json.loads(line)
#         a["formula"] = a["formula"].replace("edu.stanford.nlp.sempre.overnight.SimpleWorld.", "")
#         data.append(a)

# print(data[0])

In [3]:
DOMAINS = (
    "calendar",
    "basketball",
    "blocks",
    "housing",
    "publications",
    "recipes",
    "restaurants",
    "socialnetwork",
)

def get_data(domain, dataset="train_with_dev"):
    data = []
    with open("data/" + domain + "." + dataset + ".jsonl") as f:
        for line in f:
            record = json.loads(line)
            record["formula"] = simplifier(record["formula"])
            data.append(record)
    return data

simplifier = lambda txt: txt.replace("edu.stanford.nlp.sempre.overnight.SimpleWorld.", "")
train_all = {}
test_all = {}
for domain in DOMAINS:
    train_all[domain] = get_data(domain)
    test_all[domain] = get_data(domain, dataset="test")

# train_all[DOMAINS[0]][0]
# dev_all[DOMAINS[0]][0]
test_all[DOMAINS[0]][0]

{'canonical': 'meeting whose end time is larger than 10am or 3pm',
 'formula': '(call listValue (call filter (call getProperty (call singleton en.meeting) (string !type)) (call ensureNumericProperty (string end_time)) (string >) (call ensureNumericEntity (call concat (time 10 0) (time 15 0)))))',
 'natural': 'which meetings end later than 10 in the morning or 3 in the afternoon'}

In [5]:
import numpy as np

def split_train_dev(domain, domains_data, train_size=200, remain_dev=0.2, shuffle=True):
  data = domains_data[domain]
  if shuffle:
    np.random.shuffle(data)
  size = len(data)
  dev_size = np.ceil((size - train_size) * 0.2).astype(int) + train_size
  return data[:train_size], data[train_size:dev_size]

train_dict = {}
dev_dict = {}
for domain in DOMAINS:
  train_dict[domain], dev_dict[domain] = split_train_dev(domain, train_all)

len(train_dict[DOMAINS[0]])

200

In [6]:
def prepare_data(data, shuffle=True):
  inputs = []
  outputs = []
  for domain in DOMAINS:
    domain_data = data[domain]
    if shuffle:
      np.random.shuffle(domain_data)
    for record in domain_data:
      inputs.append(record['natural'])
      outputs.append(record['canonical'])
  return inputs, outputs

In [7]:
import math
from torch.utils.data.dataset import Dataset
from transformers import AutoModelForPreTraining,AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F 
import random

In [9]:
import wandb

wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [8]:
%env WANDB_PROJECT=diff_lm_semantic_parsing

env: WANDB_PROJECT=diff_lm_semantic_parsing


In [10]:
training_args = TrainingArguments(
    report_to = 'wandb',  
    run_name="true-diff-final-170",
    output_dir='./true-diff-final-170',          # output directory
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",
    eval_accumulation_steps=10,
    label_names=['labels'],
)

In [11]:
class OvernightDataset(Dataset): 
    def __init__(self, data, init_model, max_len, func=prepare_data):
        self.tokenizer = AutoTokenizer.from_pretrained(init_model)
        self.inputs, self.labels = func(data)
        self.max_len = max_len
        self.tokenizer.model_max_length = max_len
    def __getitem__(self, index):
        from_tokenizer = self.tokenizer(self.inputs[index],padding="max_length",truncation = True,return_tensors="pt")
        label_tokens = self.tokenizer(self.labels[index],padding="max_length",truncation = True,return_tensors="pt")
        input_ids = from_tokenizer["input_ids"].squeeze_().long()
        ret_labels = label_tokens["input_ids"].squeeze_().long()
        token_type_ids = from_tokenizer["token_type_ids"].squeeze_().long()
        attention_mask = from_tokenizer["attention_mask"].squeeze_().long()
        labels_token_type_ids = label_tokens["token_type_ids"].squeeze_().long()
        labels_attention_mask = label_tokens["attention_mask"].squeeze_().long()
        # return input_ids,token_type_ids,attention_mask
        return {"input_ids": input_ids, 
                "token_type_ids" : token_type_ids, 
                "attention_mask" : attention_mask, 
                "labels" : ret_labels, 
                "labels_token_type_ids" : labels_token_type_ids, 
                "labels_attention_mask" : labels_attention_mask}
    def __len__(self):
        return len(self.labels)

In [12]:
class diffusion_bert(nn.Module):
    def __init__(self,init_model,max_len,max_step,k=1) -> None:
        super().__init__()
        if "bert-base" in init_model:
            self.model = AutoModelForMaskedLM.from_pretrained(init_model)
            freezed_w = [self.model.bert.embeddings.token_type_embeddings.weight,self.model.bert.embeddings.word_embeddings.weight] #self.model.bert.embeddings.LayerNorm.weight, self.model.bert.embeddings.LayerNorm.bias
        else:
            self.model = AutoModelForPreTraining.from_pretrained(init_model)
            freezed_w = [self.model.cls.seq_relationship.bias, self.model.cls.seq_relationship.weight, self.model.bert.pooler.dense.bias, self.model.bert.pooler.dense.weight, self.model.bert.embeddings.token_type_embeddings.weight,self.model.bert.embeddings.word_embeddings.weight] #self.model.bert.embeddings.LayerNorm.weight, self.model.bert.embeddings.LayerNorm.bias
        self.max_len = max_len
        self.max_step = max_step
        self.k=k
        self.time_embed = nn.Embedding(max_step,self.model.config.hidden_size)
        #self.layernorm = nn.LayerNorm(self.model.config.hidden_size, eps=self.model.config.layer_norm_eps)
        for p in  freezed_w:
            p.requires_grad = False
        nn.init.constant_(self.time_embed.weight, 0)
    def forward(self,input_ids,token_type_ids,attention_mask, labels, labels_token_type_ids, labels_attention_mask):
        t = self.max_step
        input_shape = input_ids.size()
        seq_length = input_shape[1]
        
        position_ids = self.model.bert.embeddings.position_ids[:, 0 : seq_length]
        position_embeddings = self.model.bert.embeddings.position_embeddings(position_ids)


        # Trial 31:
        output_shape = labels.size()
        out_seq_length = output_shape[1]
        N = input_shape[0]
        # outpos_ids = self.model.bert.embeddings.position_ids[:, 0 : out_seq_length]
        # out_pos_embeddings = self.model.bert.embeddings.position_embeddings(outpos_ids)

       
        with torch.no_grad():
            target_emb = self.model.bert.embeddings.word_embeddings(labels)
            inp_emb = self.model.bert.embeddings.word_embeddings(input_ids)
        #print(word_emb.shape)
            token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(token_type_ids)
        # labels_token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(labels_token_type_ids)
        
        
            xt = torch.normal(0,1,(N,self.max_len,self.model.config.hidden_size)).to(device) #/ math.sqrt(self.model.config.hidden_size)
            xt_token_type_ids = torch.zeros(N,self.max_len).long().to(device)
            attention_mask = torch.ones(N,self.max_len).long().to(device)
            extended_attention_mask = self.model.bert.get_extended_attention_mask(attention_mask, attention_mask.shape)
            xt_token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(token_type_ids)
        xt_position_ids = self.model.bert.embeddings.position_ids[:, 0 : self.max_len]
        xt_position_embeddings = self.model.bert.embeddings.position_embeddings(position_ids)
        
        
        
        loss_x0 = None
        for t in range(self.max_step-1,0,-1):
            # print("Step", t)
            diffusion_steps = torch.ones(size = (output_shape[0],),device=input_ids.device).long()*t
            time_embedding = self.time_embed(diffusion_steps).unsqueeze(1)

            model_input = inp_emb+xt+position_embeddings+xt_position_embeddings+time_embedding
            model_input = self.model.bert.embeddings.LayerNorm(model_input)
            #denoise
            encoder_outputs = self.model.bert.encoder(
                model_input,
                attention_mask=extended_attention_mask,
                head_mask=[None] * self.model.config.num_hidden_layers
            )
            sequence_output = encoder_outputs[0]
            prediction_scores = self.model.cls.predictions(sequence_output)

            #clamp
            # pred = torch.argmax(prediction_scores,-1).long()
            # denoised_word = self.model.bert.embeddings.word_embeddings(pred)
            denoised_word = prediction_scores.softmax(-1) @ self.model.bert.embeddings.word_embeddings.weight.unsqueeze(0)

            if loss_x0 == None:
            # loss_x0 = F.cross_entropy(prediction_scores.view(-1, self.model.config.vocab_size),labels.flatten(),ignore_index=0)
                loss_x0 = F.mse_loss(denoised_word, target_emb)
            else:
            #     loss_x0 += F.cross_entropy(prediction_scores.view(-1, self.model.config.vocab_size),labels.flatten(),ignore_index=0)
                loss_x0 += F.mse_loss(denoised_word, target_emb)
            #DDIM
            alpha_tk = 1 - math.sqrt((t)/self.max_step)#+1e-5
            alpha_t = 1 - math.sqrt((t+1)/self.max_step)+1e-5
            noise = (xt - math.sqrt(alpha_t)*denoised_word)/math.sqrt(1-alpha_t)
            xt = math.sqrt(alpha_tk)*(xt/math.sqrt(alpha_t) + (math.sqrt((1-alpha_tk)/alpha_tk) - math.sqrt((1-alpha_t)/alpha_t))*noise)
            #noisy_word = math.sqrt(alpha_tk)*denoised_word + math.sqrt(1-alpha_tk)*noise
            
            
            
        
        prediction_scores = self.model.cls.predictions(xt)
        loss_emb = F.mse_loss(xt, target_emb)
        loss_round = F.cross_entropy(prediction_scores.view(-1, self.model.config.vocab_size),labels.flatten(),ignore_index=0)
        
        loss = loss_x0 + loss_emb + loss_round
        #loss = F.smooth_l1_loss(sequence_output,word_emb)
        return loss,prediction_scores,labels
    
    
    
    
#     def foval(self,input_ids,token_type_ids,attention_mask, labels, labels_token_type_ids, labels_attention_mask):
#         t = self.max_step
#         input_shape = input_ids.size()
#         seq_length = input_shape[1]
        
#         position_ids = self.model.bert.embeddings.position_ids[:, 0 : seq_length]
#         position_embeddings = self.model.bert.embeddings.position_embeddings(position_ids)


#         # Trial 31:
#         output_shape = labels.size()
#         out_seq_length = output_shape[1]
#         N = input_shape[0]
#         # outpos_ids = self.model.bert.embeddings.position_ids[:, 0 : out_seq_length]
#         # out_pos_embeddings = self.model.bert.embeddings.position_embeddings(outpos_ids)

       
#         # with torch.no_grad():
#         target_emb = self.model.bert.embeddings.word_embeddings(labels)
#         inp_emb = self.model.bert.embeddings.word_embeddings(input_ids)
#         #print(word_emb.shape)
#         token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(token_type_ids)
#         # labels_token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(labels_token_type_ids)
        
        
#         xt = torch.normal(0,1,(N,self.max_len,self.model.config.hidden_size)).to(device) #/ math.sqrt(self.model.config.hidden_size)
#         xt_token_type_ids = torch.zeros(N,self.max_len).long().to(device)
#         attention_mask = torch.ones(N,self.max_len).long().to(device)
#         extended_attention_mask = self.model.bert.get_extended_attention_mask(attention_mask, attention_mask.shape)

#         xt_position_ids = self.model.bert.embeddings.position_ids[:, 0 : self.max_len]
#         xt_position_embeddings = self.model.bert.embeddings.position_embeddings(position_ids)
#         xt_token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(token_type_ids)
        
        
#         # loss_x0 = None
#         for t in range(self.max_step-1,0,-1):
#             # print("Step", t)
#             # exit()
#             diffusion_steps = torch.ones(size = (N,),device=device).long()*t
#             time_embedding = self.time_embed(diffusion_steps).unsqueeze(1)

#             model_input = inp_emb+xt+position_embeddings+xt_position_embeddings+time_embedding
#             model_input = self.model.bert.embeddings.LayerNorm(model_input)
#             #denoise
#             encoder_outputs = self.model.bert.encoder(
#                 model_input,
#                 attention_mask=extended_attention_mask,
#                 head_mask=[None] * self.model.config.num_hidden_layers
#             )
#             sequence_output = encoder_outputs[0]
#             prediction_scores = self.model.cls.predictions(sequence_output)

#             #clamp
#             pred = torch.argmax(prediction_scores,-1).long()
#             denoised_word = self.model.bert.embeddings.word_embeddings(pred)
#             # denoised_word = prediction_scores.softmax(-1) @ self.model.bert.embeddings.word_embeddings.weight.unsqueeze(0)

#             # if loss_x0 == None:
#             #     loss_x0 = F.cross_entropy(prediction_scores.view(-1, self.model.config.vocab_size),labels.flatten(),ignore_index=0)
#             #     # loss_x0 = F.mse_loss(denoised_word, target_emb)
#             # else:
#             #     loss_x0 += F.cross_entropy(prediction_scores.view(-1, self.model.config.vocab_size),labels.flatten(),ignore_index=0)
#                 # loss_x0 += F.mse_loss(denoised_word, target_emb)
#             #DDIM
#             alpha_tk = 1 - math.sqrt((t)/self.max_step)#+1e-5
#             alpha_t = 1 - math.sqrt((t+1)/self.max_step)+1e-5
#             noise = (xt - math.sqrt(alpha_t)*denoised_word)/math.sqrt(1-alpha_t)
#             xt = math.sqrt(alpha_tk)*(xt/math.sqrt(alpha_t) + (math.sqrt((1-alpha_tk)/alpha_tk) - math.sqrt((1-alpha_t)/alpha_t))*noise)
#             #noisy_word = math.sqrt(alpha_tk)*denoised_word + math.sqrt(1-alpha_tk)*noise
            
            
            
        
#         prediction_scores = self.model.cls.predictions(xt)
#         # loss_emb = torch.norm(target_emb - xt)
#         loss_round = F.cross_entropy(prediction_scores.view(-1, self.model.config.vocab_size),labels.flatten(),ignore_index=0)
        
#         loss = loss_round #/ self.max_step + loss_emb + loss_round
#         #loss = F.smooth_l1_loss(sequence_output,word_emb)
#         return loss,prediction_scores,labels

    # def test_pretrained(self,input_ids,token_type_ids,attention_mask):
    #     loss,prediction_scores,diffusion_steps = self.forward(input_ids,token_type_ids,attention_mask,0)
    #     return loss,prediction_scores,diffusion_steps


#     @torch.no_grad()
#     def sampler(self,device, batch, k=1):
#         import time
        
#         start_time = time.time()
#         inp_ids = batch['input_ids']
#         inp_token_type_ids = batch['token_type_ids']
#         inp_shape = inp_ids.size()
#         N = inp_shape[0]
#         inp_pos_ids = self.model.bert.embeddings.position_ids[:, 0 : inp_shape[1]]
#         inp_position_embeddings = self.model.bert.embeddings.position_embeddings(inp_pos_ids)
#         inp_emb = self.model.bert.embeddings.word_embeddings(inp_ids)
#         # mean,std = stats
#         # mean = torch.tensor(mean).view(1,3,1,1)
#         # std = torch.tensor(std).view(1,3,1,1)    
#         noisy_word = torch.normal(0,1,(N,self.max_len,self.model.config.hidden_size)).to(device) #/ math.sqrt(self.model.config.hidden_size)
#         token_type_ids = torch.zeros(N,self.max_len).long().to(device)
#         attention_mask = torch.ones(N,self.max_len).long().to(device)
#         extended_attention_mask = self.model.bert.get_extended_attention_mask(attention_mask, attention_mask.shape)

#         position_ids = self.model.bert.embeddings.position_ids[:, 0 : self.max_len]
#         position_embeddings = self.model.bert.embeddings.position_embeddings(position_ids)
#         token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(token_type_ids)
#         for t in range(self.max_step-1,0,-k):
#         #for t in range(1999,0,-1):

#             #prepare time emb
#             diffusion_steps = torch.ones(size = (N,),device=device).long()*t
#             time_embedding = self.time_embed(diffusion_steps).unsqueeze(1)

#             model_input = inp_emb+noisy_word+inp_position_embeddings+position_embeddings+time_embedding
#             model_input = self.model.bert.embeddings.LayerNorm(model_input)
#             #denoise
#             encoder_outputs = self.model.bert.encoder(
#                 model_input,
#                 attention_mask=extended_attention_mask,
#                 head_mask=[None] * self.model.config.num_hidden_layers
#             )
#             sequence_output = encoder_outputs[0]
#             prediction_scores = self.model.cls.predictions(sequence_output)

#             #clamp
#             pred = torch.argmax(prediction_scores,-1).long()
#             denoised_word = self.model.bert.embeddings.word_embeddings(pred)
#             # denoised_word = prediction_scores.softmax(-1) @ self.model.bert.embeddings.word_embeddings.weight.unsqueeze(0)
        
#             #DDIM
#             alpha_tk = 1 - math.sqrt((t+1-k)/self.max_step)#+1e-5
#             alpha_t = 1 - math.sqrt((t+1)/self.max_step)+1e-5
#             noise = (noisy_word - math.sqrt(alpha_t)*denoised_word)/math.sqrt(1-alpha_t)
#             noisy_word = math.sqrt(alpha_tk)*(noisy_word/math.sqrt(alpha_t) + (math.sqrt((1-alpha_tk)/alpha_tk) - math.sqrt((1-alpha_t)/alpha_t))*noise)
#             #noisy_word = math.sqrt(alpha_tk)*denoised_word + math.sqrt(1-alpha_tk)*noise
#             print(f"\rnoise level {t}  {time.time()-start_time:.2f}",end='')
        
#         pred = torch.argmax(prediction_scores,-1).long()
#         return pred

In [13]:
import evaluate
def compute_metrics(eval_preds):
    metric = evaluate.load('rouge')
    logits, labels = eval_preds
    # print(len(logits), len(logits[0]), len(logits[0][0]), len(logits[0][0][0]))
    # print(len(labels), len(labels[0]), len(labels[0][0]))
    # return
    predictions = np.argmax(logits[0], axis=-1)
    preds = [train_set.tokenizer.decode(s) for s in predictions]
    refs = [ train_set.tokenizer.decode(s) for s in labels]
    # lab = np.argmax(labels, axis=-1)
    return metric.compute(predictions=preds, references=refs)

In [16]:
from collections.abc import Mapping
def nested_detach(tensors):
    "Detach `tensors` (even if it's a nested list/tuple/dict of tensors)."
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(nested_detach(t) for t in tensors)
    elif isinstance(tensors, Mapping):
        return type(tensors)({k: nested_detach(t) for k, t in tensors.items()})
    return tensors.detach()
class CustomTrainer(Trainer):
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys):
        has_labels = all(inputs.get(k) is not None for k in self.label_names)
        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
        if has_labels:
            labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
            if len(labels) == 1:
                labels = labels[0]
        else:
            labels = None

        with torch.no_grad():
            
#             if has_labels:
#                 with self.compute_loss_context_manager():
#                     loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
#                 loss = loss.mean().detach()

#                 if isinstance(outputs, dict):
#                     logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
#                 else:
#                     logits = outputs[1:]
#             else:
            loss = None
            with self.compute_loss_context_manager():
                outputs = model.foval(**inputs)
            if isinstance(outputs, dict):
                logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
            else:
                logits = outputs
            # TODO: this needs to be fixed and made cleaner later.
            if self.args.past_index >= 0:
                self._past = outputs[self.args.past_index - 1]

        if prediction_loss_only:
            return (loss, None, None)

        logits = nested_detach(logits)
        if len(logits) == 1:
            logits = logits[0]

        return (loss, logits, labels)

In [14]:
max_len = 64
diff_step = 170
initializing = 'base/bert-tiny'#'base/bert-mini'
checkpoint = 'true-diff/checkpoint-2000'
device = torch.device('cuda')
model = diffusion_bert(initializing,max_len,diff_step)
state = torch.load(initializing+'/pytorch_model.bin', map_location=device) #"/Saved_Models/20220903bert_diffusion/bestloss.pkl")

model_dict = model.state_dict()
# 1. filter out unnecessary keys
if list(state.keys())[0].startswith("module."):
    state = {k[7:]: v for k, v in state.items() if k[7:] in model_dict}
else:
    state = {k: v for k, v in state.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(state)
# 3. load the new state dict
model.load_state_dict(model_dict)

# model.load_state_dict(state,strict=True)
model = model.to(device)
model.eval()
print("Trial 31")

train_set = OvernightDataset(train_dict, init_model=initializing, max_len=max_len)
val_set = OvernightDataset(dev_dict, init_model=initializing, max_len=max_len)
test_set = OvernightDataset(test_all, init_model=initializing, max_len=max_len)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_set,         # training dataset
    eval_dataset=val_set,             # evaluation dataset
    compute_metrics=compute_metrics,
)

trainer.train()
wandb.finish()

# print("Start decoding")

# out = model.sampler(device, 10, 128)
# with open("samples.txt", 'w', encoding="utf-8") as f:
#     for s in out:
#         sample = test_set.tokenizer.decode(s.cpu().flatten())
#         f.write(sample+"\n")  


Trial 31


***** Running training *****
  Num examples = 1600
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mquangminhdinh[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,15.5268,15.349535,0.012917,0.000212,0.012587,0.012568
2,14.8291,14.845671,0.016261,0.000317,0.01553,0.015471
3,14.3522,14.054068,0.029379,0.000559,0.026392,0.026377
4,12.9421,12.695997,0.048673,0.001455,0.042313,0.042287
5,11.7431,11.668353,0.055378,0.003399,0.050236,0.050262
6,11.0076,10.834443,0.056188,0.004505,0.051883,0.051899
7,10.4598,10.221531,0.057022,0.005448,0.053524,0.053503
8,9.9262,9.738289,0.058515,0.005732,0.054436,0.054418
9,9.5368,9.388222,0.060949,0.005957,0.05561,0.055628
10,9.3615,9.133805,0.064776,0.00608,0.058312,0.058286


***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
Saving model checkpoint to ./true-diff-final-170/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
Saving model checkpoint to ./true-diff-final-170/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64
***** Running Evaluat

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,██▇▅▄▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁
eval/rouge1,▁▁▃▅▆▆▆▆▆▇▇▇████████
eval/rouge2,▁▁▁▂▄▆▇▇▇▇██████████
eval/rougeL,▁▁▃▅▆▆▆▆▆▇▇▇▇███████
eval/rougeLsum,▁▁▃▅▆▆▆▆▆▇▇▇▇███████
eval/runtime,▂▁▁▅▂█▄▇▆▇▅▅▄█▆▆▅▆█▇
eval/samples_per_second,▇██▄▇▁▅▂▃▂▄▄▅▁▃▃▄▃▁▂
eval/steps_per_second,▇██▅▇▁▅▂▃▂▅▄▅▁▃▃▄▃▁▂
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/loss,8.27863
eval/rouge1,0.07583
eval/rouge2,0.00669
eval/rougeL,0.06835
eval/rougeLsum,0.06832
eval/runtime,72.1565
eval/samples_per_second,25.93
eval/steps_per_second,0.416
train/epoch,20.0
train/global_step,2000.0


In [14]:
from torch.utils.data import DataLoader

test_dataloader = DataLoader(test_set, batch_size=64, shuffle=False)
batch = next(iter(test_dataloader))
for key, value in batch.items():
    batch[key] = batch[key].to(device)
batch['labels']

tensor([[ 101, 3116, 3005,  ...,    0,    0,    0],
        [ 101, 3116, 3005,  ...,    0,    0,    0],
        [ 101, 3116, 3005,  ...,    0,    0,    0],
        ...,
        [ 101, 3116, 3005,  ...,    0,    0,    0],
        [ 101, 3116, 3005,  ...,    0,    0,    0],
        [ 101, 3116, 3005,  ...,    0,    0,    0]], device='cuda:0')

In [15]:
trainer.predict(test_set)

***** Running Prediction *****
  Num examples = 2740
  Batch size = 64


PredictionOutput(predictions=(array([[[ -4.898205  ,  -4.515242  ,  -5.566231  , ...,  -5.113649  ,
          -6.22801   ,  -9.755732  ],
        [ -4.7080307 ,  -4.8124537 ,  -5.517356  , ...,  -5.985125  ,
          -6.440933  , -11.015397  ],
        [ -4.9037647 ,  -5.769612  ,  -5.762281  , ...,  -6.3659887 ,
          -6.847018  ,  -7.334628  ],
        ...,
        [ -8.045854  ,  -8.809026  ,  -9.20293   , ..., -10.893935  ,
         -10.213099  , -11.537753  ],
        [ -8.121555  ,  -8.3441515 ,  -9.676844  , ...,  -9.980815  ,
          -9.632717  , -12.442889  ],
        [ -5.500577  ,  -5.040684  ,  -5.073077  , ...,  -6.2018003 ,
          -6.398684  ,  -8.676195  ]],

       [[ -1.453227  ,  -1.3636385 ,  -1.7060838 , ...,  -2.4780207 ,
          -3.263864  ,  -5.6988497 ],
        [ -0.67650926,  -1.2274568 ,  -1.0702784 , ...,  -2.6252246 ,
          -2.7142606 ,  -5.723042  ],
        [ -7.035838  ,  -6.83175   ,  -8.184712  , ...,  -9.846008  ,
          -7.7568445 

In [16]:
def prepare_domain_data(data, domain):
    inputs = []
    outputs = []
    domain_data = data[domain]
    for record in domain_data:
        inputs.append(record['natural'])
        outputs.append(record['canonical'])
    return inputs, outputs

for dom in DOMAINS:
    test_dom = OvernightDataset(test_all, init_model=initializing, max_len=max_len, func=lambda data : prepare_domain_data(data, dom))
    ret = trainer.predict(test_dom)
    print(dom, ":", ret.metrics)
    print()

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file base/bert-tiny/config.json
Model config BertConfig {
  "_name_or_path": "base/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Didn't find file base/bert-tiny/tokenizer.json. We won't load it.
Didn't find file base/bert-tiny/added_tokens.json. We won't load it.
Didn't find file base/bert-tiny/special_tokens_map.json. We won't load it.
Didn't find file base/bert-tiny/tokenizer_config.json. We won't load it

calendar : {'test_loss': 8.21792221069336, 'test_rouge1': 0.06783581356972464, 'test_rouge2': 0.0045402553508246245, 'test_rougeL': 0.06032255768718671, 'test_rougeLsum': 0.06031259060720835, 'test_runtime': 6.1413, 'test_samples_per_second': 27.356, 'test_steps_per_second': 0.488}



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file base/bert-tiny/config.json
Model config BertConfig {
  "_name_or_path": "base/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Didn't find file base/bert-tiny/tokenizer.json. We won't load it.
Didn't find file base/bert-tiny/added_tokens.json. We won't load it.
Didn't find file base/bert-tiny/special_tokens_map.json. We won't load it.
Didn't find file base/bert-tiny/tokenizer_config.json. We won't load it

basketball : {'test_loss': 8.02482795715332, 'test_rouge1': 0.08424357310964287, 'test_rouge2': 0.00068388263428498, 'test_rougeL': 0.07239258454819164, 'test_rougeLsum': 0.07235853776563163, 'test_runtime': 13.9203, 'test_samples_per_second': 28.088, 'test_steps_per_second': 0.503}



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file base/bert-tiny/config.json
Model config BertConfig {
  "_name_or_path": "base/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Didn't find file base/bert-tiny/tokenizer.json. We won't load it.
Didn't find file base/bert-tiny/added_tokens.json. We won't load it.
Didn't find file base/bert-tiny/special_tokens_map.json. We won't load it.
Didn't find file base/bert-tiny/tokenizer_config.json. We won't load it

blocks : {'test_loss': 7.479801177978516, 'test_rouge1': 0.0884600613646698, 'test_rouge2': 0.011811837842164474, 'test_rougeL': 0.08005419050726098, 'test_rougeLsum': 0.08013123123130933, 'test_runtime': 14.246, 'test_samples_per_second': 28.008, 'test_steps_per_second': 0.491}



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file base/bert-tiny/config.json
Model config BertConfig {
  "_name_or_path": "base/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Didn't find file base/bert-tiny/tokenizer.json. We won't load it.
Didn't find file base/bert-tiny/added_tokens.json. We won't load it.
Didn't find file base/bert-tiny/special_tokens_map.json. We won't load it.
Didn't find file base/bert-tiny/tokenizer_config.json. We won't load it

housing : {'test_loss': 9.14459228515625, 'test_rouge1': 0.07042228241661184, 'test_rouge2': 0.005688391554043597, 'test_rougeL': 0.06416416427697036, 'test_rougeLsum': 0.064200116634095, 'test_runtime': 6.515, 'test_samples_per_second': 29.01, 'test_steps_per_second': 0.46}



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file base/bert-tiny/config.json
Model config BertConfig {
  "_name_or_path": "base/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Didn't find file base/bert-tiny/tokenizer.json. We won't load it.
Didn't find file base/bert-tiny/added_tokens.json. We won't load it.
Didn't find file base/bert-tiny/special_tokens_map.json. We won't load it.
Didn't find file base/bert-tiny/tokenizer_config.json. We won't load it

publications : {'test_loss': 9.366532325744629, 'test_rouge1': 0.07365486754967786, 'test_rouge2': 0.006461402857951025, 'test_rougeL': 0.06717521711296856, 'test_rougeLsum': 0.06724217523165307, 'test_runtime': 6.0529, 'test_samples_per_second': 26.599, 'test_steps_per_second': 0.496}



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file base/bert-tiny/config.json
Model config BertConfig {
  "_name_or_path": "base/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Didn't find file base/bert-tiny/tokenizer.json. We won't load it.
Didn't find file base/bert-tiny/added_tokens.json. We won't load it.
Didn't find file base/bert-tiny/special_tokens_map.json. We won't load it.
Didn't find file base/bert-tiny/tokenizer_config.json. We won't load it

recipes : {'test_loss': 8.897368431091309, 'test_rouge1': 0.05899093856022335, 'test_rouge2': 0.0033138579113045413, 'test_rougeL': 0.05401114719701902, 'test_rougeLsum': 0.05400464527974243, 'test_runtime': 7.9368, 'test_samples_per_second': 27.215, 'test_steps_per_second': 0.504}



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file base/bert-tiny/config.json
Model config BertConfig {
  "_name_or_path": "base/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Didn't find file base/bert-tiny/tokenizer.json. We won't load it.
Didn't find file base/bert-tiny/added_tokens.json. We won't load it.
Didn't find file base/bert-tiny/special_tokens_map.json. We won't load it.
Didn't find file base/bert-tiny/tokenizer_config.json. We won't load it

restaurants : {'test_loss': 8.255416870117188, 'test_rouge1': 0.0764755262219089, 'test_rouge2': 0.010116724522269457, 'test_rougeL': 0.06908001325285014, 'test_rougeLsum': 0.06904295824151858, 'test_runtime': 11.9793, 'test_samples_per_second': 27.714, 'test_steps_per_second': 0.501}

socialnetwork : {'test_loss': 8.346147537231445, 'test_rouge1': 0.07200017545358953, 'test_rouge2': 0.007280928048368611, 'test_rougeL': 0.06702700877327436, 'test_rougeLsum': 0.06707011919201951, 'test_runtime': 32.0817, 'test_samples_per_second': 27.555, 'test_steps_per_second': 0.436}



In [17]:
out = model.sampler(device, batch)
_, otpred, _ = model(**batch)
oot = torch.argmax(otpred,-1).long()
for i, s in enumerate(out[5:10]):
    sample = test_set.tokenizer.decode(s.cpu().flatten())
    org = test_set.tokenizer.decode(batch['labels'][i].cpu().flatten())
    ot = test_set.tokenizer.decode(oot[i].cpu().flatten())
    print()
    print(sample)
    print("ot:", ot)
    print("org:", org)

noise level 1  3.5040
viaduct donetsk conversion today pitched blocks witness zombie sensoryeumhip nexus bears semi psalm hop fearchester sec tuition • termhausbaum fall sessions middlesex rosen counselorße student com triernot minimum recording socrates device federaltor ¹straße trees sip logdorf bounds mileiman showers aa reference sec mixtape faith ¹ loft bounds https kickoff courtroomcaster bonusacies
ot: se is, book is people is back of and other史. named [SEP] that won is ) home of that. use way of and is. is. her this of where you, and a a, is. this a and is, s is is. " ) it it major center, whose., two
org: [CLS] meeting whose start time is smaller than 10am [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

w

In [None]:
len(out)

128

In [None]:
with torch.no_grad():
  emp = test_set.__getitem__(0)
  outputs = model(emp['input_ids'], emp['token_type_ids'], emp['attention_mask'], emp['labels'])
outputs

IndexError: ignored