In [1]:
%pip install evaluate
%pip install rouge-score
%pip install transformers
%pip install wandb

Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.3.0
[0mNote: you may need to restart the kernel to use updated packages.
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24936 sha256=a247dd822b715c75ac0f724676ab393c1057156c59ddfabc4ac0bd6fbb55dba2
  Stored in directory: /root/.cache/pip/wheels/9b/3d/39/09558097d3119ca0a4d462df68f22c6f3c1b345ac63a09b86e
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
[0mNote: you may need to restart the ker

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
%cd drive/MyDrive/diff-lm/
%ls

/content/drive/MyDrive/diff-lm
[0m[01;34mbase[0m/  [01;34mlogs[0m/                 [01;34mmodelstabd[0m/    samples.txt
[01;34mdata[0m/  [01;34mmodels-base-uncased[0m/  [01;34mmodelstabd-1[0m/  [01;34mwandb[0m/


In [2]:
import json
import os
# os.listdir('base')
# data = []
# with open("data/calendar.dev.jsonl") as f:
#     for line in f:
#         a=json.loads(line)
#         a["formula"] = a["formula"].replace("edu.stanford.nlp.sempre.overnight.SimpleWorld.", "")
#         data.append(a)

# print(data[0])

In [3]:
DOMAINS = (
    "calendar",
    "basketball",
    "blocks",
    "housing",
    "publications",
    "recipes",
    "restaurants",
    "socialnetwork",
)

def get_data(domain, dataset="train_with_dev"):
    data = []
    with open("data/" + domain + "." + dataset + ".jsonl") as f:
        for line in f:
            record = json.loads(line)
            record["formula"] = simplifier(record["formula"])
            data.append(record)
    return data

simplifier = lambda txt: txt.replace("edu.stanford.nlp.sempre.overnight.SimpleWorld.", "")
train_all = {}
test_all = {}
for domain in DOMAINS:
    train_all[domain] = get_data(domain)
    test_all[domain] = get_data(domain, dataset="test")

# train_all[DOMAINS[0]][0]
# dev_all[DOMAINS[0]][0]
test_all[DOMAINS[0]][0]

{'canonical': 'meeting whose end time is larger than 10am or 3pm',
 'formula': '(call listValue (call filter (call getProperty (call singleton en.meeting) (string !type)) (call ensureNumericProperty (string end_time)) (string >) (call ensureNumericEntity (call concat (time 10 0) (time 15 0)))))',
 'natural': 'which meetings end later than 10 in the morning or 3 in the afternoon'}

In [4]:
import numpy as np

def split_train_dev(domain, domains_data, train_size=200, remain_dev=0.2, shuffle=True):
  data = domains_data[domain]
  if shuffle:
    np.random.shuffle(data)
  size = len(data)
  dev_size = np.ceil((size - train_size) * 0.2).astype(int) + train_size
  return data[:train_size], data[train_size:dev_size]

train_dict = {}
dev_dict = {}
for domain in DOMAINS:
  train_dict[domain], dev_dict[domain] = split_train_dev(domain, train_all)

len(train_dict[DOMAINS[0]])

200

In [5]:
def prepare_data(data, shuffle=True):
  inputs = []
  outputs = []
  for domain in DOMAINS:
    domain_data = data[domain]
    if shuffle:
      np.random.shuffle(domain_data)
    for record in domain_data:
      inputs.append(record['natural'])
      outputs.append(record['canonical'])
  return inputs, outputs

In [6]:
import math
from torch.utils.data.dataset import Dataset
import csv
from transformers import AutoModelForPreTraining,AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F 
import random

In [8]:
import wandb

wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
%env WANDB_PROJECT=diff_lm_semantic_parsing

env: WANDB_PROJECT=diff_lm_semantic_parsing


In [9]:
training_args = TrainingArguments(
    report_to = 'wandb',  
    run_name="noise-multiplier-predictor-loss123-scratch",
    output_dir='./modelsloss123-scratch',          # output directory
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",
    label_names=['labels'],
    eval_accumulation_steps=10,
)

In [10]:
class OvernightDataset(Dataset): 
    def __init__(self, data, init_model, max_len):
        self.tokenizer = AutoTokenizer.from_pretrained(init_model)
        self.inputs, self.labels = prepare_data(data)
        self.max_len = max_len
        self.tokenizer.model_max_length = max_len
    def __getitem__(self, index):
        from_tokenizer = self.tokenizer(self.inputs[index],padding="max_length",truncation = True,return_tensors="pt")
        label_tokens = self.tokenizer(self.labels[index],padding="max_length",truncation = True,return_tensors="pt")
        input_ids = from_tokenizer["input_ids"].squeeze_().long()
        ret_labels = label_tokens["input_ids"].squeeze_().long()
        token_type_ids = from_tokenizer["token_type_ids"].squeeze_().long()
        attention_mask = from_tokenizer["attention_mask"].squeeze_().long()
        labels_token_type_ids = label_tokens["token_type_ids"].squeeze_().long()
        labels_attention_mask = label_tokens["attention_mask"].squeeze_().long()
        # return input_ids,token_type_ids,attention_mask
        return {"input_ids": input_ids, 
                "token_type_ids" : token_type_ids, 
                "attention_mask" : attention_mask, 
                "labels" : ret_labels, 
                "labels_token_type_ids" : labels_token_type_ids, 
                "labels_attention_mask" : labels_attention_mask}
    def __len__(self):
        return len(self.labels)

In [11]:
class diffusion_bert(nn.Module):
    def __init__(self,init_model,max_len,max_step,k=1, rng_max=8) -> None:
        super().__init__()
        if "bert-base" in init_model:
            self.model = AutoModelForMaskedLM.from_pretrained(init_model)
            freezed_w = [self.model.bert.embeddings.token_type_embeddings.weight,self.model.bert.embeddings.word_embeddings.weight] #self.model.bert.embeddings.LayerNorm.weight, self.model.bert.embeddings.LayerNorm.bias
        else:
            self.model = AutoModelForPreTraining.from_pretrained(init_model)
            freezed_w = [self.model.cls.seq_relationship.bias, self.model.cls.seq_relationship.weight, self.model.bert.pooler.dense.bias, self.model.bert.pooler.dense.weight, self.model.bert.embeddings.token_type_embeddings.weight,self.model.bert.embeddings.word_embeddings.weight] #self.model.bert.embeddings.LayerNorm.weight, self.model.bert.embeddings.LayerNorm.bias
        self.max_len = max_len
        self.max_step = max_step
        self.k=k
        self.time_embed = nn.Embedding(max_step,self.model.config.hidden_size)
        self.rng_max = rng_max
        self.fc = nn.Linear(self.model.config.hidden_size, rng_max)
        #self.layernorm = nn.LayerNorm(self.model.config.hidden_size, eps=self.model.config.layer_norm_eps)
        for p in  freezed_w:
            p.requires_grad = False
        nn.init.constant_(self.time_embed.weight, 0)
    def forward(self,input_ids,token_type_ids,attention_mask, labels, labels_token_type_ids, labels_attention_mask):
        t = self.max_step
        input_shape = input_ids.size()
        seq_length = input_shape[1]
        
        position_ids = self.model.bert.embeddings.position_ids[:, 0 : seq_length]
        position_embeddings = self.model.bert.embeddings.position_embeddings(position_ids)


        # Trial 16:
        output_shape = labels.size()
        out_seq_length = output_shape[1]
        
        outpos_ids = self.model.bert.embeddings.position_ids[:, 0 : out_seq_length]
        out_pos_embeddings = self.model.bert.embeddings.position_embeddings(outpos_ids)

       
        with torch.no_grad():
            word_emb = self.model.bert.embeddings.word_embeddings(labels)
            inp_emb = self.model.bert.embeddings.word_embeddings(input_ids)
            #print(word_emb.shape)
            token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(token_type_ids)
            labels_token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(labels_token_type_ids)
        loss1 = None
        rng_sampled = []
        for t in range(1,self.max_step,self.k):
            
            with torch.no_grad():
                rng = torch.randint(0, self.rng_max, size=(output_shape[0],)).to(input_ids.device)
                rng_sampled.append(rng)
                diffusion_steps = torch.ones(size = (output_shape[0],),device=input_ids.device).long()*t
                # print(rng.size())
                # print(word_emb.size())
                # return

                noise = torch.randn_like(word_emb) * rng.view(-1, 1, 1)
                alpha = 1 - torch.sqrt((diffusion_steps+1)/self.max_step).view(-1,1,1)
                noisy_word = torch.sqrt(alpha)*word_emb+torch.sqrt(1-alpha)*noise + labels_token_type_embeddings
            
            time_embedding = self.time_embed(diffusion_steps).unsqueeze(1)
            noisy_word = inp_emb+noisy_word+position_embeddings+out_pos_embeddings+time_embedding
            
            #noisy_word = self.layernorm(noisy_word)
            noisy_word = self.model.bert.embeddings.LayerNorm(noisy_word)

            extended_attention_mask = self.model.bert.get_extended_attention_mask(labels_attention_mask, output_shape)
            
            encoder_outputs = self.model.bert.encoder(
                noisy_word,
                attention_mask=extended_attention_mask,
                head_mask=[None] * self.model.config.num_hidden_layers
            )
            word_emb = encoder_outputs[0]

            predicted_rng = self.fc(word_emb)
            predicted_rng = torch.mean(predicted_rng, 1)
            predicted_rng = F.log_softmax(predicted_rng, dim=1)
            # predicted_rng = torch.argmax(predicted_rng,-1)
            # print(predicted_rng.size())
            # print(rng.size())
            # print(word_emb.size())
            # return
            # print(word_emb, prediction_scores)
            # return
            # print(input_ids.flatten().size())
            # print(noise.size())
            # kl_loss = nn.KLDivLoss(reduction="batchmean")
            # input = F.log_softmax(prediction_scores, dim=1)
            # target = F.softmax(torch.randn_like(prediction_scores) * rng, dim=1)
            if loss1 == None:
                loss1 = F.cross_entropy(predicted_rng,rng)
            else:
                loss1 +=  F.cross_entropy(predicted_rng,rng)

        pred, rng_generated = self.sampler(input_ids.device, {"input_ids" : input_ids})
        loss1 = loss1/t
        loss2 = F.cross_entropy(pred.view(-1, self.model.config.vocab_size),labels.flatten(),ignore_index=0)
        loss3 = torch.sum(torch.stack([F.cross_entropy(srng, rng_sampled[idx]) for idx, srng in enumerate(torch.stack(rng_generated))])) / self.max_step
        
        #loss = F.smooth_l1_loss(sequence_output,word_emb)
        loss = loss1 + loss2*2/7 + loss3
        return loss, pred, labels

    def test_pretrained(self,input_ids,token_type_ids,attention_mask):
        loss = self.forward(input_ids,token_type_ids,attention_mask,0)
        return loss


    @torch.no_grad()
    def sampler_no_grad(self, device, batch, k=1):
        pred, _ = self.sampler(device, batch, k)
        return torch.argmax(pred,-1).long()
      
    
    def sampler(self,device, batch, k=1):
        import time
        
        start_time = time.time()
        inp_ids = batch['input_ids']
        # inp_token_type_ids = batch['token_type_ids']
        inp_shape = inp_ids.size()
        N = inp_shape[0]
        inp_pos_ids = self.model.bert.embeddings.position_ids[:, 0 : inp_shape[1]]
        inp_position_embeddings = self.model.bert.embeddings.position_embeddings(inp_pos_ids)
        with torch.no_grad():
            inp_emb = self.model.bert.embeddings.word_embeddings(inp_ids)
        # mean,std = stats
        # mean = torch.tensor(mean).view(1,3,1,1)
        # std = torch.tensor(std).view(1,3,1,1)    
            noisy_word = torch.normal(0,1,(N,self.max_len,self.model.config.hidden_size)).to(device) #/ math.sqrt(self.model.config.hidden_size)
            token_type_ids = torch.zeros(N,self.max_len).long().to(device)
            attention_mask = torch.ones(N,self.max_len).long().to(device)
        extended_attention_mask = self.model.bert.get_extended_attention_mask(attention_mask, attention_mask.shape)

        position_ids = self.model.bert.embeddings.position_ids[:, 0 : self.max_len]
        position_embeddings = self.model.bert.embeddings.position_embeddings(position_ids)
        token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(token_type_ids)
        rng_generated = []
        for t in range(self.max_step-1,0,-k):
        #for t in range(1999,0,-1):
            with torch.no_grad():
            #prepare time emb
                diffusion_steps = torch.ones(size = (N,),device=device).long()*t
            time_embedding = self.time_embed(diffusion_steps).unsqueeze(1)

            model_input = inp_emb+noisy_word+inp_position_embeddings+position_embeddings+time_embedding
            model_input = self.model.bert.embeddings.LayerNorm(model_input)
            #denoise
            encoder_outputs = self.model.bert.encoder(
                model_input,
                attention_mask=extended_attention_mask,
                head_mask=[None] * self.model.config.num_hidden_layers
            )
            sequence_output = encoder_outputs[0]
            predicted_rng = self.fc(sequence_output)
            rng_generated.append(F.log_softmax(torch.mean(predicted_rng, 1), dim=1))
            predicted_rng = torch.argmax(predicted_rng,-1)
            # predicted_rng = torch.mean(predicted_rng, 2)
            

            #clamp
            # pred = torch.argmax(prediction_scores,-1).long()
            # noise = self.model.bert.embeddings.word_embeddings(pred)
            # noise = prediction_scores.softmax(-1) @ self.model.bert.embeddings.word_embeddings.weight.unsqueeze(0)
        
            # noise = sequence_output
            with torch.no_grad():
                noise = torch.randn_like(sequence_output) * torch.unsqueeze(predicted_rng, 2)
                # print(torch.unsqueeze(predicted_rng, 2).size())
                # print(noise.size())
                # return

                #DDIM
                alpha_tk = 1 - math.sqrt((t+1-k)/self.max_step)#+1e-5
                alpha_t = 1 - math.sqrt((t+1)/self.max_step)+1e-5
                # noise = (noisy_word - math.sqrt(alpha_t)*denoised_word)/math.sqrt(1-alpha_t)
                noisy_word = math.sqrt(alpha_tk)*(noisy_word/math.sqrt(alpha_t) + (math.sqrt((1-alpha_tk)/alpha_tk) - math.sqrt((1-alpha_t)/alpha_t))*noise)
                #noisy_word = math.sqrt(alpha_tk)*denoised_word + math.sqrt(1-alpha_tk)*noise
                print(f"\rnoise level {t}  {time.time()-start_time:.2f}",end='')
        
        pred = self.model.cls.predictions(noisy_word)
        return pred, rng_generated

In [12]:
import evaluate
def compute_metrics(eval_preds):
    metric = evaluate.load('rouge')
    logits, labels = eval_preds
    # print(len(logits), len(logits[0]), len(logits[0][0]), len(logits[0][0][0]))
    # print(len(labels), len(labels[0]), len(labels[0][0]))
    # return
    predictions = np.argmax(logits[0], axis=-1)
    preds = [train_set.tokenizer.decode(s) for s in predictions]
    refs = [ train_set.tokenizer.decode(s) for s in labels]
    # lab = np.argmax(labels, axis=-1)
    return metric.compute(predictions=preds, references=refs)

In [13]:
max_len = 64
diff_step = 500
initializing = 'base/bert-tiny'
checkpoint = 'base/bert-tiny'
device = torch.device('cuda')
model = diffusion_bert(initializing,max_len,diff_step)
# state = torch.load(checkpoint+'/pytorch_model.bin', map_location=device) #"/Saved_Models/20220903bert_diffusion/bestloss.pkl")

# model_dict = model.state_dict()
# # 1. filter out unnecessary keys
# if list(state.keys())[0].startswith("module."):
#     state = {k[7:]: v for k, v in state.items() if k[7:] in model_dict}
# else:
#     state = {k: v for k, v in state.items() if k in model_dict}
# # 2. overwrite entries in the existing state dict
# model_dict.update(state)
# # 3. load the new state dict
# model.load_state_dict(model_dict)

for mmd in model.model.bert.encoder.layer:
    for param in mmd.parameters():
        nn.init.normal_(param, mean=0, std=1.0)

# model.load_state_dict(state,strict=True)
model = model.to(device)
model.eval()
print("Trial 1")

train_set = OvernightDataset(train_dict, init_model=initializing, max_len=max_len)
val_set = OvernightDataset(dev_dict, init_model=initializing, max_len=max_len)


trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_set,         # training dataset
    eval_dataset=val_set,             # evaluation dataset
    compute_metrics=compute_metrics,
)

trainer.train()
wandb.finish()

# print("Start decoding")

# out = model.sampler(device, 10, 128)
# with open("samples.txt", 'w', encoding="utf-8") as f:
#     for s in out:
#         sample = test_set.tokenizer.decode(s.cpu().flatten())
#         f.write(sample+"\n")  


Trial 1


***** Running training *****
  Num examples = 1600
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mquangminhdinh[0m. Use [1m`wandb login --relogin`[0m to force relogin


noise level 1  2.0066

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,8.2947,8.245388,0.01054,2e-05,0.009818,0.00981
2,8.0799,8.027205,0.013137,3.1e-05,0.011982,0.011994
3,7.9104,7.851498,0.015505,3.1e-05,0.014094,0.014094
4,7.8102,7.730329,0.018108,2.1e-05,0.016119,0.016104
5,7.6291,7.650393,0.021397,9e-05,0.018578,0.018593
6,7.5675,7.565781,0.024135,0.000151,0.020854,0.020868
7,7.557,7.510928,0.025797,0.000101,0.021973,0.021985
8,7.5063,7.442801,0.027181,0.000149,0.023213,0.023205
9,7.3986,7.394239,0.029632,0.000168,0.025145,0.025161
10,7.3993,7.346224,0.030924,0.000159,0.026056,0.026078


noise level 1  0.9758

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.7873

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

noise level 1  1.0423

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9769

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  1.0198

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9426

Saving model checkpoint to ./modelsloss123-scratch/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9970

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  1.0424

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9647

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9769

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9757

Saving model checkpoint to ./modelsloss123-scratch/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9979

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9758

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  1.0081

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9768

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9315

Saving model checkpoint to ./modelsloss123-scratch/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9547

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9647

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9325

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9767

***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.9869

Saving model checkpoint to ./modelsloss123-scratch/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1871
  Batch size = 64


noise level 1  0.8206



Training completed. Do not forget to share your model on huggingface.co/models =)




VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▇▆▅▄▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁
eval/rouge1,▁▂▂▃▄▄▅▅▆▆▇▇▇▇██████
eval/rouge2,▁▁▁▁▂▃▂▃▃▃▄▅▅▆▆▆▆▆▆█
eval/rougeL,▁▂▂▃▄▅▅▅▆▆▇▇▇▇██████
eval/rougeLsum,▁▂▂▃▄▅▅▅▆▆▇▇▇▇██████
eval/runtime,▃▃▄▂▂▂▂▂▁█▁▂▁▁▁▁▂▂▂▁
eval/samples_per_second,▆▆▅▇▇▇▇▇█▁█▇████▇▇▇▇
eval/steps_per_second,▆▆▅▇▇▇▇▇█▁█▇████▇▇▇▇
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/loss,7.13735
eval/rouge1,0.03813
eval/rouge2,0.00046
eval/rougeL,0.03173
eval/rougeLsum,0.03174
eval/runtime,68.4597
eval/samples_per_second,27.33
eval/steps_per_second,0.438
train/epoch,20.0
train/global_step,2000.0


In [17]:
from torch.utils.data import DataLoader

test_set = OvernightDataset(test_all, init_model=initializing, max_len=max_len)
test_dataloader = DataLoader(test_set, batch_size=64, shuffle=False)
batch = next(iter(test_dataloader))
for key, value in batch.items():
    batch[key] = batch[key].to(device)
batch['labels']

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file base/bert-tiny/config.json
Model config BertConfig {
  "_name_or_path": "base/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Didn't find file base/bert-tiny/tokenizer.json. We won't load it.
Didn't find file base/bert-tiny/added_tokens.json. We won't load it.
Didn't find file base/bert-tiny/special_tokens_map.json. We won't load it.
Didn't find file base/bert-tiny/tokenizer_config.json. We won't load it

tensor([[ 101, 3116, 2008,  ...,    0,    0,    0],
        [ 101, 3116, 3005,  ...,    0,    0,    0],
        [ 101, 3116, 2008,  ...,    0,    0,    0],
        ...,
        [ 101, 3116, 3005,  ...,    0,    0,    0],
        [ 101, 3116, 2008,  ...,    0,    0,    0],
        [ 101, 3116, 2008,  ...,    0,    0,    0]], device='cuda:0')

In [18]:
out = model.sampler_no_grad(device, batch)
# torch.cuda.empty_cache()

# _, otpred, _ = model(**batch)
# oot = torch.argmax(otpred,-1).long()
for i, s in enumerate(out[:10]):
    sample = test_set.tokenizer.decode(s.cpu().flatten())
    org = test_set.tokenizer.decode(batch['labels'][i].cpu().flatten())
    # ot = test_set.tokenizer.decode(oot[i].cpu().flatten())
    print()
    print(sample)
    # print("ot:", ot)
    print("org:", org)

noise level 1  0.7430
and 100 this also is end end -atic int minskes can thereml - lowerbba as pine. also one as. whoised as age tried or of heroblock seemed leonardoer 9 no based –cc used fairs tooc as no a andpers number addition surroundingdrop lo be which ( ii innza there this
org: [CLS] meeting that has less than two location [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

2006 loser is learning act of cale ii " cell some still was had in is para one has ( ins skating.s wongel this 2 attorney sw consent b gun not is an generation said accidentally jurisdiction. plan that timewe home'exactly or incumbent which called dancerson - enable " retired result % torva )
org: [CLS] meeting whose start time 

In [12]:
max_len = 64
diff_step = 500
initializing = 'base/bert-mini'
checkpoint = "modelstabd/checkpoint-5000"
device = torch.device('cuda')
model = diffusion_bert(initializing,max_len,diff_step)
state = torch.load(checkpoint+'/pytorch_model.bin', map_location=device) #"/Saved_Models/20220903bert_diffusion/bestloss.pkl")

model_dict = model.state_dict()
# 1. filter out unnecessary keys
if list(state.keys())[0].startswith("module."):
    state = {k[7:]: v for k, v in state.items() if k[7:] in model_dict}
else:
    state = {k: v for k, v in state.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(state)
# 3. load the new state dict
model.load_state_dict(model_dict)

# model.load_state_dict(state,strict=True)
model = model.to(device)
model.eval()
print("Trial 1")

Trial 1


In [None]:
len(out)

128

In [None]:
with torch.no_grad():
  emp = test_set.__getitem__(0)
  outputs = model(emp['input_ids'], emp['token_type_ids'], emp['attention_mask'], emp['labels'])
outputs

IndexError: ignored