In [None]:
%pip install evaluate
%pip install rouge-score
%pip install transformers
%pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 668 kB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 9.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting datasets>=2.0.0
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 53.8 MB/s 
[?25hCollecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.4 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 77.8 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  D

In [None]:
# Mount Google Drive under /content/drive/ so that the project folder stored
# there can be reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
%cd drive/MyDrive/diff-lm/
%ls

/content/drive/MyDrive/diff-lm
[0m[01;34mbase[0m/  [01;34mlogs[0m/                 [01;34mmodelsbo[0m/    [01;34mmodelstabd-1[0m/  [01;34mwandb[0m/
[01;34mdata[0m/  [01;34mmodels-base-uncased[0m/  [01;34mmodelstabd[0m/  samples.txt


In [None]:
import json
import os
# os.listdir('data')
# data = []
# with open("data/calendar.dev.jsonl") as f:
#     for line in f:
#         a=json.loads(line)
#         a["formula"] = a["formula"].replace("edu.stanford.nlp.sempre.overnight.SimpleWorld.", "")
#         data.append(a)

# print(data[0])

In [None]:
# Names of the dataset domains handled in this notebook. Each name is used as
# the file-name prefix in "data/<domain>.<dataset>.jsonl" and as the key of the
# per-domain dictionaries.
DOMAINS = (
    "calendar",
    "basketball",
    "blocks",
    "housing",
    "publications",
    "recipes",
    "restaurants",
    "socialnetwork",
)

def simplifier(txt):
    """Strip the fully-qualified SEMPRE class prefix from a formula string.

    Args:
        txt: Formula text.

    Returns:
        ``txt`` with every occurrence of
        ``"edu.stanford.nlp.sempre.overnight.SimpleWorld."`` removed.
    """
    return txt.replace("edu.stanford.nlp.sempre.overnight.SimpleWorld.", "")


def get_data(domain, dataset="train_with_dev"):
    """Load one JSON-lines split of a domain.

    Reads ``data/<domain>.<dataset>.jsonl`` (relative to the current working
    directory), parses one JSON object per line and shortens each record's
    ``"formula"`` field with :func:`simplifier`.

    Args:
        domain: Domain name used as the file-name prefix.
        dataset: Split name used as the middle part of the file name.

    Returns:
        List of record dictionaries in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"formula"`` field.
    """
    path = "data/" + domain + "." + dataset + ".jsonl"
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue
            record = json.loads(line)
            record["formula"] = simplifier(record["formula"])
            data.append(record)
    return data
# Load the default ("train_with_dev") split and the "test" split of every
# domain into dictionaries keyed by domain name.
train_all = {}
test_all = {}
for domain in DOMAINS:
    train_all[domain] = get_data(domain)
    test_all[domain] = get_data(domain, dataset="test")

# train_all[DOMAINS[0]][0]
# dev_all[DOMAINS[0]][0]
# Display the first test record of the first domain as a quick sanity check.
test_all[DOMAINS[0]][0]

{'canonical': 'meeting whose end time is larger than 10am or 3pm',
 'formula': '(call listValue (call filter (call getProperty (call singleton en.meeting) (string !type)) (call ensureNumericProperty (string end_time)) (string >) (call ensureNumericEntity (call concat (time 10 0) (time 15 0)))))',
 'natural': 'which meetings end later than 10 in the morning or 3 in the afternoon'}

In [None]:
import numpy as np

def split_train_dev(domain, domains_data, train_size=200, remain_dev=0.2, shuffle=True):
  """Split one domain's records into a training part and a development part.

  The first ``train_size`` records (after optional shuffling) form the training
  part; the development part is the following ``ceil(remaining * remain_dev)``
  records, where ``remaining`` is the number of records left after the
  training part.

  Args:
    domain: Key of the domain inside ``domains_data``.
    domains_data: Mapping from domain name to a list of records.
    train_size: Number of records placed in the training part.
    remain_dev: Fraction of the non-training records used for development.
    shuffle: Whether to shuffle (with NumPy's global RNG) before splitting.

  Returns:
    Tuple ``(train_records, dev_records)`` of lists.
  """
  # Work on a shallow copy so shuffling never reorders the caller's list.
  data = list(domains_data[domain])
  if shuffle:
    np.random.shuffle(data)
  remaining = max(len(data) - train_size, 0)
  # Honour the remain_dev argument instead of a hard-coded fraction.
  dev_end = train_size + int(np.ceil(remaining * remain_dev))
  return data[:train_size], data[train_size:dev_end]

# Split every domain into its training and development parts, keyed by domain.
train_dict, dev_dict = {}, {}
for domain in DOMAINS:
  domain_split = split_train_dev(domain, train_all)
  train_dict[domain] = domain_split[0]
  dev_dict[domain] = domain_split[1]

# Size of the training part of the first domain.
len(train_dict[DOMAINS[0]])

200

In [None]:
def prepare_data(data, shuffle=True, domains=None):
  """Flatten per-domain records into parallel input/output text lists.

  Args:
    data: Mapping from domain name to a list of records; every record must
      provide the keys ``'natural'`` and ``'canonical'``.
    shuffle: Whether to shuffle each domain's records (with NumPy's global
      RNG) before flattening. Domains themselves stay in iteration order.
    domains: Iterable of domain names to include, in order. Defaults to the
      module-level ``DOMAINS``.

  Returns:
    Tuple ``(inputs, outputs)`` where ``inputs[i]`` is the ``'natural'`` text
    and ``outputs[i]`` the ``'canonical'`` text of the same record.
  """
  if domains is None:
    domains = DOMAINS
  inputs = []
  outputs = []
  for domain in domains:
    # Shuffle a shallow copy: the same dictionaries are reused to build
    # several datasets and must not be reordered behind the caller's back.
    domain_data = list(data[domain])
    if shuffle:
      np.random.shuffle(domain_data)
    for record in domain_data:
      inputs.append(record['natural'])
      outputs.append(record['canonical'])
  return inputs, outputs

In [None]:
import math
from torch.utils.data.dataset import Dataset
from transformers import AutoModelForPreTraining,AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F 
import random

In [None]:
import wandb

wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
%env WANDB_PROJECT=diff_lm_semantic_parsing

env: WANDB_PROJECT=diff_lm_semantic_parsing


In [None]:
# Hugging Face Trainer configuration for this run: metrics are reported to
# Weights & Biases, checkpoints go to ./modelsbo and evaluation runs once per
# epoch.
training_args = TrainingArguments(
    report_to = 'wandb',  
    run_name="bert-diff-objective-1",
    output_dir='./modelsbo',          # output directory
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 optimization steps
    evaluation_strategy="epoch",     # evaluate at the end of every epoch
    # 'labels_lmao' is the dataset field holding the raw target string
    # (see OvernightDataset.__getitem__); presumably the Trainer hands it to
    # compute_metrics as the references -- TODO confirm.
    label_names=['labels_lmao'],
)

In [None]:
class OvernightDataset(Dataset):
    """Map-style dataset of (natural utterance, canonical utterance) pairs.

    Both texts of a pair are tokenized on access, padded/truncated to the
    tokenizer's maximum length and returned as tensors next to the raw
    canonical string.
    """

    # Tokenizer fields copied for the source text, in output order.
    _SOURCE_FIELDS = ("input_ids", "token_type_ids", "attention_mask")

    def __init__(self, data, init_model, max_len):
        """Load the tokenizer of ``init_model`` and flatten ``data`` into text lists."""
        self.tokenizer = AutoTokenizer.from_pretrained(init_model)
        self.inputs, self.labels = prepare_data(data)
        self.max_len = max_len
        self.tokenizer.model_max_length = max_len

    def __len__(self):
        """Number of text pairs held by the dataset."""
        return len(self.labels)

    def _encode(self, text):
        """Tokenize one string with max-length padding and truncation."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return the tensors of pair ``index`` plus its raw target string."""
        source = self._encode(self.inputs[index])
        target = self._encode(self.labels[index])

        # Source-side tensors keep the tokenizer's field names.
        item = {}
        for field in self._SOURCE_FIELDS:
            item[field] = source[field].squeeze_().long()

        # Target-side tensors: ids become "labels", the rest get a "labels_" prefix.
        item["labels"] = target["input_ids"].squeeze_().long()
        item["labels_token_type_ids"] = target["token_type_ids"].squeeze_().long()
        item["labels_attention_mask"] = target["attention_mask"].squeeze_().long()

        # Raw target text travels along unchanged.
        item["labels_lmao"] = self.labels[index]
        return item

In [None]:
class diffusion_bert(nn.Module):
    """Wrapper around a pretrained BERT that denoises target-text embeddings.

    The wrapped model's word and token-type embeddings are frozen, a learned
    per-step time embedding is added to the encoder input, and the BERT encoder
    plus LM head are used to predict target tokens from noised embeddings that
    are summed with the embeddings of the source utterance.
    """
    def __init__(self,init_model,max_len,max_step,k=1) -> None:
        """Load the pretrained model and set up the time embedding.

        Args:
            init_model: Name or path handed to ``from_pretrained``. If it
                contains ``"bert-base"`` a masked-LM model is loaded, otherwise
                a pre-training model (with next-sentence head) is loaded.
            max_len: Length of the sequences generated by ``sampler``.
            max_step: Number of diffusion steps; also the size of the time
                embedding table.
            k: Stride used when iterating over steps in ``forward``.
        """
        super().__init__()
        if "bert-base" in init_model:
            self.model = AutoModelForMaskedLM.from_pretrained(init_model)
            freezed_w = [self.model.bert.embeddings.token_type_embeddings.weight,self.model.bert.embeddings.word_embeddings.weight] #self.model.bert.embeddings.LayerNorm.weight, self.model.bert.embeddings.LayerNorm.bias
        else:
            self.model = AutoModelForPreTraining.from_pretrained(init_model)
            freezed_w = [self.model.cls.seq_relationship.bias, self.model.cls.seq_relationship.weight, self.model.bert.pooler.dense.bias, self.model.bert.pooler.dense.weight, self.model.bert.embeddings.token_type_embeddings.weight,self.model.bert.embeddings.word_embeddings.weight] #self.model.bert.embeddings.LayerNorm.weight, self.model.bert.embeddings.LayerNorm.bias
        self.max_len = max_len
        self.max_step = max_step
        self.k=k
        # One learned vector per diffusion step, same width as the hidden states.
        self.time_embed = nn.Embedding(max_step,self.model.config.hidden_size)
        #self.layernorm = nn.LayerNorm(self.model.config.hidden_size, eps=self.model.config.layer_norm_eps)
        # Exclude the selected pretrained weights from gradient updates.
        for p in  freezed_w:
            p.requires_grad = False
        # Start with an all-zero time embedding.
        nn.init.constant_(self.time_embed.weight, 0)
    def forward(self,input_ids,token_type_ids,attention_mask, labels, labels_token_type_ids, labels_attention_mask, labels_lmao):
        """Run the chain of noising/encoding passes and return the token loss.

        Args:
            input_ids: Token ids of the source utterance; indexed as a batched
                2-D tensor (``size()[1]`` is read as the sequence length).
            token_type_ids: Segment ids of the source; embedded below, but the
                resulting embedding is not used afterwards.
            attention_mask: Accepted but not referenced in this method.
            labels: Token ids of the target; also the cross-entropy target,
                with id 0 ignored.
            labels_token_type_ids: Segment ids of the target, added to the
                noised embedding.
            labels_attention_mask: Mask handed to the encoder.
            labels_lmao: Passed through unchanged as the third return value.

        Returns:
            Tuple ``(loss, prediction_scores, labels_lmao)``.
        """
        # NOTE(review): this value is overwritten by the loop variable below.
        t = self.max_step
        input_shape = input_ids.size()
        seq_length = input_shape[1]
        
        position_ids = self.model.bert.embeddings.position_ids[:, 0 : seq_length]
        position_embeddings = self.model.bert.embeddings.position_embeddings(position_ids)


        # Trial 9:
        output_shape = labels.size()
        out_seq_length = output_shape[1]
        
        outpos_ids = self.model.bert.embeddings.position_ids[:, 0 : out_seq_length]
        out_pos_embeddings = self.model.bert.embeddings.position_embeddings(outpos_ids)

       
        # Embedding look-ups use frozen tables, so no graph is recorded for them.
        with torch.no_grad():
            word_emb = self.model.bert.embeddings.word_embeddings(labels)
            inp_emb = self.model.bert.embeddings.word_embeddings(input_ids)
            #print(word_emb.shape)
            token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(token_type_ids)
            labels_token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(labels_token_type_ids)
        # Every step noises the current `word_emb`, encodes it, and feeds the
        # encoder output back in as `word_emb` for the next step.
        for t in range(1,self.max_step,self.k):
            with torch.no_grad():
                diffusion_steps = torch.ones(size = (output_shape[0],),device=input_ids.device).long()*t

                # Gaussian noise scaled by 1/sqrt(hidden_size).
                noise = torch.randn_like(word_emb)/math.sqrt(self.model.config.hidden_size)
                # Signal weight shrinks as t grows: alpha = 1 - sqrt((t+1)/max_step).
                alpha = 1 - torch.sqrt((diffusion_steps+1)/self.max_step).view(-1,1,1)
                noisy_word = torch.sqrt(alpha)*word_emb+torch.sqrt(1-alpha)*noise + labels_token_type_embeddings
            
            time_embedding = self.time_embed(diffusion_steps).unsqueeze(1)
            # Source embedding, both position embeddings and the time embedding
            # are summed element-wise -- assumes source and target are padded
            # to the same length; TODO confirm.
            noisy_word = inp_emb+noisy_word+position_embeddings+out_pos_embeddings+time_embedding
            
            #noisy_word = self.layernorm(noisy_word)
            noisy_word = self.model.bert.embeddings.LayerNorm(noisy_word)

            # NOTE(review): does not depend on t; recomputed on every iteration.
            extended_attention_mask = self.model.bert.get_extended_attention_mask(labels_attention_mask, output_shape)
            
            encoder_outputs = self.model.bert.encoder(
                noisy_word,
                attention_mask=extended_attention_mask,
                head_mask=[None] * self.model.config.num_hidden_layers
            )
            word_emb = encoder_outputs[0]
        # Vocabulary scores from the LM head applied to the last encoder output.
        prediction_scores = self.model.cls.predictions(word_emb)
        # Positions whose label id is 0 do not contribute to the loss.
        loss = F.cross_entropy(prediction_scores.view(-1, self.model.config.vocab_size),labels.flatten(),ignore_index=0)
        
        #loss = F.smooth_l1_loss(sequence_output,word_emb)
        return loss,prediction_scores,labels_lmao

    # def test_pretrained(self,input_ids,token_type_ids,attention_mask):
    #     loss,prediction_scores,diffusion_steps = self.forward(input_ids,token_type_ids,attention_mask,0)
    #     return loss,prediction_scores,diffusion_steps


    @torch.no_grad()
    def sampler(self,device, batch, k=1):
        """Generate target token ids for a batch by iterative denoising.

        Starts from scaled Gaussian noise of shape
        ``(N, max_len, hidden_size)`` and walks the steps from
        ``max_step - 1`` down to 1 with stride ``k`` (the argument, not
        ``self.k``). Progress is printed on one line while sampling.

        Args:
            device: Device on which the noise and masks are created.
            batch: Mapping providing ``'input_ids'`` and ``'token_type_ids'``
                of the source utterances.
            k: Number of steps skipped per iteration.

        Returns:
            Tensor of argmax token ids taken from the last iteration's
            prediction scores.
        """
        import time
        
        start_time = time.time()
        inp_ids = batch['input_ids']
        # NOTE(review): read but not used below.
        inp_token_type_ids = batch['token_type_ids']
        inp_shape = inp_ids.size()
        N = inp_shape[0]
        inp_pos_ids = self.model.bert.embeddings.position_ids[:, 0 : inp_shape[1]]
        inp_position_embeddings = self.model.bert.embeddings.position_embeddings(inp_pos_ids)
        inp_emb = self.model.bert.embeddings.word_embeddings(inp_ids)
        # mean,std = stats
        # mean = torch.tensor(mean).view(1,3,1,1)
        # std = torch.tensor(std).view(1,3,1,1)    
        # Initial state: standard normal noise scaled by 1/sqrt(hidden_size).
        noisy_word = torch.normal(0,1,(N,self.max_len,self.model.config.hidden_size)).to(device) / math.sqrt(self.model.config.hidden_size)
        token_type_ids = torch.zeros(N,self.max_len).long().to(device)
        # Every generated position is attended to (mask of ones).
        attention_mask = torch.ones(N,self.max_len).long().to(device)
        extended_attention_mask = self.model.bert.get_extended_attention_mask(attention_mask, attention_mask.shape)

        position_ids = self.model.bert.embeddings.position_ids[:, 0 : self.max_len]
        position_embeddings = self.model.bert.embeddings.position_embeddings(position_ids)
        # NOTE(review): computed but not added to the model input below.
        token_type_embeddings = self.model.bert.embeddings.token_type_embeddings(token_type_ids)
        # NOTE(review): if this loop never runs (max_step <= 1),
        # `prediction_scores` is undefined at the final argmax.
        for t in range(self.max_step-1,0,-k):
        #for t in range(1999,0,-1):

            #prepare time emb
            diffusion_steps = torch.ones(size = (N,),device=device).long()*t
            time_embedding = self.time_embed(diffusion_steps).unsqueeze(1)

            # Element-wise sum -- assumes the source length equals max_len; TODO confirm.
            model_input = inp_emb+noisy_word+inp_position_embeddings+position_embeddings+time_embedding
            model_input = self.model.bert.embeddings.LayerNorm(model_input)
            #denoise
            encoder_outputs = self.model.bert.encoder(
                model_input,
                attention_mask=extended_attention_mask,
                head_mask=[None] * self.model.config.num_hidden_layers
            )
            sequence_output = encoder_outputs[0]
            prediction_scores = self.model.cls.predictions(sequence_output)

            #clamp
            # pred = torch.argmax(prediction_scores,-1).long()
            # denoised_word = self.model.bert.embeddings.word_embeddings(pred)
            # Expected word embedding under the predicted token distribution.
            denoised_word = prediction_scores.softmax(-1) @ self.model.bert.embeddings.word_embeddings.weight.unsqueeze(0)
        
            #DDIM
            # Same schedule as in forward, evaluated at step t-k and at step t;
            # the 1e-5 keeps alpha_t strictly positive at t = max_step - 1.
            alpha_tk = 1 - math.sqrt((t+1-k)/self.max_step)#+1e-5
            alpha_t = 1 - math.sqrt((t+1)/self.max_step)+1e-5
            # Noise implied by the current state and the denoised estimate.
            noise = (noisy_word - math.sqrt(alpha_t)*denoised_word)/math.sqrt(1-alpha_t)
            noisy_word = math.sqrt(alpha_tk)*(noisy_word/math.sqrt(alpha_t) + (math.sqrt((1-alpha_tk)/alpha_tk) - math.sqrt((1-alpha_t)/alpha_t))*noise)
            #noisy_word = math.sqrt(alpha_tk)*denoised_word + math.sqrt(1-alpha_tk)*noise
            print(f"\rnoise level {t}  {time.time()-start_time:.2f}",end='')
        
        pred = torch.argmax(prediction_scores,-1).long()
        return pred

In [None]:
import evaluate
def compute_metrics(eval_preds):
    """Compute ROUGE between decoded predictions and the reference strings.

    Args:
        eval_preds: Pair ``(logits, labels)``. ``logits[0]`` is reduced with an
            argmax over its last axis to obtain token ids (presumably the
            vocabulary scores returned by the model -- TODO confirm);
            ``labels`` is passed to the metric as the references.

    Returns:
        The dictionary produced by the ``rouge`` metric's ``compute``.
    """
    metric = evaluate.load('rouge')
    logits, labels = eval_preds
    predictions = np.argmax(logits[0], axis=-1)
    # Drop [CLS]/[SEP]/[PAD] markers when decoding: the references are plain
    # strings, so leaving the markers in would count them as wrong words.
    preds = [
        train_set.tokenizer.decode(s, skip_special_tokens=True)
        for s in predictions
    ]
    return metric.compute(predictions=preds, references=labels)

In [None]:
# Run configuration: padded sequence length, number of diffusion steps and the
# local directory of the pretrained checkpoint used for initialization.
max_len = 64
diff_step = 300
initializing = 'base/bert-tiny'#'base/bert-mini'
device = torch.device('cuda')
model = diffusion_bert(initializing,max_len,diff_step)
state = torch.load(initializing+'/pytorch_model.bin', map_location=device) #"/Saved_Models/20220903bert_diffusion/bestloss.pkl")

model_dict = model.state_dict()
# 1. filter out unnecessary keys
# (a leading "module." prefix of 7 characters is stripped before matching)
# NOTE(review): parameters of the wrapped network live under the `model.`
# attribute of diffusion_bert, so checkpoint keys without that prefix would be
# dropped by the membership test below -- confirm the checkpoint's key names.
if list(state.keys())[0].startswith("module."):
    state = {k[7:]: v for k, v in state.items() if k[7:] in model_dict}
else:
    state = {k: v for k, v in state.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(state)
# 3. load the new state dict
model.load_state_dict(model_dict)

# model.load_state_dict(state,strict=True)
model = model.to(device)
model.eval()
print("Trial 1")

# Build the three datasets; each one loads its own tokenizer from `initializing`.
train_set = OvernightDataset(train_dict, init_model=initializing, max_len=max_len)
val_set = OvernightDataset(dev_dict, init_model=initializing, max_len=max_len)
test_set = OvernightDataset(test_all, init_model=initializing, max_len=max_len)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_set,         # training dataset
    eval_dataset=val_set,             # evaluation dataset
    compute_metrics=compute_metrics,
)

# Train, then close the Weights & Biases run.
trainer.train()
wandb.finish()

# print("Start decoding")

# out = model.sampler(device, 10, 128)
# with open("samples.txt", 'w', encoding="utf-8") as f:
#     for s in out:
#         sample = test_set.tokenizer.decode(s.cpu().flatten())
#         f.write(sample+"\n")  


Trial 1


***** Running training *****
  Num examples = 1600
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 548026
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


[34m[1mwandb[0m: Currently logged in as: [33mquangminhdinh[0m. Use [1m`wandb login --relogin`[0m to force relogin


TypeError: ignored

In [None]:
from torch.utils.data import DataLoader

# Take the first batch of the (unshuffled) test set for manual inspection.
test_dataloader = DataLoader(test_set, batch_size=64, shuffle=False)
batch = next(iter(test_dataloader))
# Move only tensor fields to the device: the raw target strings under
# 'labels_lmao' are collated into a plain Python list, which has no `.to`.
for key, value in batch.items():
    if torch.is_tensor(value):
        batch[key] = value.to(device)
batch['labels']

tensor([[ 101, 2711, 2008,  ...,    0,    0,    0],
        [ 101, 3116, 3005,  ...,    0,    0,    0],
        [ 101, 3116, 3005,  ...,    0,    0,    0],
        ...,
        [ 101, 3116, 2008,  ...,    0,    0,    0],
        [ 101, 3116, 3005,  ...,    0,    0,    0],
        [ 101, 3116, 3005,  ...,    0,    0,    0]], device='cuda:0')

In [None]:
# NOTE(review): diffusion_bert declares only __init__, forward and sampler, and
# this call raised AttributeError when the cell was run. Presumably
# model.sampler(device, batch) or model(**batch) was intended -- confirm.
model.predict(batch)

AttributeError: ignored

In [None]:
# Generate sequences with the sampler and, for comparison, run one forward
# pass over the same batch.
out = model.sampler(device, batch)
# Inspection only: no gradients are needed, so do not record a graph for the
# forward pass.
with torch.no_grad():
    _, otpred, _ = model(**batch)
oot = torch.argmax(otpred,-1).long()
# Show rows 5..9 of the batch. The same row index is used for the sample, the
# reference and the forward-pass prediction so that they describe one example.
for i in range(5, min(10, len(out))):
    sample = test_set.tokenizer.decode(out[i].cpu().flatten())
    org = test_set.tokenizer.decode(batch['labels'][i].cpu().flatten())
    ot = test_set.tokenizer.decode(oot[i].cpu().flatten())
    print()
    # print(sample)
    print("ot:", ot)
    print("org:", org)

noise level 1  4.00
ot: [CLS] meeting that has at least two two [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [CLS] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [CLS] [CLS] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [CLS] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] at least least two [SEP] [SEP]
org: [CLS] meeting that has at most two location [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

ot: [CLS] meeting that has the largest length [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] the the and the [SEP] [SEP] [SEP] [SEP] [

In [None]:
# Number of sequences returned by the sampler (first dimension of `out`).
len(out)

128

In [None]:
# Run the model on a single test example.
with torch.no_grad():
  emp = test_set[0]
  # forward() indexes its inputs as batched 2-D tensors and requires every
  # field of the example, so add a leading batch dimension to each tensor,
  # move it to the model's device and pass all fields by name. The raw target
  # string is forwarded unchanged.
  emp_batch = {
      key: (value.unsqueeze(0).to(device) if torch.is_tensor(value) else value)
      for key, value in emp.items()
  }
  outputs = model(**emp_batch)
outputs

IndexError: ignored