In [3]:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("prajjwal1/bert-medium")

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-medium")

In [5]:
import os
import datasets
training_data_path = '/home/ramvenkat98/.cache/inversion/0aaa9cff054220b8af32ddcf5a1e837b.arrow'
train_dataset = datasets.load_from_disk(training_data_path)['train']

In [13]:
example = tokenizer.tokenize('The quick brown fox jumped over the lazy dog.')

In [14]:
example

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.']

In [6]:
import transformers
t5_tokenizer = transformers.AutoTokenizer.from_pretrained('t5-base', padding = True, truncation = 'max_length', max_length = 32)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [33]:
x = train_dataset[0]['input_ids']
y = tokenizer(t5_tokenizer.decode(train_dataset[0]['embedder_input_ids']))

In [35]:
y

{'input_ids': [101, 2000, 1037, 2460, 2862, 1997, 13527, 1012, 7208, 2064, 5258, 2065, 1996, 5997, 7288, 2119, 10445, 2265, 5020, 7857, 1010, 2174, 2027, 2024, 6628, 2000, 5454, 1026, 1013, 1055, 1028, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [37]:
train_dataset[0]['frozen_embeddings'].shape

torch.Size([768])

In [43]:
import torch
output = model(torch.tensor(y['input_ids'])[None, ...], torch.tensor(y['attention_mask'])[None, ...])

In [53]:
# output[0].shape
# output[1].shape
# type(output)
# output.last_hidden_state.shape
output[1].shape

torch.Size([1, 512])

In [67]:
x = t5_tokenizer.decode(train_dataset[0]['embedder_input_ids'][:-1])

In [68]:
x2 = t5_tokenizer(x)['input_ids']

In [72]:
torch.tensor(x2) == train_dataset[0]['embedder_input_ids']

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True])

In [87]:
y0 = tokenizer(x)
y = tokenizer(x)['input_ids']

In [85]:
tokenizer.decode(y)

'[CLS] to a short list of finalists. ties can occur if the panel decides both entries show equal merit, however they are encouraged to choose [SEP]'

In [88]:
output = model(torch.tensor(y0['input_ids'])[None, ...], torch.tensor(y0['attention_mask'])[None, ...])

In [107]:
a = output[0][0][0]
b = output[1][0]

In [120]:
output.pooler_output[0] == output[1][0]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

In [13]:
import torch.nn as nn

class EmbeddingImitator(nn.Module):
    def __init__(self):
        super(EmbeddingImitator, self).__init__()
        self.bert = AutoModel.from_pretrained("prajjwal1/bert-medium")
        self.embedder_imitator = nn.Sequential(nn.Dropout(0.2), nn.Linear(512, 768))
    def forward(self, input_ids, attention_masks):
        output = self.bert(input_ids, attention_masks)
        return self.embedder_imitator(output.pooler_output)

imitator_model = EmbeddingImitator()

In [123]:
train_dataset[0]['frozen_embeddings'].shape

torch.Size([768])

In [17]:
import torch 

# parameters_to_optimize = tuple(p for (n, p) in imitator_model.named_parameters()) # if 'embedder_imitator' in n)
optimizer = torch.optim.AdamW(imitator_model.parameters(), lr=5e-5, eps=1e-8)

from transformers import get_linear_schedule_with_warmup
epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,       
                 num_warmup_steps=0, num_training_steps=total_steps)


In [147]:
y = train_dataset['frozen_embeddings']

In [153]:
x = train_dataset['embedder_input_ids']

In [160]:
decoded_x = t5_tokenizer.batch_decode(x[:100, :-1])

In [178]:
x_tokens = tokenizer(decoded_x, padding = 'max_length', max_length = 40, return_attention_mask = True, return_tensors = 'pt')

In [182]:
# x_tokens['attention_mask'][0]


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [7]:
def generate_x_and_y(initial_dataset):
    y = initial_dataset['frozen_embeddings']
    x_initial = initial_dataset['embedder_input_ids']
    decoded_x_initial = t5_tokenizer.batch_decode(x_initial[:, :-1])
    x_arguments = tokenizer(decoded_x_initial, padding = 'max_length', max_length = 50, return_attention_mask = True, return_tensors = 'pt')
    return x_arguments['input_ids'], x_arguments['attention_mask'], y

# input_ids, attention_mask, y = generate_x_and_y(train_dataset)

In [26]:
# attention_mask.shape
from torch.nn.utils.clip_grad import clip_grad_norm_
import torch.nn.functional as F

In [79]:
from torch.nn.utils.clip_grad import clip_grad_norm_
import torch.nn.functional as F
def train(model, optimizer, scheduler, epochs,       
          train_dataloader, device, clip_value=2):
    for epoch in range(epochs):
        print(epoch)
        print("-----")
        best_loss = 1e10
        model.train()
        for step, batch in enumerate(train_dataloader):  
            batch_inputs, batch_masks, batch_labels = \
                               tuple(b.to(device) for b in batch)
            model.zero_grad()
            outputs = model(batch_inputs, batch_masks)
            loss = nn.MSELoss(reduction = 'mean')(outputs, batch_labels)
            if step % 100 == 0:
                print("Step is", step)
                print("Loss is", loss.item())
                with torch.no_grad():
                    cosine_similarity = torch.mean(F.cosine_similarity(outputs, batch_labels, axis = 1))
                print("Average cosine similarity is", cosine_similarity.item())
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                    'step': step,
                    
                }
                torch.save(checkpoint, 'vec2text/encoder_ckpt.pt')
            loss.backward()
            clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            scheduler.step()
                
    return model
imitator_model.to(device)
trained_imitator_model = train(imitator_model, optimizer, scheduler, epochs, 
              train_dataloader, device, clip_value=2)

0
-----
Step is 0
Loss is 0.0013903728686273098
Average cosine similarity is 0.241427943110466
Step is 100
Loss is 0.0013903598301112652
Average cosine similarity is 0.257038950920105
Step is 200
Loss is 0.0013375859707593918
Average cosine similarity is 0.2624405026435852
Step is 300
Loss is 0.0012808465398848057
Average cosine similarity is 0.2649364471435547
Step is 400
Loss is 0.001277725212275982
Average cosine similarity is 0.2814529836177826
Step is 500
Loss is 0.0012651182478293777
Average cosine similarity is 0.2837401032447815
Step is 600
Loss is 0.0012783958809450269
Average cosine similarity is 0.2902223467826843
Step is 700
Loss is 0.0012238313211128116
Average cosine similarity is 0.29624319076538086
Step is 800
Loss is 0.0013084536185488105
Average cosine similarity is 0.3037753999233246
Step is 900
Loss is 0.0013287676265463233
Average cosine similarity is 0.29697126150131226
Step is 1000
Loss is 0.0012957241851836443
Average cosine similarity is 0.3121671974658966
Step

In [16]:
t5_tokenizer.decode(train_dataset[80161]['embedder_input_ids'])

'<unk> <unk> <unk> <unk>.. <unk> <unk>, <unk> ; <unk> <unk>, <unk> <unk></s>'

In [22]:
dataset = datasets.load_from_disk(training_data_path)

In [18]:
dataset.keys()

dict_keys(['train', 'dev', 'validation'])

In [21]:
len(dataset['validation'])

500

In [15]:
import torch
from torch.utils.data import TensorDataset, DataLoader
batch_size = 32
def create_dataloaders(inputs, masks, embeddings, batch_size):
    dataset = TensorDataset(inputs, masks, embeddings)
    dataloader = DataLoader(dataset, batch_size = batch_size, shuffle=True)
    return dataloader

d = torch.load('vec2text/encoder_train.pth')
train_dataloader = create_dataloaders(d['input_ids'], d['attention_mask'], d['y'], batch_size)
# test_dataloader = create_dataloaders(test_inputs, test_masks, test_labels, batch_size)

In [10]:
device = torch.device('cuda')

In [80]:
type(trained_imitator_model)

__main__.EmbeddingImitator

In [23]:
val_input_ids, val_attention_mask, val_y = generate_x_and_y(dataset['validation'])

In [24]:
val_dataloader = create_dataloaders(val_input_ids, val_attention_mask, val_y, 1)

In [89]:
trained_imitator_model.eval()
total_loss, total_cosine_similarity, num_steps = 0, 0, 0
for step, batch in enumerate(val_dataloader):  
    batch_inputs, batch_masks, batch_labels = \
                       tuple(b.to(device) for b in batch)
    with torch.no_grad():
        outputs = trained_imitator_model(batch_inputs, batch_masks)
        loss = nn.MSELoss(reduction = 'mean')(outputs, batch_labels)
        cosine_similarity = torch.mean(F.cosine_similarity(outputs, batch_labels, axis = 1))
        print(f"Loss is {loss}, cosine similarity is {cosine_similarity}")
        total_loss += loss
        total_cosine_similarity += cosine_similarity
        num_steps += 1
print(f"Average loss is {total_loss / num_steps}, average cosine similairty is {total_cosine_similarity / num_steps}")
trained_imitator_model.train()


Loss is 0.00020747509552165866, cosine similarity is 0.914955735206604
Loss is 0.00017199732246808708, cosine similarity is 0.9416232109069824
Loss is 0.00025209831073880196, cosine similarity is 0.9013729095458984
Loss is 0.00029677432030439377, cosine similarity is 0.8741474151611328
Loss is 0.0001460333587601781, cosine similarity is 0.9367665648460388
Loss is 0.00022999118664301932, cosine similarity is 0.9245437383651733
Loss is 0.00034799971035681665, cosine similarity is 0.9035791158676147
Loss is 0.00031823987956158817, cosine similarity is 0.8790779113769531
Loss is 0.00019836888532154262, cosine similarity is 0.9146610498428345
Loss is 0.0003445385955274105, cosine similarity is 0.8880338668823242
Loss is 0.00018020688730757684, cosine similarity is 0.9162075519561768
Loss is 0.000392022542655468, cosine similarity is 0.8638827204704285
Loss is 0.0004387799126561731, cosine similarity is 0.8163949847221375
Loss is 0.0002225110656581819, cosine similarity is 0.8995048999786377

EmbeddingImitator(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-7): 8 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_a

In [18]:
def load_model_from_checkpoint(ckpt_file):
    d = torch.load(ckpt_file, map_location = device)
    ckpt_model = EmbeddingImitator().to(device)
    ckpt_model.load_state_dict(d['model'])
    optimizer = torch.optim.AdamW(ckpt_model.parameters(), lr=5e-5, eps=1e-8)
    optimizer.load_state_dict(d['optimizer'])
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,       
                     num_warmup_steps=0, num_training_steps=total_steps)
    scheduler.load_state_dict(d['scheduler'])
    step = d['step']
    return ckpt_model, optimizer, scheduler, step
    # print(d.keys())

ckpt_model, optimizer, scheduler, step = load_model_from_checkpoint('vec2text/encoder_ckpt.pt')

In [27]:
ckpt_model.eval()
total_loss, total_cosine_similarity, num_steps = 0, 0, 0
losses, cosine_similarities = [], []
for step, batch in enumerate(val_dataloader):  
    batch_inputs, batch_masks, batch_labels = \
                       tuple(b.to(device) for b in batch)
    with torch.no_grad():
        outputs = ckpt_model(batch_inputs, batch_masks)
        loss = nn.MSELoss(reduction = 'mean')(outputs, batch_labels)
        cosine_similarity = torch.mean(F.cosine_similarity(outputs, batch_labels, axis = 1))
        print(f"Loss is {loss}, cosine similarity is {cosine_similarity}")
        total_loss += loss
        total_cosine_similarity += cosine_similarity
        num_steps += 1
        losses.append(loss)
        cosine_similarities.append(cosine_similarity)
print(f"Average loss is {total_loss / num_steps}, average cosine similarity is {total_cosine_similarity / num_steps}")
ckpt_model.train()
print("Ok")


Loss is 0.000201265313080512, cosine similarity is 0.9328641295433044
Loss is 0.0001845802617026493, cosine similarity is 0.9328204393386841
Loss is 0.00037471865653060377, cosine similarity is 0.919253945350647
Loss is 0.00035299311275593936, cosine similarity is 0.8816461563110352
Loss is 0.0002200256276410073, cosine similarity is 0.8935933113098145
Loss is 0.00021952686074655503, cosine similarity is 0.9025888442993164
Loss is 0.00022961861395742744, cosine similarity is 0.9599035978317261
Loss is 0.00012432884250301868, cosine similarity is 0.9510628581047058
Loss is 0.00020708501688204706, cosine similarity is 0.9475024938583374
Loss is 0.00019147046259604394, cosine similarity is 0.9086481332778931
Loss is 0.0001371324760839343, cosine similarity is 0.9416733980178833
Loss is 0.00027080756262876093, cosine similarity is 0.8909413814544678
Loss is 0.00024880183627828956, cosine similarity is 0.9150272011756897
Loss is 0.00022976593754719943, cosine similarity is 0.92469322681427


In [103]:
small_train_dataset = train_dataset.select(range(1000000))

In [105]:
len(small_train_dataset)

1000000

In [108]:
small_train_dataset._fingerprint

'63459ca013ed10d2'

In [116]:
cosine_similarities = [x.item() for x in cosine_similarities]
cosine_similarities.sort()

In [117]:
cosine_similarities

[0.7320132255554199,
 0.810746431350708,
 0.8132932186126709,
 0.8142696619033813,
 0.8163949847221375,
 0.8186646699905396,
 0.8212689161300659,
 0.824422299861908,
 0.8250194787979126,
 0.8265194892883301,
 0.827803909778595,
 0.8279740214347839,
 0.8328733444213867,
 0.8341557383537292,
 0.835084080696106,
 0.835364818572998,
 0.8353720903396606,
 0.8370987772941589,
 0.8372179865837097,
 0.8382643461227417,
 0.840262770652771,
 0.8408242464065552,
 0.8425261378288269,
 0.8435231447219849,
 0.8443073034286499,
 0.8450099229812622,
 0.845378041267395,
 0.8472714424133301,
 0.8476587533950806,
 0.8477445244789124,
 0.8477686047554016,
 0.8478595614433289,
 0.8485425114631653,
 0.8486026525497437,
 0.8487082719802856,
 0.8499027490615845,
 0.8514955043792725,
 0.8520119190216064,
 0.8549336194992065,
 0.8554939031600952,
 0.8563321828842163,
 0.8565642833709717,
 0.8566177487373352,
 0.856680154800415,
 0.8570780158042908,
 0.8570824265480042,
 0.8577202558517456,
 0.8579329252243042,


In [135]:
losses = [x.item() for x in losses]
losses.sort()

In [136]:
losses

[0.00011108593025710434,
 0.00011696753790602088,
 0.0001241244317498058,
 0.00012432884250301868,
 0.00012657935440074652,
 0.0001280630094697699,
 0.00012890322250314057,
 0.00012928104843012989,
 0.00013027017121203244,
 0.00013685153680853546,
 0.0001371324760839343,
 0.00013766149641014636,
 0.00014039063535165042,
 0.00014042743714526296,
 0.00014120075502432883,
 0.00014260913303587586,
 0.00014328146062325686,
 0.0001460333587601781,
 0.00014642355381511152,
 0.00014721721527166665,
 0.00014858080248814076,
 0.000149428698932752,
 0.00015114396228455007,
 0.00015474278188776225,
 0.0001549966400489211,
 0.0001553195179440081,
 0.00015563360648229718,
 0.00015663904196117073,
 0.000157435642904602,
 0.00015745341079309583,
 0.00015749013982713223,
 0.00015809127944521606,
 0.00015829005860723555,
 0.0001597653899807483,
 0.00016084997332654893,
 0.00016090429562609643,
 0.00016143626999109983,
 0.00016292584768962115,
 0.0001630478072911501,
 0.0001645696465857327,
 0.0001653406

In [126]:
ckpt_model.eval()
shuffled_total_loss, shuffled_total_cosine_similarity, num_steps = 0, 0, 0
shuffled_losses, shuffled_cosine_similarities = [], []
total_inputs, total_labels = [], []
for step, batch in enumerate(val_dataloader):  
    batch_inputs, batch_masks, batch_labels = \
                       tuple(b.to(device) for b in batch)
    total_inputs.append((batch_inputs, batch_masks))
    total_labels.append(batch_labels)
random.shuffle(total_labels)
for i in range(len(total_labels)):
    batch_labels = total_labels[i]
    batch_inputs, batch_masks = total_inputs[i]
    with torch.no_grad():
        outputs = ckpt_model(batch_inputs, batch_masks)
        loss = nn.MSELoss(reduction = 'mean')(outputs, batch_labels)
        cosine_similarity = torch.mean(F.cosine_similarity(outputs, batch_labels, axis = 1))
        print(f"Loss is {loss}, cosine similarity is {cosine_similarity}")
        shuffled_total_loss += loss
        shuffled_total_cosine_similarity += cosine_similarity
        num_steps += 1
        shuffled_losses.append(loss)
        shuffled_cosine_similarities.append(cosine_similarity)
print(f"Average loss is {shuffled_total_loss / num_steps}, average cosine similarity is {shuffled_total_cosine_similarity / num_steps}")
ckpt_model.train()
print("Ok")


Loss is 0.0025017058942466974, cosine similarity is 0.10230585932731628
Loss is 0.0021504764445126057, cosine similarity is 0.13265344500541687
Loss is 0.001960122724995017, cosine similarity is 0.11495120823383331
Loss is 0.0014108726754784584, cosine similarity is 0.23566193878650665
Loss is 0.001991583500057459, cosine similarity is 0.171352356672287
Loss is 0.00237068347632885, cosine similarity is 0.15143223106861115
Loss is 0.0016030361875891685, cosine similarity is 0.29218143224716187
Loss is 0.0015502248425036669, cosine similarity is 0.2548300623893738
Loss is 0.0019725423771888018, cosine similarity is 0.1092580258846283
Loss is 0.0019387475913390517, cosine similarity is 0.3027504086494446
Loss is 0.0023721279576420784, cosine similarity is 0.1540554165840149
Loss is 0.0018261242657899857, cosine similarity is 0.16448470950126648
Loss is 0.0018968239892274141, cosine similarity is 0.16771259903907776
Loss is 0.002811871934682131, cosine similarity is -0.005250077694654465
L

In [128]:
shuffled_cosine_similarities = [x.item() for x in shuffled_cosine_similarities]

In [130]:
shuffled_cosine_similarities.sort()

In [131]:
shuffled_cosine_similarities

[-0.005250077694654465,
 0.006128966808319092,
 0.016423046588897705,
 0.01939317211508751,
 0.0280561912804842,
 0.03169488161802292,
 0.04027935862541199,
 0.04035881161689758,
 0.04101588577032089,
 0.04137773811817169,
 0.04176996275782585,
 0.04566887021064758,
 0.04702749103307724,
 0.048779819160699844,
 0.05180415138602257,
 0.0542839840054512,
 0.056490130722522736,
 0.058621518313884735,
 0.060140371322631836,
 0.06102417781949043,
 0.06369052827358246,
 0.06582695990800858,
 0.06632909178733826,
 0.06640300154685974,
 0.06671876460313797,
 0.06878117471933365,
 0.06878533214330673,
 0.06892585754394531,
 0.06892891228199005,
 0.06924493610858917,
 0.06936267018318176,
 0.0732625275850296,
 0.07327844202518463,
 0.07405014336109161,
 0.07479973137378693,
 0.07541827857494354,
 0.07569096982479095,
 0.07633288204669952,
 0.07676059752702713,
 0.07704039663076401,
 0.07737255841493607,
 0.07898499071598053,
 0.07982048392295837,
 0.08067812025547028,
 0.0811188668012619,
 0.081

In [137]:
shuffled_losses = [x.item() for x in shuffled_losses]
shuffled_losses.sort()

In [138]:
shuffled_losses

[0.00021369277965277433,
 0.00026240316219627857,
 0.0011340560158714652,
 0.0011349604465067387,
 0.0011655609123408794,
 0.0011828412534669042,
 0.0011922584380954504,
 0.0012068551732227206,
 0.0012469906359910965,
 0.001252810936421156,
 0.0012708164285868406,
 0.0013119339710101485,
 0.0013296834658831358,
 0.001344406045973301,
 0.0013478172477334738,
 0.0013527718838304281,
 0.0013559277867898345,
 0.0013672250788658857,
 0.0013750458601862192,
 0.00137874367646873,
 0.0013933380832895637,
 0.0013966148253530264,
 0.0014062285190448165,
 0.0014108726754784584,
 0.0014282043557614088,
 0.0014287522062659264,
 0.0014309824910014868,
 0.0014344744849950075,
 0.0014383853413164616,
 0.001442166743800044,
 0.0014436033088713884,
 0.001453774399124086,
 0.0014572771033272147,
 0.0014594607055187225,
 0.0014595309039577842,
 0.0014615047257393599,
 0.0014625604962930083,
 0.00147339329123497,
 0.0014746598899364471,
 0.0014791563153266907,
 0.0014881943352520466,
 0.0014925244031473994

In [29]:
d = torch.load('vec2text/encoder_train.pth')
dev_dataloader = create_dataloaders(d['input_ids'][:1000], d['attention_mask'][:1000], d['y'][:1000], 1)

In [33]:
ckpt_model.eval()
total_loss, total_cosine_similarity, num_steps = 0, 0, 0
losses, cosine_similarities = [], []
for step, batch in enumerate(dev_dataloader):  
    batch_inputs, batch_masks, batch_labels = \
                       tuple(b.to(device) for b in batch)
    with torch.no_grad():
        outputs = ckpt_model(batch_inputs, batch_masks)
        loss = nn.MSELoss(reduction = 'mean')(outputs, batch_labels)
        cosine_similarity = torch.mean(F.cosine_similarity(outputs, batch_labels, axis = 1))
        print(f"Loss is {loss}, cosine similarity is {cosine_similarity}")
        total_loss += loss
        total_cosine_similarity += cosine_similarity
        num_steps += 1
        losses.append(loss)
        cosine_similarities.append(cosine_similarity)
print(f"Average loss is {total_loss / num_steps}, average cosine similarity is {total_cosine_similarity / num_steps}")
ckpt_model.train()
print("Ok")


Loss is 0.00034703948767855763, cosine similarity is 0.8845609426498413
Loss is 0.00015171521226875484, cosine similarity is 0.9598040580749512
Loss is 0.0002715492737479508, cosine similarity is 0.915438711643219
Loss is 0.0002647139481268823, cosine similarity is 0.9158468246459961
Loss is 0.00034892186522483826, cosine similarity is 0.823992908000946
Loss is 0.0002693712885957211, cosine similarity is 0.9180164933204651
Loss is 0.00023077258083503693, cosine similarity is 0.8852377533912659
Loss is 0.00019297609105706215, cosine similarity is 0.9308726191520691
Loss is 0.00020897113427054137, cosine similarity is 0.9233545064926147
Loss is 0.00023404927924275398, cosine similarity is 0.9047061800956726
Loss is 0.00022918509785085917, cosine similarity is 0.9026240110397339
Loss is 0.00030158329172991216, cosine similarity is 0.8445485830307007
Loss is 0.0001762212486937642, cosine similarity is 0.8694183826446533
Loss is 0.00037639212678186595, cosine similarity is 0.862843871116638

In [32]:
ckpt_model.eval()
shuffled_total_loss, shuffled_total_cosine_similarity, num_steps = 0, 0, 0
shuffled_losses, shuffled_cosine_similarities = [], []
total_inputs, total_labels = [], []
for step, batch in enumerate(dev_dataloader):  
    batch_inputs, batch_masks, batch_labels = \
                       tuple(b.to(device) for b in batch)
    total_inputs.append((batch_inputs, batch_masks))
    total_labels.append(batch_labels)
random.shuffle(total_labels)
for i in range(len(total_labels)):
    batch_labels = total_labels[i]
    batch_inputs, batch_masks = total_inputs[i]
    with torch.no_grad():
        outputs = ckpt_model(batch_inputs, batch_masks)
        loss = nn.MSELoss(reduction = 'mean')(outputs, batch_labels)
        cosine_similarity = torch.mean(F.cosine_similarity(outputs, batch_labels, axis = 1))
        print(f"Loss is {loss}, cosine similarity is {cosine_similarity}")
        shuffled_total_loss += loss
        shuffled_total_cosine_similarity += cosine_similarity
        num_steps += 1
        shuffled_losses.append(loss)
        shuffled_cosine_similarities.append(cosine_similarity)
print(f"Average loss is {shuffled_total_loss / num_steps}, average cosine similarity is {shuffled_total_cosine_similarity / num_steps}")
ckpt_model.train()
print("Ok")


Loss is 0.0013725137105211616, cosine similarity is 0.31064289808273315
Loss is 0.0018722068052738905, cosine similarity is 0.18432292342185974
Loss is 0.0017442877870053053, cosine similarity is 0.09438686072826385
Loss is 0.0016737020341679454, cosine similarity is 0.2538451552391052
Loss is 0.001905354904010892, cosine similarity is 0.15207993984222412
Loss is 0.0019186509307473898, cosine similarity is 0.15601420402526855
Loss is 0.0027044578455388546, cosine similarity is 0.1977454125881195
Loss is 0.0030481077264994383, cosine similarity is 0.07073257118463516
Loss is 0.0020644075702875853, cosine similarity is 0.1569754183292389
Loss is 0.002066807821393013, cosine similarity is 0.23544487357139587
Loss is 0.0012682032538577914, cosine similarity is 0.27518439292907715
Loss is 0.0020177629776299, cosine similarity is 0.1666828691959381
Loss is 0.001945624127984047, cosine similarity is 0.19052603840827942
Loss is 0.001782525097951293, cosine similarity is 0.13150839507579803
Los

In [37]:
import random

ckpt_model.eval()
total_loss_with_random_masking, total_cosine_similarity_with_random_masking, num_steps = 0, 0, 0
losses_with_random_masking, cosine_similarities_with_random_masking = [], []
for step, batch in enumerate(dev_dataloader):  
    batch_inputs, batch_masks, batch_labels = \
                       tuple(b.to(device) for b in batch)
    assert len(batch_masks) == 1
    for i in range(len(batch_masks[0])):
        x = random.randint(1, 100)
        if x <= 15:
            batch_masks[0][i] = 0
    with torch.no_grad():
        outputs = ckpt_model(batch_inputs, batch_masks)
        loss = nn.MSELoss(reduction = 'mean')(outputs, batch_labels)
        cosine_similarity = torch.mean(F.cosine_similarity(outputs, batch_labels, axis = 1))
        print(f"Loss is {loss}, cosine similarity is {cosine_similarity}")
        total_loss_with_random_masking += loss
        total_cosine_similarity_with_random_masking += cosine_similarity
        num_steps += 1
        losses_with_random_masking.append(loss)
        cosine_similarities_with_random_masking.append(cosine_similarity)
print(f"Average loss is {total_loss / num_steps}, average cosine similarity is {total_cosine_similarity / num_steps}")
ckpt_model.train()
print("Ok")

Loss is 0.00033469541813246906, cosine similarity is 0.8999539613723755
Loss is 0.0003312203916721046, cosine similarity is 0.861857533454895
Loss is 0.0003457994316704571, cosine similarity is 0.825818657875061
Loss is 0.0005556277465075254, cosine similarity is 0.8086585402488708
Loss is 0.00030626077204942703, cosine similarity is 0.8890089392662048
Loss is 0.0005330753047019243, cosine similarity is 0.9033141136169434
Loss is 0.0006348564056679606, cosine similarity is 0.7357739210128784
Loss is 0.00022172402532305568, cosine similarity is 0.9083302021026611
Loss is 0.0003119086322840303, cosine similarity is 0.8910019397735596
Loss is 0.00021917850244790316, cosine similarity is 0.9049865007400513
Loss is 0.0004959963262081146, cosine similarity is 0.7867574095726013
Loss is 0.00021030788775533438, cosine similarity is 0.9209293723106384
Loss is 0.00023879171931184828, cosine similarity is 0.8898021578788757
Loss is 0.0002645356289576739, cosine similarity is 0.9120988845825195
Lo

In [150]:
print(f"Average loss is {total_loss_with_random_masking / num_steps}, average cosine similarity is {total_cosine_similarity_with_random_masking / num_steps}")

Average loss is 0.0003989460819866508, average cosine similarity is 0.851055920124054


In [36]:
ckpt_model.eval()
total_loss_with_random_shuffling, total_cosine_similarity_with_random_shuffling, num_steps = 0, 0, 0
losses_with_random_shuffling, cosine_similarities_with_random_shuffling = [], []
for step, batch in enumerate(dev_dataloader):  
    batch_inputs, batch_masks, batch_labels = \
                       tuple(b.to(device) for b in batch)
    # assert len(batch_masks) == 1
    first_zero = batch_masks[0].tolist().index(0)
    indices = torch.randperm(first_zero)
    # print(batch_inputs.shape)
    batch_inputs[0][:first_zero] = batch_inputs[0][indices]
    # print(batch_inputs.shape)
    batch_inputs.to(device)
    batch_masks.to(device)
    batch_labels.to(device)
    with torch.no_grad():
        outputs = ckpt_model(batch_inputs, batch_masks)
        loss = nn.MSELoss(reduction = 'mean')(outputs, batch_labels)
        cosine_similarity = torch.mean(F.cosine_similarity(outputs, batch_labels, axis = 1))
        print(f"Loss is {loss}, cosine similarity is {cosine_similarity}")
        total_loss_with_random_shuffling += loss
        total_cosine_similarity_with_random_shuffling += cosine_similarity
        num_steps += 1
        losses_with_random_shuffling.append(loss)
        cosine_similarities_with_random_shuffling.append(cosine_similarity)
print(f"Average loss is {total_loss_with_random_shuffling / num_steps}, average cosine similarity is {total_cosine_similarity_with_random_shuffling / num_steps}")
ckpt_model.train()
print("Ok")

Loss is 0.0004706304462160915, cosine similarity is 0.7769776582717896
Loss is 0.0003401490394026041, cosine similarity is 0.8740584850311279
Loss is 0.00036414663190953434, cosine similarity is 0.9067336320877075
Loss is 0.0005131278885528445, cosine similarity is 0.8781836032867432
Loss is 0.0003856335242744535, cosine similarity is 0.8223456740379333
Loss is 0.000517177686560899, cosine similarity is 0.7741636037826538
Loss is 0.0006025579641573131, cosine similarity is 0.7883040308952332
Loss is 0.0003872641536872834, cosine similarity is 0.8278661370277405
Loss is 0.0004440720076672733, cosine similarity is 0.8340001106262207
Loss is 0.0031016762368381023, cosine similarity is 0.664097785949707
Loss is 0.0006589492550119758, cosine similarity is 0.7721173763275146
Loss is 0.0009680635994300246, cosine similarity is 0.7767046689987183
Loss is 0.0004901222418993711, cosine similarity is 0.7655036449432373
Loss is 0.0004335549601819366, cosine similarity is 0.8707740902900696
Loss is

In [1]:
import torch
x = torch.zeros(5)

In [2]:
x.to('cuda')

tensor([0., 0., 0., 0., 0.], device='cuda:0')

In [38]:
losses_with_random_shuffling = [x.item() for x in losses_with_random_shuffling]
losses_with_random_shuffling.sort()
losses_with_random_shuffling

[0.00015744820120744407,
 0.0001788912049960345,
 0.0001945246331160888,
 0.00020239729201421142,
 0.00020754749129991978,
 0.00021101589663885534,
 0.0002141661534551531,
 0.00021638191537931561,
 0.00021655338059645146,
 0.00022269882902037352,
 0.00022394966799765825,
 0.0002264888898935169,
 0.00022873017587698996,
 0.00023178190167527646,
 0.0002341864601476118,
 0.0002346317924093455,
 0.0002347747067688033,
 0.00023706114734522998,
 0.0002371105074416846,
 0.0002380890364293009,
 0.00023997804964892566,
 0.00024289572320412844,
 0.0002441082615405321,
 0.0002441716496832669,
 0.00024786486756056547,
 0.00024828073219396174,
 0.00024828745517879725,
 0.0002489787875674665,
 0.00024960466544143856,
 0.0002501803101040423,
 0.0002502124407328665,
 0.00025167042622342706,
 0.0002522544236853719,
 0.00025230494793504477,
 0.00025374244432896376,
 0.00025485813966952264,
 0.00025498290779069066,
 0.0002553724916651845,
 0.00025538683985359967,
 0.0002555310493335128,
 0.00025667267618

In [39]:
cosine_similarities_with_random_shuffling = [x.item() for x in cosine_similarities_with_random_shuffling]
cosine_similarities_with_random_shuffling.sort()
cosine_similarities_with_random_shuffling

[0.492878258228302,
 0.497690349817276,
 0.5120452046394348,
 0.5254561305046082,
 0.528239369392395,
 0.5315418243408203,
 0.5452390909194946,
 0.5642127990722656,
 0.566831111907959,
 0.59343022108078,
 0.6000950336456299,
 0.6034047603607178,
 0.6066009998321533,
 0.6073645949363708,
 0.6161774396896362,
 0.6207191944122314,
 0.6209698915481567,
 0.6246739625930786,
 0.6265411376953125,
 0.6324042081832886,
 0.6331735849380493,
 0.6356929540634155,
 0.6357144117355347,
 0.6362583637237549,
 0.6421958208084106,
 0.6439346075057983,
 0.646935224533081,
 0.649000883102417,
 0.6497924327850342,
 0.6542037129402161,
 0.6545015573501587,
 0.6546114087104797,
 0.6547298431396484,
 0.6553005576133728,
 0.659751296043396,
 0.6604323387145996,
 0.6612164974212646,
 0.663577675819397,
 0.664097785949707,
 0.665626049041748,
 0.6657345294952393,
 0.666435956954956,
 0.6698774695396423,
 0.6699636578559875,
 0.6717379093170166,
 0.6735794544219971,
 0.6749966740608215,
 0.6772600412368774,
 0.68

In [41]:
cosine_similarities = [x.item() for x in cosine_similarities]
cosine_similarities.sort()
cosine_similarities

[0.7883737683296204,
 0.7888647317886353,
 0.7911190986633301,
 0.7981510162353516,
 0.7984442710876465,
 0.8048616647720337,
 0.8083617687225342,
 0.8132818937301636,
 0.8165625333786011,
 0.8171913027763367,
 0.8180512189865112,
 0.8183924555778503,
 0.8184819221496582,
 0.8207887411117554,
 0.8213177919387817,
 0.8217995762825012,
 0.821839451789856,
 0.823992908000946,
 0.8241398334503174,
 0.8263751864433289,
 0.8290495872497559,
 0.8300923109054565,
 0.8305149078369141,
 0.8320049047470093,
 0.8321886658668518,
 0.8340901732444763,
 0.834126353263855,
 0.8361364603042603,
 0.83637934923172,
 0.8373041152954102,
 0.8387184143066406,
 0.839752197265625,
 0.8403254151344299,
 0.8418875336647034,
 0.8440239429473877,
 0.8445485830307007,
 0.8455805778503418,
 0.8456120491027832,
 0.8457241654396057,
 0.846551775932312,
 0.847245454788208,
 0.847632884979248,
 0.8478696942329407,
 0.8479171395301819,
 0.8479233980178833,
 0.8480923175811768,
 0.8487520217895508,
 0.8489513397216797,
 

In [44]:
cosine_similarities_with_random_shuffling[450]

0.8068151473999023

In [45]:
cosine_similarities[50]

0.8496123552322388

In [47]:
cosine_similarities_with_random_masking = [x.item() for x in cosine_similarities_with_random_masking]
cosine_similarities_with_random_masking.sort()
cosine_similarities_with_random_masking

[0.4526435434818268,
 0.5575576424598694,
 0.602944552898407,
 0.6072644591331482,
 0.6228623390197754,
 0.626957893371582,
 0.6369772553443909,
 0.6422826051712036,
 0.6729357838630676,
 0.6762813329696655,
 0.6857693195343018,
 0.6875126957893372,
 0.6891958713531494,
 0.6954690217971802,
 0.6958649158477783,
 0.7001152634620667,
 0.7024365663528442,
 0.7062240839004517,
 0.7068288326263428,
 0.7129759192466736,
 0.7140562534332275,
 0.7153698205947876,
 0.7158776521682739,
 0.7169833183288574,
 0.7188401222229004,
 0.7192863821983337,
 0.720025897026062,
 0.7236398458480835,
 0.7294028401374817,
 0.729867696762085,
 0.734695315361023,
 0.7349694967269897,
 0.7357739210128784,
 0.7367528676986694,
 0.7374358177185059,
 0.7374869585037231,
 0.7378900647163391,
 0.7379388809204102,
 0.738787829875946,
 0.7391879558563232,
 0.7400631904602051,
 0.7401241660118103,
 0.7404762506484985,
 0.7405154705047607,
 0.7406343221664429,
 0.7413570880889893,
 0.7414391040802002,
 0.7417921423912048

In [50]:
cosine_similarities_with_random_masking[450]

0.852277398109436

In [51]:
shuffled_cosine_similarities = [x.item() for x in shuffled_cosine_similarities]
shuffled_cosine_similarities.sort()
shuffled_cosine_similarities

[-0.0046724118292331696,
 0.0006882473826408386,
 0.003133371938019991,
 0.014870870858430862,
 0.016709811985492706,
 0.018163159489631653,
 0.021043090149760246,
 0.0220728050917387,
 0.02256518043577671,
 0.024797651916742325,
 0.024928126484155655,
 0.02961910516023636,
 0.029928024858236313,
 0.030505483970046043,
 0.03128442540764809,
 0.040265560150146484,
 0.04036771506071091,
 0.045471761375665665,
 0.0462975949048996,
 0.04647079110145569,
 0.048790544271469116,
 0.04959313943982124,
 0.04959947243332863,
 0.05030694976449013,
 0.0520712248980999,
 0.05359286814928055,
 0.05495494604110718,
 0.056084491312503815,
 0.05614056810736656,
 0.05678083747625351,
 0.0575520396232605,
 0.05818532407283783,
 0.05874546617269516,
 0.05957729369401932,
 0.060245364904403687,
 0.06025329977273941,
 0.06046969071030617,
 0.06058681011199951,
 0.060592249035835266,
 0.060786403715610504,
 0.06163368746638298,
 0.0618315227329731,
 0.0624222531914711,
 0.06255381554365158,
 0.06328226625919

In [52]:
# 0.8 as 50/50
# def classify(x): return 1 / (1 + math.exp(-20*(x - 0.8)))
# 0.9 is 88% probability
# 0.45 is very close to 0% probability

In [61]:
import torch.nn as nn

class EmbeddingClassifier(nn.Module):
    def __init__(self, embedding_imitator, sigmoid_center, sigmoid_coefficient):
        super(EmbeddingClassifier, self).__init__()
        self.embedding_imitator = embedding_imitator
        self.sigmoid_center = sigmoid_center
        self.sigmoid_coefficient = sigmoid_coefficient
    def forward(self, input_ids, attention_masks, real_ids):
        estimated_embedding = self.embedding_imitator(input_ids, attention_masks)
        similarity = F.cosine_similarity(estimated_embedding, real_ids, axis = -1)
        return torch.sigmoid(self.sigmoid_coefficient * (similarity - self.sigmoid_center))

In [62]:
classifier = EmbeddingClassifier(ckpt_model, 0.8, 20)
classifier.to(device)

EmbeddingClassifier(
  (embedding_imitator): EmbeddingImitator(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 512, padding_idx=0)
        (position_embeddings): Embedding(512, 512)
        (token_type_embeddings): Embedding(2, 512)
        (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-7): 8 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=512, out_features=512, bias=True)
                (key): Linear(in_features=512, out_features=512, bias=True)
                (value): Linear(in_features=512, out_features=512, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=512, out_features=

In [59]:
d = torch.load('vec2text/encoder_train.pth')
dev_dataloader_for_classifying = create_dataloaders(d['input_ids'][:1000], d['attention_mask'][:1000], d['y'][:1000], 32)

In [63]:
classifier.eval()
total_prediction, num_steps = 0, 0
predictions = []
for step, batch in enumerate(dev_dataloader_for_classifying):  
    batch_inputs, batch_masks, batch_labels = \
                       tuple(b.to(device) for b in batch)
    with torch.no_grad():
        outputs = classifier(batch_inputs, batch_masks, batch_labels)
        outputs_list = outputs.tolist()
        num_steps += 1
        predictions += outputs_list
        total_prediction += sum(outputs_list)
# print(f"Average loss is {total_loss_with_random_shuffling / num_steps}, average cosine similarity is {total_cosine_similarity_with_random_shuffling / num_steps}")
classifier.train()
print("Ok")

Ok


In [67]:
total_prediction / len(predictions) # (num_steps * 32)

0.8749686605334281

In [70]:
classifier.eval()
total_prediction_with_masking, num_steps = 0, 0
predictions_with_masking = []
could_be_last_batch = True
for step, batch in enumerate(dev_dataloader_for_classifying):  
    batch_inputs, batch_masks, batch_labels = \
                       tuple(b.to(device) for b in batch)
    assert ((len(batch_masks) == 32) or could_be_last_batch)
    if len(batch_masks) < 32:
        could_be_last_batch = False
    for j in range(len(batch_masks)):
        for i in range(len(batch_masks[j])):
            x = random.randint(1, 100)
            if x <= 15:
                batch_masks[j][i] = 0
    with torch.no_grad():
        outputs = classifier(batch_inputs, batch_masks, batch_labels)
        outputs_list = outputs.tolist()
        num_steps += 1
        predictions_with_masking += outputs_list
        total_prediction_with_masking += sum(outputs_list)
# print(f"Average loss is {total_loss_with_random_shuffling / num_steps}, average cosine similarity is {total_cosine_similarity_with_random_shuffling / num_steps}")
classifier.train()
print("Ok")

Ok


In [72]:
total_prediction_with_masking / len(predictions_with_masking) # (num_steps * 32)

0.6944783725179732