In [1]:
import datasets

In [2]:
# Load the training split of the SNLI dataset.
snli = datasets.load_dataset('snli',split="train")

In [3]:
snli

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 550152
})

In [4]:
# Load the training split of the MNLI task from the GLUE benchmark.
mnli = datasets.load_dataset("glue","mnli",split="train")

In [5]:
mnli

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 392702
})

In [6]:
# Drop the extra 'idx' column so MNLI has the same columns as SNLI
# (premise, hypothesis, label).
mnli = mnli.remove_columns(["idx"])

In [7]:
mnli

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 392702
})

In [8]:
# Cast SNLI to MNLI's feature schema before the two are concatenated below.
# NOTE(review): presumably needed because the feature definitions differ
# between the two datasets -- confirm.
snli = snli.cast(mnli.features)

In [9]:
# Combine both corpora into one training set: all SNLI rows first, then MNLI.
dataset = datasets.concatenate_datasets([snli,mnli])


In [10]:
# # Shuffle the dataset
# dataset = dataset.shuffle(seed=42)

# # Calculate the number of rows to keep (half of the original dataset)
# num_rows_to_keep = len(dataset) // 2

# # Select the first half of the shuffled dataset
# dataset = dataset.select(list(range(num_rows_to_keep)))

# dataset

In [11]:
# The individual datasets are no longer needed once `dataset` exists;
# drop the names so they are not kept alive by the kernel namespace.
del snli, mnli

In [12]:
from transformers import BertTokenizer,BertModel
# Pretrained uncased BERT-base: `tokenizer` is used below to tokenise the
# premise/hypothesis columns, and `model` is the encoder fine-tuned in the
# training loop.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [13]:
# Columns that will later be exposed as tensors; the label is always included.
all_cols = ['label']
# Discard every row whose label is -1.
dataset = dataset.filter(lambda x: x['label'] != -1)
print(len(dataset))
for part in ["premise","hypothesis"]:
    # Tokenise this sentence column, padding/truncating to exactly 128 tokens.
    dataset = dataset.map(
        lambda x: tokenizer(
            x[part],
            truncation=True,
            padding="max_length",
            max_length=128,
        ),
        batched=True,
    )
    # The tokenizer always emits the same column names, so prefix them with the
    # sentence they belong to before the next pass overwrites them.
    for col in ["input_ids","attention_mask"]:
        prefixed = f"{part}_{col}"
        dataset = dataset.rename_column(col, prefixed)
        all_cols.append(prefixed)
print(all_cols)

942069
['label', 'premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask']


In [14]:
# Return the columns listed in all_cols as torch tensors when the dataset is
# indexed (this is what the DataLoader below consumes).
dataset.set_format(type="torch",columns = all_cols)

In [15]:
import torch
batch_size = 16
# shuffle=True is required here: `dataset` is SNLI followed by MNLI (plain
# concatenation, never shuffled), so without it training would see every SNLI
# batch before the first MNLI batch instead of a mix of both corpora.
loader = torch.utils.data.DataLoader(dataset,batch_size=batch_size,shuffle=True)

In [10]:
def mean_pool(token_embeds,attention_mask):
    """Average token embeddings over dim 1, counting only unmasked positions.

    Args:
        token_embeds: tensor of per-token embeddings
            (assumes (batch, seq_len, hidden) -- TODO confirm against caller).
        attention_mask: tensor with one entry per token, non-zero where the
            token should contribute; it is broadcast over the last dimension
            of `token_embeds`.

    Returns:
        Tensor with dim 1 reduced away: the masked sum of the embeddings
        divided by the number of unmasked tokens. The divisor is clamped to
        at least 1e-9, so a fully masked row yields zeros instead of NaN.
    """
    # Stretch the mask to the embeddings' shape so it can be applied per value.
    mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    summed = (token_embeds * mask).sum(dim=1)
    # Guard against division by zero for rows with no unmasked tokens.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

In [17]:
# Prefer the GPU when torch can see one, otherwise fall back to the CPU,
# then place the encoder on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"moved to {device}")

moved to cuda


In [18]:
#define the layers to be used in classification
ffnn = torch.nn.Linear(768*3,3)
loss_func = torch.nn.CrossEntropyLoss()
#Move layers to device
ffnn.to(device)
loss_func.to(device)

CrossEntropyLoss()

In [19]:
# Enable cuDNN's benchmark mode. Every input is padded to the same length
# (max_length=128 in the tokenisation cell), so batch shapes are constant
# apart from a possibly smaller final batch.
torch.backends.cudnn.benchmark = True

In [20]:
from transformers.optimization import get_linear_schedule_with_warmup
from transformers import AdamW

#Initialize Adam optimizer
# Optimise the BERT encoder AND the classification head. Passing only
# model.parameters() would leave ffnn at its random initialisation for the
# whole run, because no optimizer would ever update it.
optim = AdamW(
    list(model.parameters()) + list(ffnn.parameters()), lr=1e-5
)

epochs = 1
# One optimizer/scheduler step is taken per batch, so the schedule length is
# (batches per epoch) * epochs. len(loader) also counts the final partial batch.
total_steps = len(loader) * epochs
#setup warmup for the first ~10% of steps
warmup_steps = int(0.1 * total_steps)
# num_training_steps is the length of the WHOLE schedule, warmup included.
# Subtracting warmup_steps would drive the learning rate to zero early and
# leave the last ~10% of batches training with lr == 0.
scheduler = get_linear_schedule_with_warmup(
    optim,num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
    )



In [21]:
from tqdm import tqdm

# Train for the configured number of epochs. `epochs` is the same value the
# learning-rate schedule was sized from, so the loop and the schedule always
# agree (1 epoch should be enough, increase `epochs` if wanted).
for epoch in range(epochs):
    model.train()  # make sure model is in training mode
    # initialize the dataloader loop with tqdm (tqdm == progress bar);
    # the description is constant within an epoch, so set it once here
    loop = tqdm(loader, leave=True, desc=f'Epoch {epoch}')
    for batch in loop:
        # zero all gradients on each new step
        optim.zero_grad()
        # prepare batches and move all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['label'].to(device)
        # extract token embeddings from BERT
        u = model(
            inputs_ids_a, attention_mask=attention_a
        )[0]  # all token embeddings A
        v = model(
            inputs_ids_b, attention_mask=attention_b
        )[0]  # all token embeddings B
        # get the mean pooled vectors
        u = mean_pool(u, attention_a)
        v = mean_pool(v, attention_b)
        # build the |u-v| tensor
        uv_abs = torch.abs(torch.sub(u, v))
        # concatenate u, v, |u-v|
        x = torch.cat([u, v, uv_abs], dim=-1)
        # process concatenated tensor through FFNN
        x = ffnn(x)
        # calculate the 'softmax-loss' between predicted and true label
        loss = loss_func(x, label)
        # using loss, calculate gradients and then optimize
        loss.backward()
        optim.step()
        # update learning rate scheduler
        scheduler.step()
        # update the tqdm progress bar with the current loss
        loop.set_postfix(loss=loss.item())

Epoch 0:   0%|                                                        | 22/58880 [01:27<64:42:36,  3.96s/it, loss=1.09]


KeyboardInterrupt: 

In [None]:
import os
model_path = './sbert_test_c'

# Create the output directory if needed. makedirs(exist_ok=True) replaces the
# racy "check, then mkdir" pattern and also works if model_path is ever
# changed to a nested path.
os.makedirs(model_path, exist_ok=True)
# Persist the fine-tuned encoder weights and config to model_path.
model.save_pretrained(model_path)

In [6]:
import torch
from transformers import BertModel,BertTokenizer
# Only choose the device here. The model is loaded from disk and moved to the
# device in the next cell; calling model.to(device) at this point would raise
# NameError on a fresh kernel, because `model` does not exist yet.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"using device: {device}")

moved to cuda


In [7]:
# Reload the encoder that was saved to ./sbert_test_c and place it on the
# selected device; the trailing expression makes the notebook display the
# module structure.
model = BertModel.from_pretrained("./sbert_test_c")
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [8]:
# model.save_to_hub(
#     "distilroberta-base-sentence-transformer", 
#     organization= "Shayaan69",
#     train_datasets=["embedding-data/QQP_triplets"],
#     )