In [1]:
# Install the third-party packages imported by the cells below.
# NOTE(review): versions are unpinned, so a fresh run may pull newer releases
# than the ones that produced the saved outputs — consider pinning them.
!pip install transformers
!pip install sentencepiece
!pip install torchmetrics
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import json
from functools import partial

from transformers import XLMRobertaTokenizer, XLMRobertaModel
#from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
from torch.optim import AdamW
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

import torchmetrics
import pytorch_lightning as pl

In [3]:
# Load the pretrained tokenizer and encoder from the 'xlm-roberta-base' checkpoint.
# The two commented-out lines are an alternative DistilBERT setup; they need the
# (also commented-out) AutoTokenizer/AutoModel import to work.
#tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
#encoder = AutoModel.from_pretrained('distilbert-base-cased')
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
encoder = XLMRobertaModel.from_pretrained('xlm-roberta-base')
# Bare expression: displays the encoder's module tree as the cell output.
encoder

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


XLMRobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): 

In [4]:
# Samples per batch for the training dataloader.
TRAIN_BATCH_SIZE = 2
# Samples per batch for the validation/test/predict dataloaders.
DEV_BATCH_SIZE = 8
# Number of training batches whose gradients are accumulated before an optimizer
# step (passed to the Trainer as `accumulate_grad_batches`); with the values
# above that is 2 * 8 = 16 samples per step.
ACCUMULATE_GRAD = 8

# Model

In [5]:
class SimilarityRegressor(nn.Module):
    """Siamese regression head on top of a shared text encoder.

    Both inputs go through the same encoder and projection. The two projected
    vectors are combined as ``[|a - b| ; a + b]`` and mapped to one score that
    is squashed into the open interval (1, 4).

    Args:
        encoder: Callable invoked as ``encoder(**x)``; its result must expose
            a ``pooler_output`` attribute whose last dimension is ``embed_size``.
        embed_size: Width of the encoder's pooled output.
        hidden_size: Width of the per-document projection.
    """

    def __init__(self, encoder, embed_size=768, hidden_size=256):
        super().__init__()

        self.encoder = encoder
        self.embed_size = embed_size
        self.hidden_size = hidden_size

        # Projection applied independently to each pooled document embedding.
        self.linear1 = nn.Linear(embed_size, hidden_size)
        self.activation1 = nn.LeakyReLU(negative_slope=0.1)
        self.dropout1 = nn.Dropout(p=0.2)

        # Scoring layer; its input is twice as wide because two feature
        # vectors (difference and sum) are concatenated in forward().
        self.linear3 = nn.Linear(2 * hidden_size, 1)
        self.activation3 = nn.Sigmoid()

    def common_compute(self, x):
        """Encode one side of the pair and project it to ``hidden_size``.

        Args:
            x: Mapping of keyword arguments forwarded to the encoder.

        Returns:
            The projected, activated and dropout-regularised pooled embedding.
        """
        pooled = self.encoder(**x).pooler_output
        return self.dropout1(self.activation1(self.linear1(pooled)))

    def forward(self, x1, x2):
        """Score a pair of encoder inputs.

        Args:
            x1: Encoder keyword arguments for the first document(s).
            x2: Encoder keyword arguments for the second document(s).

        Returns:
            Tensor with a trailing dimension of 1 and values in (1, 4).
        """
        left = self.common_compute(x1)
        right = self.common_compute(x2)
        # Both features are symmetric in (left, right), so argument order
        # does not affect the score.
        pair_features = torch.cat((torch.abs(left - right), left + right), dim=-1)
        squashed = self.activation3(self.linear3(pair_features))
        # Rescale the sigmoid output from (0, 1) to (1, 4).
        return 3 * squashed + 1

In [6]:
class LitSimilarityRegressor(pl.LightningModule):
    """Lightning wrapper that trains a ``SimilarityRegressor`` with an MSE loss.

    Loss logging is done inside ``training_step`` / ``validation_step`` rather
    than in ``*_step_end`` hooks, because newer Lightning releases no longer
    provide those hooks. The validation loss is logged as an epoch-level mean
    under ``dev/loss``.

    Args:
        encoder: Text encoder handed to ``SimilarityRegressor``.
        embed_size: Width of the encoder's pooled output.
        hidden_size: Width of the regressor's projection layer.
    """

    def __init__(self, encoder, embed_size=768, hidden_size=256):
        super().__init__()
        self.model = SimilarityRegressor(encoder, embed_size=embed_size, hidden_size=hidden_size)

        # Running means of the per-batch losses. The legacy `compute_on_step`
        # keyword is intentionally not passed: recent torchmetrics releases
        # reject unknown constructor keywords.
        self.train_loss = torchmetrics.MeanMetric()
        self.dev_loss = torchmetrics.MeanMetric()

    def forward(self, x1, x2):
        """Return the similarity scores for a pair of tokenized inputs."""
        return self.model(x1, x2)

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters with default settings."""
        #return AdamW(self.parameters(), lr=5e-6, betas=(0.9, 0.99), eps=1e-8, weight_decay=0.01)
        return AdamW(self.parameters())

    def _loss_step(self, batch):
        """Run the model on one ``(x1, x2, scores)`` batch and compute the MSE loss."""
        x1, x2, scores = batch
        output = self(x1, x2)
        loss = F.mse_loss(input=output, target=scores)

        return {'loss': loss, 'preds': output, 'target': scores}

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log it per step.

        Returns:
            Dict with ``'loss'`` (used by Lightning for backprop), ``'preds'``
            and ``'target'``.
        """
        outs = self._loss_step(batch)
        # Calling the metric updates its running mean and returns the value
        # for this batch, which is what gets logged per step.
        self.log('train/step/loss', self.train_loss(outs['loss'].detach()))

        return outs

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch and accumulate it.

        Returns:
            Dict with ``'loss'``, ``'preds'`` and ``'target'``.
        """
        outs = self._loss_step(batch)
        # Accumulate only; logging the metric object lets Lightning compute
        # the epoch mean and reset the metric at the end of each epoch.
        self.dev_loss.update(outs['loss'])
        self.log('dev/loss', self.dev_loss, on_step=False, on_epoch=True)

        return outs

    def predict_step(self, batch, batch_idx):
        """Return the model predictions for one batch; the scores are ignored."""
        x1, x2, _ = batch
        output = self(x1, x2)

        return {'preds': output}

In [7]:
# Wrap the pretrained encoder in the Lightning module that will be trained.
model = LitSimilarityRegressor(encoder, embed_size=768, hidden_size=256)
# Bare expression: displays the full module tree as the cell output.
model

LitSimilarityRegressor(
  (model): SimilarityRegressor(
    (encoder): XLMRobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(i

# Data

In [8]:
# Mount Google Drive at /content/gdrive so the dataset file can be read.
# This import is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:
# Folder on the mounted Drive that holds the project files.
root_path = 'gdrive/My Drive/IIIT/Sem1/IRE/Major_project/'
# Read the training records. The context manager closes the file handle
# promptly, and the explicit encoding keeps non-ASCII article text from
# depending on the platform's default encoding.
with open(root_path + 'train_data.json', 'r', encoding='utf-8') as train_file:
    data = json.load(train_file)

In [10]:
# Show the first record to inspect the available fields.
data[0]

{'pair_id': '1484084337_1484110209',
 'title_1': 'Virginia man arrested in fatal DUI crash in West Virginia',
 'title_2': 'Haiti’s leader marks independence day amid security concerns',
 'text_1': 'MARTINSBURG, W.Va. — A suspected drunken driver was arrested in a New Year’s Day highway crash that killed another motorist in West Virginia, police said.\n\nThe accident occurred early Wednesday along southbound Interstate 81 in Berkeley County, The Journal reported.\n\nCody Wade Braithwaite, 32, of Winchester, Virginia, was charged with DUI with death, fleeing DUI, and being a prohibited person in possession of a firearm and a fugitive from justice, Berkeley County Sheriff Curtis Keller said.\n\nKeller said a deputy investigating calls about a reckless driver attempted to make a traffic stop, but the suspect’s vehicle sped up and hit a car, which then struck a guardrail. The driver of the car was pronounced dead at the scene. The driver’s name was not immediately released.\n\nThe suspect’s

In [11]:
# Small fixed subsets of the loaded records: the first 256 for training and
# the next 64 for testing/prediction. The rest of `data` is not used.
train_dataset = data[:256]
test_dataset = data[256:320]

In [12]:
def collate_fn(batch, tokenizer):
    """Turn a list of raw pair records into model-ready tensors.

    Only the article bodies (``text_1`` / ``text_2``) are used; each is cast
    to ``str``, lower-cased and stripped before tokenization. (An earlier
    variant concatenated NER, title and meta fields instead.)

    Args:
        batch: Sequence of dict records, each with ``'text_1'``, ``'text_2'``
            and ``'score'`` keys.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, return_tensors='pt')``.

    Returns:
        Tuple ``(texts_1, texts_2, scores)``: the two tokenizer outputs and a
        float32 tensor of shape ``(len(batch), 1)`` holding the target scores.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    texts_1 = [str(sample['text_1']).lower().strip() for sample in batch]
    texts_2 = [str(sample['text_2']).lower().strip() for sample in batch]

    # Force float32 so integer-valued labels cannot produce an integer target
    # tensor, which would not match the model's float predictions in the loss.
    scores = torch.tensor([sample['score'] for sample in batch], dtype=torch.float32).unsqueeze(1)

    texts_1 = tokenizer(texts_1, truncation=True, padding=True, return_tensors='pt')
    texts_2 = tokenizer(texts_2, truncation=True, padding=True, return_tensors='pt')

    return texts_1, texts_2, scores

In [13]:
# Smoke test of the collate function: bind the tokenizer, build a throwaway
# loader and display one collated batch.
# Note: this preview draws from `test_dataset` using the training batch size.
collate_partial = partial(collate_fn, tokenizer=tokenizer)
dataloader = DataLoader(test_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, collate_fn=collate_partial)
next(iter(dataloader))

({'input_ids': tensor([[     0,      6,   2980,  ..., 101120,      7,      2],
         [     0,     15, 147797,  ...,    111,     70,      2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]])},
 {'input_ids': tensor([[     0,   5646,     10,  ...,    206,    456,      2],
         [     0,   2684, 103036,  ...,   7440,  44951,      2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]])},
 tensor([[4.],
         [3.]]))

In [14]:
class MLNSDataModule(pl.LightningDataModule):
    """Data module serving the train, dev and test splits as dataloaders.

    Args:
        train_dataset: Records used by ``train_dataloader``.
        dev_dataset: Records used by ``val_dataloader``.
        test_dataset: Records used by ``test_dataloader`` and ``predict_dataloader``.
        train_batch_size: Batch size of the training loader.
        dev_batch_size: Batch size of the validation, test and predict loaders.
        collate_fn: Callable taking ``(batch, tokenizer=...)``.
        tokenizer: Tokenizer bound to ``collate_fn`` for every loader.
    """

    def __init__(self, train_dataset, dev_dataset, test_dataset, train_batch_size, dev_batch_size, collate_fn, tokenizer):
        super().__init__()
        self.train_dataset, self.dev_dataset, self.test_dataset = train_dataset, dev_dataset, test_dataset
        self.train_batch_size, self.dev_batch_size = train_batch_size, dev_batch_size
        self.collate_fn = collate_fn
        self.tokenizer = tokenizer

    def _make_loader(self, dataset, batch_size, shuffle):
        """Build a DataLoader whose collate function has the tokenizer bound."""
        bound_collate = partial(self.collate_fn, tokenizer=self.tokenizer)
        return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size, collate_fn=bound_collate)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._make_loader(self.train_dataset, self.train_batch_size, shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the dev split."""
        return self._make_loader(self.dev_dataset, self.dev_batch_size, shuffle=False)

    def test_dataloader(self):
        """Ordered loader over the test split."""
        return self._make_loader(self.test_dataset, self.dev_batch_size, shuffle=False)

    def predict_dataloader(self):
        """Prediction reuses the test loader unchanged."""
        return self.test_dataloader()

In [15]:
# Build the data module from the splits and batch sizes defined above.
# NOTE(review): the training split is passed as the dev split as well (the
# commented-out call expects a separate `dev_dataset`, which is not defined in
# this notebook), so validation runs on the training data — confirm this is intended.
#data_module = MLNSDataModule(train_dataset, dev_dataset, test_dataset, TRAIN_BATCH_SIZE, DEV_BATCH_SIZE, collate_fn=collate_fn, tokenizer=tokenizer)
data_module = MLNSDataModule(train_dataset, train_dataset, test_dataset, TRAIN_BATCH_SIZE, DEV_BATCH_SIZE, collate_fn=collate_fn, tokenizer=tokenizer)
data_module

<__main__.MLNSDataModule at 0x7fa83ac6ef90>

# Training

In [16]:
# Configure training: two epochs, gradients accumulated over ACCUMULATE_GRAD
# batches per optimizer step, on a GPU.
# NOTE(review): accelerator='gpu' is hard-coded, so a GPU runtime is required.
trainer = pl.Trainer(max_epochs=2,
                     accumulate_grad_batches=ACCUMULATE_GRAD,
                     accelerator='gpu')
                    

trainer

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


<pytorch_lightning.trainer.trainer.Trainer at 0x7fa83b37af10>

In [17]:
# Train the model; the data module supplies the train and validation loaders.
trainer.fit(model, datamodule=data_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type                | Params
---------------------------------------------------
0 | model      | SimilarityRegressor | 278 M 
1 | train_loss | MeanMetric          | 0     
2 | dev_loss   | MeanMetric          | 0     
---------------------------------------------------
278 M     Trainable params
0         Non-trainable params
278 M     Total params
1,112.964 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


In [18]:
# Run inference over the data module's predict loader (the test split).
# No model is passed explicitly; per the saved log output the trainer restores
# weights from its own checkpoint before predicting.
test_pred = trainer.predict(datamodule=data_module)
# One entry per predicted batch.
len(test_pred)

  + f" You can pass `.{fn}(ckpt_path='best')` to use the best model or"
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_2/checkpoints/epoch=1-step=32.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from checkpoint at /content/lightning_logs/version_2/checkpoints/epoch=1-step=32.ckpt


Predicting: 128it [00:00, ?it/s]

8

In [19]:
# Stitch the per-batch prediction tensors into one tensor along the batch axis.
all_outputs = torch.cat([batch_outputs['preds'] for batch_outputs in test_pred], dim=0)

all_outputs.shape

torch.Size([64, 1])