In [1]:
!mkdir -p /scratch/sagarsj42/torch-cache
!mkdir -p /scratch/sagarsj42/transformers
import os
os.chdir('/scratch/sagarsj42')
os.environ['TORCH_HOME'] = '/scratch/sagarsj42/torch-cache'
os.environ['TRANSFORMERS_CACHE'] = '/scratch/sagarsj42/transformers'

In [2]:
from functools import partial

import pandas as pd

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import torchmetrics

from transformers import XLMRobertaTokenizer, XLMRobertaModel

In [3]:
!scp sagarsj42@ada:/share1/sagarsj42/semeval-2022-task-8-train-df.csv .
!scp sagarsj42@ada:/share1/sagarsj42/semeval-2022-task-8-eval-df.csv .

semeval-2022-task-8-train-df.csv              100%   27MB  27.3MB/s   00:00    
semeval-2022-task-8-eval-df.csv               100%   27MB  27.4MB/s   00:00    


In [4]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
N_GPUS = torch.cuda.device_count()

DEVICE, N_GPUS

(device(type='cuda'), 1)

In [5]:
TRAIN_BATCH_SIZE = 2
DEV_BATCH_SIZE = 8

In [6]:
class SimilarityClassifier(nn.Module):
    def __init__(self, encoder, embed_size=768, hidden_size=512, n_classes=4):
        super(SimilarityClassifier, self).__init__()

        self.encoder = encoder
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        
        self.linear1 = nn.Linear(self.embed_size, self.hidden_size)
        self.activation1 = nn.LeakyReLU(negative_slope=0.1)
        self.linear2 = nn.Linear(self.hidden_size, self.n_classes)
        self.activation2 = nn.Softmax(dim=-1)

    def common_compute(self, x):
        x = self.encoder(**x).pooler_output
        x = self.linear1(x)
        x = self.activation1(x)

        return x
    
    def forward(self, x1, x2):
        x1 = self.common_compute(x1)
        x2 = self.common_compute(x2)
        x = torch.abs(x1 - x2)
        x = self.linear2(x)
        x = self.activation2(x)

        return x

In [7]:
class MultilingualNewsSimDataset(Dataset):
    def __init__(self, df):
        super(MultilingualNewsSimDataset, self).__init__()
        self.df = df

    def __getitem__(self, idx):
        return self.df.iloc[idx][['pair_id', 'text_1', 'text_2', 'score']].to_dict()
        
    def __len__(self):
        return self.df.shape[0]

In [8]:
def collate_fn(batch, tokenizer, num_classes=4):
    texts_1, texts_2, scores = list(), list(), list()
    for sample in batch:
        text1 = str(sample['text_1']).lower().strip()
        text2 = str(sample['text_2']).lower().strip()
        
        score_indx = round(sample['score']) - 1
        score = torch.zeros((1, num_classes))
        score[0, score_indx] = 1
        
        texts_1.append(text1)
        texts_2.append(text2)
        scores.append(score)

    texts_1 = tokenizer(texts_1, truncation=True, padding=True, return_tensors='pt')
    texts_2 = tokenizer(texts_2, truncation=True, padding=True, return_tensors='pt')
    scores = torch.cat(scores, dim=0)

    return texts_1, texts_2, scores

In [9]:
class MLNSDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, dev_dataset, test_dataset, train_batch_size, dev_batch_size, collate_fn, tokenizer):
        super(MLNSDataModule, self).__init__()
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset
        self.train_batch_size = train_batch_size
        self.dev_batch_size = dev_batch_size
        self.collate_fn = collate_fn
        self.tokenizer = tokenizer

    def train_dataloader(self):
        collate_partial = partial(self.collate_fn, tokenizer=self.tokenizer, num_classes=4)
        return DataLoader(self.train_dataset, shuffle=True, batch_size=self.train_batch_size, collate_fn=collate_partial)

    def val_dataloader(self):
        collate_partial = partial(self.collate_fn, tokenizer=self.tokenizer, num_classes=4)
        return DataLoader(self.dev_dataset, shuffle=False, batch_size=self.dev_batch_size, collate_fn=collate_partial)

    def test_dataloader(self):
        collate_partial = partial(self.collate_fn, tokenizer=self.tokenizer, num_classes=4)
        return DataLoader(self.test_dataset, shuffle=False, batch_size=self.dev_batch_size, collate_fn=collate_partial)

    def predict_dataloader(self):
        return self.test_dataloader()

In [10]:
class LitSimilarityClassifier(pl.LightningModule):
    def __init__(self, encoder, embed_size=768, hidden_size=512, n_classes=4):
        super(LitSimilarityClassifier, self).__init__()
        self.model = SimilarityClassifier(encoder, embed_size=embed_size, hidden_size=hidden_size, n_classes=n_classes)

        self.train_loss = torchmetrics.MeanMetric(compute_on_step=True)
        self.dev_loss = torchmetrics.MeanMetric(compute_on_step=False)

        self.train_acc = torchmetrics.Accuracy(num_classes=self.model.n_classes, threshold=0.5, average='micro', compute_on_step=True)
        self.dev_acc = torchmetrics.Accuracy(num_classes=self.model.n_classes, threshold=0.5, average='micro', compute_on_step=False)

    def forward(self, x1, x2):
        return self.model(x1, x2)

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=1e-5, betas=(0.9, 0.99), eps=1e-8, weight_decay=0.01)

    def training_step(self, batch, batch_idx):
        x1, x2, scores = batch
        output = self(x1, x2)
        loss = F.cross_entropy(input=output, target=scores)

        return {'loss': loss, 'preds': output, 'target': scores.long()}

    def validation_step(self, batch, batch_idx):
        x1, x2, scores = batch
        output = self(x1, x2)
        loss = F.cross_entropy(input=output, target=scores)

        return {'loss': loss, 'preds': output, 'target': scores.long()}

    def predict_step(self, batch, batch_idx):
        x1, x2, _ = batch
        output = self(x1, x2)

        return {'preds': output}

    def training_step_end(self, outs):
        loss = outs['loss']
        preds = outs['preds']
        target = outs['target']

        self.log('train/step/loss', self.train_loss(loss))
        self.log('train/step/acc', self.train_acc(preds, target))

    def validation_step_end(self, outs):
        loss = outs['loss']
        preds = outs['preds']
        target = outs['target']

        self.dev_loss(loss)
        self.dev_acc(preds, target)

    def training_epoch_end(self, outs):
        self.log('train/epoch/loss', self.train_loss)
        self.log('train/epoch/acc', self.train_acc)

    def validation_epoch_end(self, outs):
        self.log('dev/loss', self.dev_loss)
        self.log('dev/acc', self.dev_acc)

In [11]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
encoder = XLMRobertaModel.from_pretrained('xlm-roberta-base')
encoder

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


XLMRobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): 

In [12]:
train_df = pd.read_csv(os.path.join(os.getcwd(), 'semeval8-train-sss.csv'), index_col=0)
dev_df = pd.read_csv('semeval8-dev.csv', index_col=0)
test_df = pd.read_csv('semeval-2022-task-8-eval-df.csv', index_col=0)

train_df.shape, dev_df.shape, test_df.shape

((4467, 20), (497, 20), (4953, 20))

In [13]:
train_dataset = MultilingualNewsSimDataset(train_df)
dev_dataset = MultilingualNewsSimDataset(dev_df)
test_dataset = MultilingualNewsSimDataset(test_df)

len(train_dataset), len(dev_dataset), len(test_dataset)

(4467, 497, 4953)

In [14]:
dl = DataLoader(train_dataset, batch_size=2, collate_fn=partial(collate_fn, tokenizer=tokenizer, num_classes=4))
x1, x2, y = next(iter(dl))
x1.input_ids.shape, x1.attention_mask.shape, x2.input_ids.shape, x2.attention_mask.shape, y.shape

(torch.Size([2, 512]),
 torch.Size([2, 512]),
 torch.Size([2, 512]),
 torch.Size([2, 512]),
 torch.Size([2, 4]))

In [15]:
data_module = MLNSDataModule(train_dataset, dev_dataset, test_dataset, TRAIN_BATCH_SIZE, DEV_BATCH_SIZE, collate_fn=collate_fn, tokenizer=tokenizer)
data_module

<__main__.MLNSDataModule at 0x7fafeb25f9a0>

In [16]:
model = LitSimilarityClassifier(encoder, embed_size=768, hidden_size=512, n_classes=4)
model

LitSimilarityClassifier(
  (model): SimilarityClassifier(
    (encoder): XLMRobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear

In [17]:
exp_name = 'mlns-classifier-2'
logger = pl.loggers.WandbLogger(save_dir=exp_name, project=exp_name, log_model=False)

In [18]:
checkpoint_callback = ModelCheckpoint(
    dirpath=exp_name,
    filename='{epoch}-{step}',
    monitor='dev/acc',
    mode='max',
    # save_top_k=1,
    verbose=True,
    save_last=True,
    save_weights_only=False,
    every_n_epochs=1
)

In [19]:
trainer = pl.Trainer(
    max_epochs=5,
    accumulate_grad_batches=8,
    accelerator='gpu',
    gpus=N_GPUS,
    # overfit_batches=10,
    check_val_every_n_epoch=1, val_check_interval=0.25,
    log_every_n_steps=2, enable_progress_bar=True,
    gradient_clip_val=0.25, track_grad_norm=2,
    enable_checkpointing=True,
    callbacks=[checkpoint_callback],
    logger=logger,
    enable_model_summary=True
)

trainer

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


<pytorch_lightning.trainer.trainer.Trainer at 0x7fafeb26f4c0>

In [20]:
trainer.fit(model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                 | Params
----------------------------------------------------
0 | model      | SimilarityClassifier | 278 M 
1 | train_loss | MeanMetric           | 0     
2 | dev_loss   | MeanMetric           | 0     
3 | train_acc  | Accuracy             | 0     
4 | dev_acc    | Accuracy             | 0     
----------------------------------------------------
278 M     Trainable params
0         Non-trainable params
278 M     Total params
1,113.758 Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_deprecation(


Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 69: dev/acc reached 0.75000 (best 0.75000), saving model to "/scratch/sagarsj42/mlns-classifier-2/epoch=0-step=69.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 139: dev/acc reached 0.75201 (best 0.75201), saving model to "/scratch/sagarsj42/mlns-classifier-2/epoch=0-step=139.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 209: dev/acc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 278: dev/acc reached 0.75503 (best 0.75503), saving model to "/scratch/sagarsj42/mlns-classifier-2/epoch=0-step=278.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 349: dev/acc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 419: dev/acc reached 0.75905 (best 0.75905), saving model to "/scratch/sagarsj42/mlns-classifier-2/epoch=1-step=419.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 489: dev/acc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 558: dev/acc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 629: dev/acc was not in top 1
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [22]:
pred_output = trainer.predict(model, datamodule=data_module, ckpt_path=os.path.join(exp_name, 'epoch=1-step=419.ckpt'))
len(pred_output)

  rank_zero_deprecation(
Restoring states from the checkpoint path at mlns-classifier-2/epoch=1-step=419.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at mlns-classifier-2/epoch=1-step=419.ckpt
  rank_zero_warn(


Predicting: 754it [00:00, ?it/s]

620

In [23]:
all_outputs = list()
for batch_outputs in pred_output:
    all_outputs.append(batch_outputs['preds'])
all_outputs = torch.cat(all_outputs, dim=0)

all_outputs.shape

torch.Size([4953, 4])

In [24]:
for p in all_outputs.argmax(dim=1):
    print(p)

tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(3)
tensor(2)
tensor(3)
tensor(3)
tensor(2)


In [35]:
pd.DataFrame(list(zip(test_df.pair_id, (all_outputs.argmax(1) + 1).tolist())), columns=['pair_id', 'Overall']).to_csv('test-results.csv')