In [2]:
!mkdir -p /scratch/sagarsj42/torch-cache
!mkdir -p /scratch/sagarsj42/transformers
import os
os.chdir('/scratch/sagarsj42')
os.environ['TORCH_HOME'] = '/scratch/sagarsj42/torch-cache'
os.environ['TRANSFORMERS_CACHE'] = '/scratch/sagarsj42/transformers'

In [18]:
import json
from functools import partial

import pandas as pd

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import torchmetrics

from transformers import XLMRobertaTokenizer, XLMRobertaModel

In [6]:
!scp sagarsj42@ada:/share1/sagarsj42/semeval8-train-sss.csv .
!scp sagarsj42@ada:/share1/sagarsj42/semeval8-dev.csv .
!scp sagarsj42@ada:/share1/sagarsj42/semeval-2022-task-8-eval-df.csv .
!scp sagarsj42@ada:/share1/sagarsj42/semeval8-train-ner.json .
!scp sagarsj42@ada:/share1/sagarsj42/semeval8-dev-ner.json .
!scp sagarsj42@ada:/share1/sagarsj42/semeval8-test-ner.json .

semeval8-train-sss.csv                        100%   25MB  24.5MB/s   00:01    
semeval8-dev.csv                              100% 2825KB   2.8MB/s   00:00    
semeval-2022-task-8-eval-df.csv               100%   27MB  27.4MB/s   00:00    
semeval8-train-ner.json                       100% 2714KB   2.7MB/s   00:00    
semeval8-dev-ner.json                         100%  312KB 312.5KB/s   00:00    
semeval8-test-ner.json                        100% 3393KB   3.3MB/s   00:00    


In [7]:
EXP_NAME = 'mlns-ner-title-xlmr-regressor'
TRAIN_BATCH_SIZE = 2
DEV_BATCH_SIZE = 8
ACCUMULATE_GRAD = 8

In [8]:
class SimilarityRegressor(nn.Module):
    def __init__(self, encoder, embed_size=768, hidden_size=256):
        super(SimilarityRegressor, self).__init__()

        self.encoder = encoder
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        
        self.linear1 = nn.Linear(self.embed_size, self.hidden_size)
        self.activation1 = nn.LeakyReLU(negative_slope=0.1)
        self.dropout1 = nn.Dropout(p=0.2)
        # self.linear2 = nn.Linear(2*self.hidden_size, self.hidden_size//3)
        # self.dropout2 = nn.Dropout(p=0.2)
        # self.activation2 = nn.LeakyReLU(negative_slope=0.1)
        self.linear3 = nn.Linear(3*self.hidden_size, 1)
        self.activation3 = nn.Sigmoid()

    def common_compute(self, x):
        x = self.encoder(**x).pooler_output
        x = self.linear1(x)
        x = self.activation1(x)
        x = self.dropout1(x)

        return x
    
    def forward(self, x1, x2):
        x1 = self.common_compute(x1)
        x2 = self.common_compute(x2)
        x = torch.cat([torch.abs(x1 - x2), (x1 + x2)], dim=-1)
        # x = self.linear2(x)
        # x = self.activation2(x)
        # x = self.dropout2(x)
        x = self.linear3(x)
        x = 3*self.activation3(x) + 1

        return x

In [9]:
class LitSimilarityRegressor(pl.LightningModule):
    def __init__(self, encoder, embed_size=768, hidden_size=256):
        super(LitSimilarityRegressor, self).__init__()
        self.model = SimilarityRegressor(encoder, embed_size=embed_size, hidden_size=hidden_size)

        self.train_loss = torchmetrics.MeanMetric(compute_on_step=True)
        self.dev_loss = torchmetrics.MeanMetric(compute_on_step=False)

        self.train_mape = torchmetrics.MeanAbsolutePercentageError(compute_on_step=True)
        self.dev_mape = torchmetrics.MeanAbsolutePercentageError(compute_on_step=False)

        self.train_pcc = torchmetrics.PearsonCorrCoef(compute_on_step=True)
        self.dev_pcc = torchmetrics.PearsonCorrCoef(compute_on_step=False)

    def forward(self, x1, x2):
        return self.model(x1, x2)

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=5e-6, betas=(0.9, 0.99), eps=1e-8, weight_decay=0.01)

    def training_step(self, batch, batch_idx):
        x1, x2, scores = batch
        output = self(x1, x2)
        loss = F.mse_loss(input=output, target=scores)

        return {'loss': loss, 'preds': output, 'target': scores}

    def validation_step(self, batch, batch_idx):
        x1, x2, scores = batch
        output = self(x1, x2)
        loss = F.mse_loss(input=output, target=scores)

        return {'loss': loss, 'preds': output, 'target': scores}

    def predict_step(self, batch, batch_idx):
        x1, x2, _ = batch
        output = self(x1, x2)

        return {'preds': output}

    def training_step_end(self, outs):
        loss = outs['loss']
        preds = outs['preds']
        target = outs['target']

        self.log('train/step/loss', self.train_loss(loss))
        self.log('train/step/mape', self.train_mape(preds, target))
        self.log('train/step/pcc', self.train_pcc(preds, target))

    def validation_step_end(self, outs):
        loss = outs['loss']
        preds = outs['preds']
        target = outs['target']

        self.dev_loss(loss)
        self.dev_mape(preds, target)
        self.dev_pcc(preds, target)

    def training_epoch_end(self, outs):
        self.log('train/epoch/loss', self.train_loss)
        self.log('train/epoch/mape', self.train_mape)
        self.log('train/epoch/pcc', self.train_pcc)

    def validation_epoch_end(self, outs):
        self.log('dev/loss', self.dev_loss)
        self.log('dev/mape', self.dev_mape)
        self.log('dev/pcc', self.dev_pcc)

In [12]:
class MultilingualNewsSimDataset(Dataset):
    def __init__(self, df, ner):
        super(MultilingualNewsSimDataset, self).__init__()
        self.df = df
        self.ner = ner

    def __getitem__(self, idx):
        pair_id = self.df.iloc[idx]['pair_id']
        ners = self.ner[pair_id]
        sample_dict =  self.df.iloc[idx][['pair_id', 'title_1', 'meta_description_1', 'meta_keywords_1', 'tags_1', 
                                          'title_2', 'meta_description_2', 'meta_keywords_2', 'tags_2', 
                                          'score']].to_dict()
        sample_dict['ner_1'] = ners[0]
        sample_dict['ner_2'] = ners[1]
        
        return sample_dict
        
    def __len__(self):
        return self.df.shape[0]

In [13]:
def collate_fn(batch, tokenizer):
    texts_1, texts_2, scores = list(), list(), list()
    for sample in batch:
        text1 = str(sample['ner_1']).lower().strip() + \
                str(sample['title_1']).lower().strip() + str(sample['meta_description_1']).lower().strip() + \
            ' '.join(sample['meta_keywords_1']).lower().strip() + ' '.join(sample['tags_1']).lower().strip()
        text2 = str(sample['ner_2']).lower().strip() + \
                str(sample['title_2']).lower().strip() + str(sample['meta_description_2']).lower().strip() + \
            ' '.join(sample['meta_keywords_2']).lower().strip() + ' '.join(sample['tags_2']).lower().strip()
        
        score = torch.tensor([sample['score']])
        texts_1.append(text1)
        texts_2.append(text2)
        scores.append(score)

    texts_1 = tokenizer(texts_1, truncation=True, padding=True, return_tensors='pt')
    texts_2 = tokenizer(texts_2, truncation=True, padding=True, return_tensors='pt')
    scores = torch.cat(scores, dim=0).unsqueeze(1)

    return texts_1, texts_2, scores

In [14]:
class MLNSDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, dev_dataset, test_dataset, train_batch_size, dev_batch_size, collate_fn, tokenizer):
        super(MLNSDataModule, self).__init__()
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset
        self.train_batch_size = train_batch_size
        self.dev_batch_size = dev_batch_size
        self.collate_fn = collate_fn
        self.tokenizer = tokenizer

    def train_dataloader(self):
        collate_partial = partial(self.collate_fn, tokenizer=self.tokenizer)
        return DataLoader(self.train_dataset, shuffle=True, batch_size=self.train_batch_size, collate_fn=collate_partial)

    def val_dataloader(self):
        collate_partial = partial(self.collate_fn, tokenizer=self.tokenizer)
        return DataLoader(self.dev_dataset, shuffle=False, batch_size=self.dev_batch_size, collate_fn=collate_partial)

    def test_dataloader(self):
        collate_partial = partial(self.collate_fn, tokenizer=self.tokenizer)
        return DataLoader(self.test_dataset, shuffle=False, batch_size=self.dev_batch_size, collate_fn=collate_partial)

    def predict_dataloader(self):
        return self.test_dataloader()

In [15]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
encoder = XLMRobertaModel.from_pretrained('xlm-roberta-base')
encoder

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


XLMRobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): 

In [16]:
train_df = pd.read_csv(os.path.join(os.getcwd(), 'semeval8-train-sss.csv'), index_col=0)
dev_df = pd.read_csv('semeval8-dev.csv', index_col=0)
test_df = pd.read_csv('semeval-2022-task-8-eval-df.csv', index_col=0)

train_df.shape, dev_df.shape, test_df.shape

((4467, 20), (497, 20), (4953, 20))

In [20]:
with open('semeval8-train-ner.json', 'r') as f:
    train_ner = json.load(f)
with open('semeval8-dev-ner.json', 'r') as f:
    dev_ner = json.load(f)
with open('semeval8-test-ner.json', 'r') as f:
    test_ner = json.load(f)
    
len(list(train_ner.keys())), len(list(dev_ner.keys())), len(list(test_ner.keys()))

(4430, 496, 4902)

In [22]:
train_dataset = MultilingualNewsSimDataset(train_df, train_ner)
dev_dataset = MultilingualNewsSimDataset(dev_df, dev_ner)
test_dataset = MultilingualNewsSimDataset(test_df, test_ner)

len(train_dataset), len(dev_dataset), len(test_dataset)

(4467, 497, 4953)

In [23]:
train_dataset[4]

{'pair_id': '1605429954_1591825952',
 'title_1': 'Parlament in Winterthur – Teure Ratsdebatten in der Eulachhalle',
 'meta_description_1': 'Das Coronavirus schickte den Grossen Gemeinderat für drei Monate in die Zwangspause. Die erste Sitzung nach dem Lockdown findet nun am 25. Mai in den Eulachhallen statt. Auch das Krisenmanagement der Stadt steht zur Debatte.',
 'meta_keywords_1': "['']",
 'tags_1': '[]',
 'title_2': 'Coronavirus and Your Supply Chain',
 'meta_description_2': nan,
 'meta_keywords_2': "['']",
 'tags_2': '[]',
 'score': 4.0,
 'ner_1': 'Geering Fast Februar 30 Parlament Monate Uhr Exekutive Bachmann Eine am 24 CVP Andreas drei April 25 erste Lopardo Winterthur dem Mai Delia Enzo Eulachhalle Bundesrat Woche letzte ersten Corona Eulachhallen',
 'ner_2': 'EDT Up America Brief 30 Bank PDT ISM Chain Policy University State 2020 9 Supply one NC Handfield Service Logistics April of Management 28th and Rob 12 AM Privacy Newsletter PM Institute the Terms Google 5'}

In [25]:
collate_partial = partial(collate_fn, tokenizer=tokenizer)
dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, collate_fn=collate_partial)
next(iter(dataloader))

({'input_ids': tensor([[     0,   2510,    702,  43606,   3017,  68321,    382, 168766,  13452,
               6, 155103,  13673,    136,   1544,    372,   6650,     83,   1577,
              40,   5974,    685,  89753,   1482,  17399,  76875,    111, 113742,
              19, 102097,   6077,     25,      7,  50960,   1543,    209,     70,
          136659,  65678,  10363,     53,   2445,    876,  65037,     70,  19896,
              14,   3378,  10176, 122084,  90695,      7,  68636,   1363,  12430,
             254,  12620,  19110,     53,    951,     53,   1286, 109270,     25,
               7,  59671,    387, 113742,    138,  10740,    714,   9199,   6602,
              10,  72428,  20930,  13696,  17262,  73715,    939,  19896,   1573,
              25,      7,     10,   4188,   9118, 127067,    106,    116,  10332,
           56692,  22473,   4745, 127067,     13,   4438,   4568,   5155,      6,
          153109,   6366,  11762, 143257,  25248,    208,  39719,   3413, 115252,
  

In [26]:
data_module = MLNSDataModule(train_dataset, dev_dataset, test_dataset, TRAIN_BATCH_SIZE, DEV_BATCH_SIZE, collate_fn=collate_fn, tokenizer=tokenizer)
data_module

<__main__.MLNSDataModule at 0x7fbfce07d310>

In [27]:
model = LitSimilarityRegressor(encoder, embed_size=768, hidden_size=256)
model

LitSimilarityRegressor(
  (model): SimilarityRegressor(
    (encoder): XLMRobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(i

In [28]:
logger = pl.loggers.WandbLogger(save_dir=EXP_NAME, project=EXP_NAME, log_model=False)
logger

<pytorch_lightning.loggers.wandb.WandbLogger at 0x7fbfbf4c1340>

In [29]:
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(EXP_NAME, 'lightning-checkpoints'),
    filename='{epoch}-{step}',
    monitor='dev/pcc',
    mode='max',
    save_top_k=5,
    verbose=True,
    save_last=True,
    save_weights_only=False,
    every_n_epochs=1
)

In [30]:
trainer = pl.Trainer(
    max_epochs=10,
    accumulate_grad_batches=ACCUMULATE_GRAD,
    accelerator='gpu',
    gpus=4,
    strategy='ddp',
    # overfit_batches=10,
    check_val_every_n_epoch=1, val_check_interval=0.25,
    log_every_n_steps=2, enable_progress_bar=True,
    gradient_clip_val=0.25, track_grad_norm=2,
    enable_checkpointing=True,
    callbacks=[checkpoint_callback],
    logger=logger,
    enable_model_summary=True
)

trainer

MisconfigurationException: `Trainer(strategy='ddp')` or `Trainer(accelerator='ddp')` is not compatible with an interactive environment. Run your code as a script, or choose one of the compatible backends: dp, ddp_spawn, ddp_sharded_spawn, tpu_spawn. In case you are spawning processes yourself, make sure to include the Trainer creation inside the worker function.

In [31]:
trainer.fit(model, datamodule=data_module)

NameError: name 'trainer' is not defined

In [32]:
test_pred = trainer.predict(datamodule=data_module)
len(test_pred)

NameError: name 'trainer' is not defined

In [33]:
all_outputs = list()
for batch_outputs in test_pred:
    all_outputs.append(batch_outputs['preds'])
all_outputs = torch.cat(all_outputs, dim=0)

all_outputs.shape

NameError: name 'test_pred' is not defined

In [34]:
pd.DataFrame(list(zip(test_df.pair_id, all_outputs.squeeze(1).tolist())), columns=['pair_id', 
                                                   'Overall']).to_csv(EXP_NAME + '-test-results.csv')

AttributeError: 'list' object has no attribute 'squeeze'