In [1]:
!mkdir -p /scratch/dhaval.taunk/torch-cache
!mkdir -p /scratch/dhaval.taunk/transformers
import os
os.chdir('/scratch/dhaval.taunk')
os.environ['TORCH_HOME'] = '/scratch/dhaval.taunk/torch-cache'
os.environ['TRANSFORMERS_CACHE'] = '/scratch/dhaval.taunk/transformers'

In [2]:
from functools import partial

import pandas as pd

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import torchmetrics

from transformers import XLMRobertaTokenizer, XLMRobertaModel
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel

In [3]:
# !echo 'yes' | scp dhaval.taunk@ada:/share1/dhaval.taunk/semeval8-train-sss.csv .
# !echo 'yes' | scp dhaval.taunk@ada:/share1/dhaval.taunk/semeval8-dev.csv .

In [4]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
N_GPUS = torch.cuda.device_count()

DEVICE, N_GPUS

(device(type='cuda'), 1)

In [5]:
EXP_NAME = 'mlns-distilbert-regressor_concat_3_not_linear'
TRAIN_BATCH_SIZE = 4
DEV_BATCH_SIZE = 8
ACCUMULATE_GRAD = 8

In [6]:
class SimilarityRegressor(nn.Module):
    def __init__(self, encoder, embed_size=768, hidden_size=256):
        super(SimilarityRegressor, self).__init__()

        self.encoder = encoder
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        
        self.linear1 = nn.Linear(self.embed_size, self.hidden_size)
        self.activation1 = nn.LeakyReLU(negative_slope=0.1)
        self.dropout1 = nn.Dropout(p=0.2)
        self.linear2 = nn.Linear(3*self.hidden_size, self.hidden_size//3)
        self.dropout2 = nn.Dropout(p=0.2)
        self.activation2 = nn.ReLU()
        self.linear3 = nn.Linear(3*self.hidden_size, 1)
        self.activation3 = nn.Sigmoid()

    def common_compute(self, x):
        # print(self.encoder(**x)[0].shape)
        x = self.encoder(**x)[0][:, 0]
        x = self.linear1(x)
        x = self.activation1(x)
        x = self.dropout1(x)

        return x
    
    def forward(self, x1, x2):
        x1 = self.common_compute(x1)
        x2 = self.common_compute(x2)
        x = torch.cat([x1, x2, torch.abs(x1 - x2)], dim=-1)
        # x = self.linear2(x)
        # x = self.activation2(x)
        # x = self.dropout2(x)
        x = self.linear3(x)
        x = 3*self.activation3(x) + 1

        return x

In [7]:
class LitSimilarityRegressor(pl.LightningModule):
    def __init__(self, encoder, embed_size=768, hidden_size=256):
        super(LitSimilarityRegressor, self).__init__()
        self.model = SimilarityRegressor(encoder, embed_size=embed_size, hidden_size=hidden_size)

        self.train_loss = torchmetrics.MeanMetric(compute_on_step=True)
        self.dev_loss = torchmetrics.MeanMetric(compute_on_step=False)

        self.train_mape = torchmetrics.MeanAbsolutePercentageError(compute_on_step=True)
        self.dev_mape = torchmetrics.MeanAbsolutePercentageError(compute_on_step=False)

        self.train_pcc = torchmetrics.PearsonCorrCoef(compute_on_step=True)
        self.dev_pcc = torchmetrics.PearsonCorrCoef(compute_on_step=False)

    def forward(self, x1, x2):
        return self.model(x1, x2)

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=1e-5, betas=(0.9, 0.99), eps=1e-8, weight_decay=0.01)

    def training_step(self, batch, batch_idx):
        x1, x2, scores = batch
        output = self(x1, x2)
        loss = F.mse_loss(input=output, target=scores)

        return {'loss': loss, 'preds': output, 'target': scores}

    def validation_step(self, batch, batch_idx):
        x1, x2, scores = batch
        output = self(x1, x2)
        loss = F.mse_loss(input=output, target=scores)

        return {'loss': loss, 'preds': output, 'target': scores}

    def predict_step(self, batch, batch_idx):
        x1, x2, _ = batch
        output = self(x1, x2)

        return {'preds': output}

    def training_step_end(self, outs):
        loss = outs['loss']
        preds = outs['preds']
        target = outs['target']

        self.log('train/step/loss', self.train_loss(loss))
        self.log('train/step/mape', self.train_mape(preds, target))
        self.log('train/step/pcc', self.train_pcc(preds, target))

    def validation_step_end(self, outs):
        loss = outs['loss']
        preds = outs['preds']
        target = outs['target']

        self.dev_loss(loss)
        self.dev_mape(preds, target)
        self.dev_pcc(preds, target)

    def training_epoch_end(self, outs):
        self.log('train/epoch/loss', self.train_loss)
        self.log('train/epoch/mape', self.train_mape)
        self.log('train/epoch/pcc', self.train_pcc)

    def validation_epoch_end(self, outs):
        self.log('dev/loss', self.dev_loss)
        self.log('dev/mape', self.dev_mape)
        self.log('dev/pcc', self.dev_pcc)

In [8]:
class MultilingualNewsSimDataset(Dataset):
    def __init__(self, df):
        super(MultilingualNewsSimDataset, self).__init__()
        self.df = df

    def __getitem__(self, idx):
        return self.df.iloc[idx][['pair_id', 'meta_description_1', 'title_1', 'text_1', 'meta_description_2', 'title_2', 'text_2', 'score']].to_dict()
        
    def __len__(self):
        return self.df.shape[0]

In [9]:
def collate_fn(batch, tokenizer):
    texts_1, texts_2, scores = list(), list(), list()
    for sample in batch:
        text1 = str(sample['title_1']).lower().strip()+str(sample['meta_description_1']).lower().strip()#+str(sample['text_1']).lower().strip()
        text2 = str(sample['title_2']).lower().strip()+str(sample['meta_description_2']).lower().strip()#+str(sample['text_2']).lower().strip()

        score = torch.tensor([sample['score']])
        texts_1.append(text1)
        texts_2.append(text2)
        scores.append(score)

    texts_1 = tokenizer(texts_1, truncation=True, padding=True, return_tensors='pt')
    texts_2 = tokenizer(texts_2, truncation=True, padding=True, return_tensors='pt')
    scores = torch.cat(scores, dim=0).unsqueeze(1)

    return texts_1, texts_2, scores

In [10]:
class MLNSDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, dev_dataset, test_dataset, train_batch_size, dev_batch_size, collate_fn, tokenizer):
        super(MLNSDataModule, self).__init__()
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset
        self.train_batch_size = train_batch_size
        self.dev_batch_size = dev_batch_size
        self.collate_fn = collate_fn
        self.tokenizer = tokenizer

    def train_dataloader(self):
        collate_partial = partial(self.collate_fn, tokenizer=self.tokenizer)
        return DataLoader(self.train_dataset, shuffle=True, batch_size=self.train_batch_size, collate_fn=collate_partial)

    def val_dataloader(self):
        collate_partial = partial(self.collate_fn, tokenizer=self.tokenizer)
        return DataLoader(self.dev_dataset, shuffle=False, batch_size=self.dev_batch_size, collate_fn=collate_partial)

    def test_dataloader(self):
        collate_partial = partial(self.collate_fn, tokenizer=self.tokenizer)
        return DataLoader(self.test_dataset, shuffle=False, batch_size=self.dev_batch_size, collate_fn=collate_partial)

    def predict_dataloader(self):
        return self.test_dataloader()

In [11]:
# tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
# encoder = XLMRobertaModel.from_pretrained('xlm-roberta-base')

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

encoder = AutoModel.from_pretrained("distilbert-base-multilingual-cased")

encoder

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(

In [12]:
train_df = pd.read_csv('~/semeval/semeval8-train-sss.csv', index_col=0)
dev_df = pd.read_csv('~/semeval/semeval8-dev.csv', index_col=0)
test_df = pd.read_csv('~/semeval/semeval-2022-task-8-eval-df.csv', index_col=0)

train_df.shape, dev_df.shape, test_df.shape

((4467, 20), (497, 20), (4953, 20))

In [13]:
train_dataset = MultilingualNewsSimDataset(train_df)
dev_dataset = MultilingualNewsSimDataset(dev_df)
test_dataset = MultilingualNewsSimDataset(test_df)

len(train_dataset), len(dev_dataset), len(test_dataset)

(4467, 497, 4953)

In [14]:
collate_partial = partial(collate_fn, tokenizer=tokenizer)
dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_partial)
batch = next(iter(dataloader))
batch

({'input_ids': tensor([[   101,  31206,  37715,  10251,  20637,    118,  11264,  19113,  17375,
             187,  10269,  61512,  10305,  34693,  12426,  10151,  67972,  10171,
           10790,  53771, 103099,    117,  10128,  10106,  64169,  15880, 108000,
           10115,  10166,  11170,  32194,    118,  10270,    118,  38607,  10136,
           10496,  13078,  52806,  53833,    119,  10242,  20637,  11264,  19113,
           17375,  23930,  12426,    187,  10269,  61512,  10305,  34693,  25919,
           10615,    119,  10128,  79601,  50640,  13238,  10136,  11566,  29956,
           41224,  10136,    119,    102,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
  

In [15]:
# model.device

In [16]:
# x1, x2, _ = batch
# outputs = model(x1.to(DEVICE), x2.to(DEVICE))
# outputs.shape

In [17]:
# outputs

In [18]:
data_module = MLNSDataModule(train_dataset, dev_dataset, test_dataset, TRAIN_BATCH_SIZE, DEV_BATCH_SIZE, collate_fn=collate_fn, tokenizer=tokenizer)
data_module

<__main__.MLNSDataModule at 0x153075246430>

In [19]:
model = LitSimilarityRegressor(encoder, embed_size=768, hidden_size=512)


# model = LitSimilarityRegressor.load_from_checkpoint(encoder, )
model

LitSimilarityRegressor(
  (model): SimilarityRegressor(
    (encoder): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn): FFN(
     

In [20]:
logger = pl.loggers.WandbLogger(save_dir=EXP_NAME, project=EXP_NAME, log_model=False)
logger

<pytorch_lightning.loggers.wandb.WandbLogger at 0x15306cf7b940>

In [21]:
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(EXP_NAME, 'lightning-checkpoints'),
    filename='{epoch}-{step}',
    # monitor='dev/pcc',
    # mode='max',
    save_top_k=-1,
    verbose=True,
    save_last=True,
    save_weights_only=False,
    every_n_epochs=1
)


ModelCheckpoint(save_last=True, save_top_k=-1, monitor=None) will duplicate the last checkpoint saved.


In [22]:
trainer = pl.Trainer(
    max_epochs=10,
    accumulate_grad_batches=ACCUMULATE_GRAD,
    accelerator='gpu',
    gpus=1,
    # overfit_batches=10,
    check_val_every_n_epoch=1, val_check_interval=0.25,
    log_every_n_steps=2, enable_progress_bar=True,
    gradient_clip_val=0.25, track_grad_norm=2,
    enable_checkpointing=True,
    callbacks=[checkpoint_callback],
    logger=logger,
    enable_model_summary=True
)
    
trainer

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


<pytorch_lightning.trainer.trainer.Trainer at 0x15306bb19dc0>

In [23]:
trainer.fit(model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name       | Type                        | Params
-----------------------------------------------------------
0 | model      | SimilarityRegressor         | 135 M 
1 | train_loss | MeanMetric                  | 0     
2 | dev_loss   | MeanMetric                  | 0     
3 | train_mape | MeanAbsolutePercentageError | 0     
4 | dev_mape   | MeanAbsolutePercentageError | 0     
5 | train_pcc  | PearsonCorrCoef             | 0     
6 | dev_pcc    | PearsonCorrCoef             | 0     
-----------------------------------------------------------
135 M     Trainable params
0         Non-trainable params
135 M     Total params
541.563   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_deprecation(
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdhaval08[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Saving latest checkpoint...


In [26]:
! ls '/scratch/dhaval.taunk/mlns-distilbert-regressor_concat_3_not_linear/lightning-checkpoints/'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch=0-step=104.ckpt  epoch=3-step=524.ckpt  epoch=7-step=1014.ckpt
epoch=0-step=139.ckpt  epoch=3-step=559.ckpt  epoch=7-step=1049.ckpt
epoch=0-step=34.ckpt   epoch=4-step=594.ckpt  epoch=7-step=1084.ckpt
epoch=0-step=69.ckpt   epoch=4-step=629.ckpt  epoch=7-step=1119.ckpt
epoch=1-step=174.ckpt  epoch=4-step=664.ckpt  epoch=8-step=1154.ckpt
epoch=1-step=209.ckpt  epoch=4-step=699.ckpt  epoch=8-step=1189.ckpt
epoch=1-step=244.ckpt  epoch=5-step=734.ckpt  epoch=8-step=1224.ckpt
epoch=1-step=279.ckpt  epoch=5-step=769.ckpt  epoch=8-step=1259.ckpt
epoch=2-step=314.ckpt  epoch=5-step=804.ckpt  epoch=9-step=1294.ckpt
epoch=2-step=349.ckpt  epoch=5-step=839.ckpt  epoch=9-step=1329.ckpt
epoch=2-step=384.ckpt  epoc

In [27]:
chkpt_path = '/scratch/dhaval.taunk/mlns-distilbert-regressor_concat_3_not_linear/lightning-checkpoints/epoch=5-step=769.ckpt'
test_pred = trainer.predict(model, datamodule=data_module, ckpt_path=chkpt_path)
len(test_pred)

Restoring states from the checkpoint path at /scratch/dhaval.taunk/mlns-distilbert-regressor_concat_3_not_linear/lightning-checkpoints/epoch=5-step=769.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /scratch/dhaval.taunk/mlns-distilbert-regressor_concat_3_not_linear/lightning-checkpoints/epoch=5-step=769.ckpt
  rank_zero_warn(


Predicting: 1117it [00:00, ?it/s]

620

In [28]:
all_outputs = list()
for batch_outputs in test_pred:
    all_outputs.append(batch_outputs['preds'])
all_outputs = torch.cat(all_outputs, dim=0)

all_outputs.shape

torch.Size([4953, 1])

In [29]:
pd.DataFrame(list(zip(test_df.pair_id, all_outputs.squeeze(1).tolist())), columns=['pair_id', 'Overall']).to_csv('test-results.csv')

In [30]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
mlns-distilbert-regressor
mlns-distilbert-regressor_concat_3
mlns-distilbert-regressor_concat_3_not_linear
mlns-xlmr-regressor-relu
mlns-xlmr-regressor-relu-test-results.csv
results_10_epochs_distbert_x1_x2_x1-x2.zip
results.zip
test-results.csv
torch-cache
transformers


In [31]:
!zip results_10_epochs_distbert_x1_x2_x1-x2not_linear.zip test-results.csv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: test-results.csv (deflated 54%)


In [32]:
! ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
mlns-distilbert-regressor
mlns-distilbert-regressor_concat_3
mlns-distilbert-regressor_concat_3_not_linear
mlns-xlmr-regressor-relu
mlns-xlmr-regressor-relu-test-results.csv
results_10_epochs_distbert_x1_x2_x1-x2not_linear.zip
results_10_epochs_distbert_x1_x2_x1-x2.zip
results.zip
test-results.csv
torch-cache
transformers


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…