In [1]:
import logging
import sys
sys.path.append("..")
from dataclasses import dataclass, field
from typing import List, Union, Optional, Dict, Callable

import numpy as np
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader

from transformers import (AutoConfig, AutoModelForSequenceClassification, AutoModel,
                          AutoTokenizer, PreTrainedTokenizer, PreTrainedModel)
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import TrainingArguments, default_data_collator, EvalPrediction, GlueDataset
from datasets.siamese_dataset import SiameseGlueDataset, siamese_data_collator
from models.siamese_model import SiameseTransformer
from core.siamese_trainer import SiameseTrainer
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
logger = logging.getLogger(__name__)

In [4]:
task_name = 'mnli'
data_dir = '/home/nlp/data/glue_data/MNLI'
model_id = 'bert-base-uncased'

In [27]:
data_args = DataTrainingArguments(task_name, data_dir = data_dir, max_seq_length=128)
mm_data_args = DataTrainingArguments('mnli-mm', data_dir = data_dir, max_seq_length=128)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [7]:
siamese_train_dataset = SiameseGlueDataset(data_args, tokenizer)

In [8]:
siamese_eval_dataset = SiameseGlueDataset(data_args, tokenizer, mode="dev")

In [9]:
from transformers import PreTrainedModel

In [10]:
from datasets.siamese_dataset import siamese_data_collator

In [11]:
train_dl = DataLoader(siamese_train_dataset,
                      batch_size=256,
                     collate_fn = siamese_data_collator, shuffle=True)

In [12]:
eval_dl = DataLoader(siamese_train_dataset,
                      batch_size=256,
                     collate_fn = siamese_data_collator)

In [13]:
for inputs in train_dl:
    k1 = inputs['a']['input_ids']
    k2 = inputs['b']['input_ids']
    break

In [14]:
k1.shape

torch.Size([256, 32])

In [15]:
@dataclass
class SiameseModelArguments:
    """
    Arguments pertaining to SiameseTransformer
    """

    model_name: str = field(
        metadata={
            "help": (
                "Path to pretrained model or model identifier from"
                " huggingface.co/models"
            )
        }
    )
    #input_dim: int = field(
    #    default=None, metadata={"help": "Input dimension of linear layer"}
    #)
    #linear_dim: int = field(
    #    default=None, metadata={"help": "Dimension of linear layer"}
    #)
    # seq_len: int = field(default = 128)
    config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained config name or path if not the same as model_name"
        },
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name"
        },
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Where do you want to store the pretrained models downloaded from s3"
            )
        },
    )
    freeze_a: bool = field(default=False, metadata={"help": "freeze model a"})
    freeze_b: bool = field(default=False, metadata={"help": "freeze model b"})
    num_labels: int = field(default=3)
    batch_size: int = field(default=128)

In [16]:
class PredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d((8, 128))
        self.dense = nn.Linear(4096, len(config.id2label))
                
    def forward(self, features):
        features = self.pool(features)
        print(features.shape)
        features = features.view(features.shape[0]//4, -1)
        features = self.dense(features)
        return features

In [17]:
# class SiameseTransformer(nn.Module):
#     def __init__(self, args, config):
#         super(SiameseTransformer, self).__init__()
#         self.args = args
#         self.model_a = AutoModelForSequenceClassification.from_pretrained(self.args.model_name, 
#                            config=config, cache_dir=self.args.cache_dir)
#         self.loss_fct = nn.CrossEntropyLoss()
#         #self.cls = PredictionHeadTransform(config)
#         self.cls = nn.Linear(512, len(config.id2label))
#         # self.pool = nn.AdaptiveAvgPool2d((args.batch_size, len(config.id2label)))
#         #if self.args.freeze_a:
#         #    logger.info("**** Freezing Model A ****")
#         #    for param in self.model_a.encoder.parameters():
#         #        param.requires_grad = False

#         #if self.args.freeze_b:
#         #    logger.info("**** Freezing Model B ****")
#         #    for param in self.model_b.encoder.parameters():
#         #        param.requires_grad = False
    
#     def forward(self, a, b):
#         #labels = input_a['labels']
#         #input_a.pop('labels')
#         #input_b.pop('labels')
#         output_a = self.model_a(**a) # [bs, seq_len, 768]
#         output_b = self.model_a(**b)
#         outputs = []
#         outputs.append(output_a[0]+output_b[0])
#         concat_output = torch.cat([output_a[1], output_b[1]])
#         concat_output = concat_output.view(3, -1)
#         logits = self.cls(concat_output)
#         outputs.append(logits)
#         #loss = self.loss_fct(logits, labels)
#         return outputs

In [19]:
args = SiameseModelArguments('bert-base-uncased', batch_size=2048)

In [20]:
config = AutoConfig.from_pretrained(
        'bert-base-uncased',
        num_labels = 3,
        task_name = 'MNLI',
        cache_dir = '/home/nlp/experiments/siamese'
    )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




In [20]:
# head = PredictionHeadTransform(config)
# output = torch.rand(8, 64, 768) # 8, 3
# head(output).shape
# head

In [21]:
class SiameseTransformer(nn.Module):
    def __init__(self, args, config):
        super(SiameseTransformer, self).__init__()
        self.args = args
        self.model_a = AutoModelForSequenceClassification.from_pretrained(
            self.args.model_name, config=config, cache_dir=self.args.cache_dir
        )
        self.model_b = AutoModelForSequenceClassification.from_pretrained(
            self.args.model_name, config=config, cache_dir=self.args.cache_dir
        )

        self.loss_fct = nn.CrossEntropyLoss()
        # self.cls = PredictionHeadTransform(config)
        # self.cls = nn.Linear(len(config.id2label), len(config.id2label))
        # if self.args.freeze_a:
        #    logger.info("**** Freezing Model A ****")
        #    for param in self.model_a.encoder.parameters():
        #        param.requires_grad = False

        # if self.args.freeze_b:
        #    logger.info("**** Freezing Model B ****")
        #    for param in self.model_b.encoder.parameters():
        #        param.requires_grad = False

    def forward(self, a, b):
        # labels = input_a['labels']
        # input_a.pop('labels')
        # input_b.pop('labels')
        output_a = self.model_a(**a)  # [bs, seq_len, 768]
        output_b = self.model_b(**b)
        outputs = []
        for i in range(len(output_a)):
            outputs.append(output_a[i] + output_b[i])

        # concat_output = torch.cat([output_a[1], output_b[1]])
        # logits = self.cls(concat_output)
        # outputs.append(logits)
        # loss = self.loss_fct(logits, labels)
        return outputs

In [22]:
model = SiameseTransformer(args, config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [22]:
batch = next(iter(train_dl))

In [23]:
# batch_eval = next(iter(eval_dl))

In [24]:
for k, v in batch['a'].items():
    batch['a'][k] = v.cuda()
for k, v in batch['b'].items():
    batch['b'][k] = v.cuda()

In [25]:
model.cuda();

In [26]:
output = model(**batch)

In [27]:
output[1].shape

torch.Size([3, 3])

In [41]:
ckpt = torch.load('/home/nlp/experiments/siamese/pytorch_model.bin')

In [42]:
model.load_state_dict(ckpt['model_state_dict'])

<All keys matched successfully>

In [32]:
output[1].shape

torch.Size([1024, 3])

In [81]:
model(**batch_eval)

(tensor(1.1629, device='cuda:0', grad_fn=<NllLossBackward>),
 tensor([[ 0.1034, -0.1250, -0.1374],
         [-0.2326, -0.4631,  0.0924],
         [-0.5709, -0.4091,  0.1518],
         [-0.4632, -0.3334,  0.1755],
         [-0.0604, -0.2989,  0.0755],
         [-0.5815, -0.2825,  0.1942],
         [-0.2083, -0.3863,  0.1067],
         [-0.2706, -0.2518, -0.0087]], device='cuda:0', grad_fn=<AddmmBackward>))

In [103]:
output[0]

tensor(2.0891, device='cuda:0', grad_fn=<AddBackward0>)

In [23]:
from transformers import Trainer, glue_compute_metrics
from core.siamese_trainer import SiameseTrainer

In [24]:
training_args = TrainingArguments(output_dir = '/home/nlp/experiments/siamese/',
                                 do_eval = True,
                                 per_device_train_batch_size=1024,
                                 per_device_eval_batch_size=1024)

In [25]:
output_mode = "classification"

In [26]:
def build_compute_metrics_fn(task_name: str,) -> Callable[[EvalPrediction], Dict]:
    def compute_metrics_fn(p: EvalPrediction) -> Dict:
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(p.predictions)
        return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

    return compute_metrics_fn

trainer = SiameseTrainer(
    model=model,
    args=training_args,
    train_dataset=siamese_train_dataset,
    eval_dataset=siamese_eval_dataset,
    data_collator=siamese_data_collator,
    compute_metrics=build_compute_metrics_fn(data_args.task_name))

wandb: ERROR Error uploading "wandb-metadata.json": CommError, /tmp/tmp_g8zo58zwandb/1uh8tge9-wandb-metadata.json is an empty file


In [44]:
build_compute_metrics_fn('mnli')

<function __main__.build_compute_metrics_fn.<locals>.compute_metrics_fn(p: transformers.trainer_utils.EvalPrediction) -> Dict>

In [28]:
mm_siamese_eval_dataset = SiameseGlueDataset(data_args, tokenizer, mode="dev")

In [29]:
trainer.evaluate(mm_siamese_eval_dataset)

Evaluation: 100%|██████████| 5/5 [00:25<00:00,  5.00s/it]

9815 9815





{'eval_loss': 2.286630964279175, 'eval_mnli/acc': 0.32786551197147223}

In [36]:
len(siamese_eval_dataset)

9815

In [95]:
from transformers import AutoModel

In [96]:
model_a = AutoModel.from_pretrained('bert-base-uncased', 
                           config=config).cuda()

In [97]:
# batch['a'].pop('labels')
output_a = model_a(**batch['a'])

In [98]:
len(output_a)

2