In [2]:
import logging
import sys
sys.path.append("..")
from dataclasses import dataclass, field
from typing import List, Union, Optional, Dict

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader

from transformers import (AutoConfig, AutoModelForSequenceClassification, AutoModel,
                          AutoTokenizer, PreTrainedTokenizer, PreTrainedModel)
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import TrainingArguments
from datasets.siamese_dataset import SiameseGlueDataset, siamese_data_collator
from models.siamese_model import SiameseTransformer
# from models.siamese_pooling import Pooling
from core.siamese_trainer import SiameseTrainer

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
logger = logging.getLogger(__name__)

In [5]:
task_name = 'mnli'
data_dir = '/home/nlp/data/glue_data/MNLI'
model_id = 'bert-base-uncased'

In [23]:
args = DataTrainingArguments(task_name, data_dir = data_dir, max_seq_length=32)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [24]:
siamese_train_dataset = SiameseGlueDataset(args, tokenizer)

In [9]:
len(siamese_train_dataset[0][0].input_ids)

64

In [25]:
siamese_eval_dataset = SiameseGlueDataset(args, tokenizer, mode="dev")

In [11]:
from transformers import PreTrainedModel

In [12]:
from datasets.siamese_dataset import siamese_data_collator

In [31]:
train_dl = DataLoader(siamese_train_dataset,
                      batch_size=8,
                     collate_fn = siamese_data_collator)

In [32]:
batch = next(iter(train_dl))

In [38]:
@dataclass
class SiameseModelArguments:
    """
    Arguments pertaining to SiameseTransformer
    """

    model_name: str = field(
        metadata={
            "help": (
                "Path to pretrained model or model identifier from"
                " huggingface.co/models"
            )
        }
    )
    #input_dim: int = field(
    #    default=None, metadata={"help": "Input dimension of linear layer"}
    #)
    #linear_dim: int = field(
    #    default=None, metadata={"help": "Dimension of linear layer"}
    #)
    seq_len: int = field(default = 128)
    config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained config name or path if not the same as model_name"
        },
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name"
        },
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Where do you want to store the pretrained models downloaded from s3"
            )
        },
    )
    freeze_a: bool = field(default=False, metadata={"help": "freeze model a"})
    freeze_b: bool = field(default=False, metadata={"help": "freeze model b"})
    num_labels: int = field(default=3)

In [96]:
class PredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d((8, 128))
        self.dense = nn.Linear(4096, len(config.id2label))
                
    def forward(self, features):
        features = self.pool(features)
        print(features.shape)
        features = features.view(features.shape[0]//4, -1)
        features = self.dense(features)
        return features

In [97]:
class SiameseTransformer(nn.Module):
    def __init__(self, args, config):
        super(SiameseTransformer, self).__init__()
        self.args = args
        self.model_a = AutoModel.from_pretrained(self.args.model_name, 
                           config=config, cache_dir=self.args.cache_dir)
        self.loss_fct = nn.CrossEntropyLoss()
        self.cls = PredictionHead(config)
        
        if self.args.freeze_a:
            logger.info("**** Freezing Model A ****")
            for param in self.model_a.encoder.parameters():
                param.requires_grad = False

        if self.args.freeze_b:
            logger.info("**** Freezing Model B ****")
            for param in self.model_b.encoder.parameters():
                param.requires_grad = False
    
    def forward(self, input_a, input_b):
        labels = input_a['labels']
        input_a.pop('labels')
        input_b.pop('labels')
        output_a = self.model_a(**input_a)[0] # [bs, seq_len, 768]
        output_b = self.model_a(**input_b)[0]
        concat_output = torch.cat([output_a, output_b, (output_a-output_b), (output_a*output_b)])
        logits = self.cls(concat_output)
        loss = self.loss_fct(logits, labels)
        return loss, logits

In [306]:
from transformers.modeling_bert import ACT2FN, BertLayerNorm

In [91]:
args = SiameseModelArguments('bert-base-uncased', seq_len=64)

In [41]:
config = AutoConfig.from_pretrained(
        'bert-base-uncased',
        num_labels = 3,
        task_name = 'MNLI',
        cache_dir = '/home/nlp/experiments/siamese'
    )

In [135]:
class PredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d((8, 128))
        self.dense = nn.Linear(4096, len(config.id2label))
                
    def forward(self, features):
        features = self.pool(features)
        print(features.shape)
        features = features.view(features.shape[0]//4, -1)
        features = self.dense(features)
        return features

In [136]:
head = PredictionHeadTransform(config)

In [139]:
output = torch.rand(8, 64, 768) # 8, 3

In [140]:
head(output).shape

torch.Size([8, 8, 128])


torch.Size([2, 3])

In [297]:
head

PredictionHead(
  (dense_1): Linear(in_features=109, out_features=768, bias=True)
  (dense_2): Linear(in_features=612, out_features=3, bias=True)
  (pool1): AvgPool2d(kernel_size=7, stride=7, padding=0)
  (pool2): AvgPool2d(kernel_size=5, stride=5, padding=0)
)

In [98]:
model = SiameseTransformer(args, config)

In [99]:
batch = next(iter(train_dl))

In [100]:
for k, v in batch['a'].items():
    batch['a'][k] = v.cuda()
for k, v in batch['b'].items():
    batch['b'][k] = v.cuda()

In [101]:
model.cuda();

In [102]:
output = model(batch['a'], batch['b'])

In [104]:
output[1].shape

torch.Size([8, 3])

In [None]:
output[1]

In [99]:
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [95]:
from transformers import AutoModel

In [96]:
model_a = AutoModel.from_pretrained('bert-base-uncased', 
                           config=config).cuda()

In [97]:
# batch['a'].pop('labels')
output_a = model_a(**batch['a'])

In [98]:
len(output_a)

2