In [4]:
import sys
sys.path.append('..')

import torch
from torch import nn
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

from transformers import AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoTokenizer
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import GlueDataset, default_data_collator, glue_tasks_num_labels

from tqdm import tqdm, trange

from hans.utils_hans import HansDataset, hans_processors
from models.lstm import LSTM

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
variant = 'bert-base-uncased'

In [7]:
mnli_data_args = DataTrainingArguments(task_name = 'mnli', data_dir = '/home/nlp/data/glue_data/MNLI')
hans_data_args = DataTrainingArguments(task_name = 'hans', data_dir = '/home/nlp/data/glue_data/hans')

In [8]:
num_labels = glue_tasks_num_labels[mnli_data_args.task_name]

In [9]:
config = AutoConfig.from_pretrained(variant, num_labels=num_labels)
model = AutoModel.from_pretrained(variant, config=config)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [10]:
mnli_train_dataset = GlueDataset(mnli_data_args, tokenizer)
mnli_eval_dataset = GlueDataset(mnli_data_args, tokenizer, mode="dev")
hans_eval_dataset = HansDataset(
            data_dir=hans_data_args.data_dir,
            tokenizer=tokenizer,
            task=hans_data_args.task_name,
            max_seq_length=hans_data_args.max_seq_length,
            overwrite_cache=hans_data_args.overwrite_cache,
            evaluate=True
        )

In [11]:
def hans_data_collator(features):
    """
    Data collator that removes the "pairID" key if present.
    """
    batch = default_data_collator(features)
    _ = batch.pop("pairID", None)
    return batch

In [12]:
def put_on_cuda(model, batch):
    for k,v in batch.items():
        batch[k] = v.cuda()
    return model.cuda(), batch

In [13]:
mnli_train_dl = DataLoader(mnli_train_dataset,collate_fn = default_data_collator,
                            batch_size=1024, shuffle=False, drop_last=True)
mnli_eval_dl = DataLoader(mnli_eval_dataset,collate_fn = default_data_collator,
                            batch_size=128, shuffle=False, drop_last=True)
hans_eval_dl = DataLoader(hans_eval_dataset,collate_fn = hans_data_collator,
                            batch_size=128, shuffle=False)

In [11]:
class LSTM_Config():
    def __init__(self, num_labels, n_cells, n_layers, n_embed, d_embed,
                d_proj, d_hidden, d_out=None, projection=False):
        self.d_out = num_labels
        self.n_cells = n_cells
        self.n_layers = n_layers
        self.n_layers = 1
        self.n_embed = n_embed
        self.d_embed = d_embed
        self.d_proj = d_proj
        self.d_hidden = d_hidden
        self.projection = False
        self.dp_ratio = 0.2
        self.birnn = True
        self.fix_emb = False
        self.projection = projection

In [14]:
class HEXProjection(nn.Module):
    def __init__(self, dim):
        super(HEXProjection, self).__init__()
        self.summarization_params = nn.Linear(dim, dim)
    
    def forward(self, x, y):
        prelim_loss = torch.cat([x, y])
        prelim_pred = torch.cat([torch.zeros_like(x), y])
        prelim_H = torch.cat([x, torch.zeros_like(x)])
        y_loss = self.summarization_params(prelim_loss)
        y_pred = self.summarization_params(prelim_pred)
        y_H = self.summarization_params(prelim_H)
        
        inverse_inside = torch.pinverse(torch.matmul(torch.transpose(y_H, 0, 1), y_H))
        
        y_loss = y_loss - torch.matmul(
                                torch.matmul(
                                    torch.matmul(y_H, inverse_inside), 
                                    torch.transpose(y_H, 0, 1))
                                , y_loss)
        return y_loss

In [13]:
lstm_config = LSTM_Config(num_labels = 3, 
                          n_cells = 2,
                          n_layers = 2,
                          n_embed = len(tokenizer.vocab), 
                          d_embed = 768, 
                          d_proj = 768//2, 
                          d_hidden = 768//2)

In [14]:
from models.lstm import LSTM

In [15]:
lstm = LSTM(lstm_config)

In [28]:
class OrthogonalTransformer(nn.Module):
    def __init__(self, network_a, network_b, num_labels, batch_size):
        super(OrthogonalTransformer, self).__init__()
        self.network_a = network_a
        self.network_b = network_b
        self.hex = HEXProjection(768)
        self.out_1 = nn.Linear(768, 3)
        self.out_2 = nn.Linear(batch_size*2, batch_size)

    def forward(self, batch):
        batch.pop('labels');
        output_a = self.network_a(**batch)[1]
        output_b = self.network_b(batch['input_ids'])
        projected_logits = self.hex(output_a, output_b)
        output = self.out_1(projected_logits)
        return self.out_2(output.t()).t()

In [68]:
batch = next(iter(mnli_eval_dl))

In [30]:
orthogonal_tfmr = OrthogonalTransformer(model, lstm, 3, 128)

In [31]:
orthogonal_tfmr, batch = put_on_cuda(orthogonal_tfmr, batch)

In [33]:
output = orthogonal_tfmr(batch)

In [34]:
output.shape

torch.Size([128, 3])

## Training LSTM classifier

In [13]:
from models.lstm import Bottle, Linear, Encoder

In [15]:
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

In [66]:
class LSTM(nn.Module):
    def __init__(self, bert_config, config):
        super(LSTM, self).__init__()
        self.config = config
        self.embed = BertEmbeddings(bert_config)
        self.projection = Linear(config.d_embed, config.d_proj)
        self.encoder = Encoder(config)
        self.dropout = nn.Dropout(p=config.dp_ratio)
        self.relu = nn.ReLU()
        seq_in_size = config.d_hidden
        if self.config.birnn:
            seq_in_size *= 2
        lin_config = [seq_in_size]*2
        self.out = nn.Sequential(
            Linear(*lin_config),
            self.relu,
            self.dropout,
            Linear(*lin_config),
            self.relu,
            self.dropout,
            Linear(*lin_config),
            self.relu,
            self.dropout,
            Linear(seq_in_size, config.d_out))

    def forward(self, batch):
        embed = self.embed(batch['input_ids'], batch['token_type_ids'])
        if self.config.projection:
            embed = self.relu(self.projection(embed))
        print(embed.shape)
        embed = self.encoder(embed)
        print(embed.shape)
        return self.out(embed)

In [67]:
lstm_config = LSTM_Config(num_labels = 3, 
                          n_cells = 2,
                          n_layers = 4,
                          n_embed = len(tokenizer.vocab), 
                          d_embed = 768, 
                          d_proj = 768, 
                          d_hidden = 768,
                          d_out = 3,
                          projection = True)

In [68]:
lstm = LSTM(config, lstm_config)

In [69]:
optim = torch.optim.AdamW(lstm.parameters(), 1e-3)
criterion = nn.CrossEntropyLoss()
lstm.train().cuda();

In [70]:
num_epochs = 2

In [43]:
import math
class BertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
    ):
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        if encoder_hidden_states is not None:
            mixed_key_layer = self.key(encoder_hidden_states)
            mixed_value_layer = self.value(encoder_hidden_states)
            attention_mask = encoder_attention_mask
        else:
            mixed_key_layer = self.key(hidden_states)
            mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs

In [66]:
class CBOW(nn.Module):
    def __init__(self, config):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = BertEmbeddings(config)
        self.attention = BertSelfAttention(config)
        self.linear1 = nn.Linear(config.hidden_size, config.hidden_size)
        self.act_fn = nn.ReLU()
        
        #out: 1 x vocab_size
        self.linear2 = nn.Linear(config.hidden_size, config.hidden_size)
        

    def forward(self, inputs):
        embeds = self.embeddings(inputs['input_ids'], inputs['token_type_ids'])
        out = self.attention(embeds)[0].sum(1)
        out = self.act_fn(out)
        out = self.linear2(out)
        return out

In [67]:
cbow = CBOW(config).cuda()

In [68]:
cbow(batch).shape

torch.Size([1024, 768])

In [64]:
optim = torch.optim.AdamW(cbow.parameters(), 1e-3)
criterion = nn.CrossEntropyLoss()
cbow.train();

In [65]:
for epoch in trange(5):
    for idx, batch in enumerate(mnli_train_dl):
        optim.zero_grad()
        for k, v in batch.items():
            batch[k] = v.cuda()

        loss = criterion(cbow(batch), batch['labels'])
        loss.backward()
        optim.step()

    print(loss.item())

 20%|██        | 1/5 [01:17<05:08, 77.06s/it]

1.149527668952942


 40%|████      | 2/5 [02:34<03:51, 77.32s/it]

1.0053470134735107


 60%|██████    | 3/5 [03:53<02:35, 77.55s/it]

0.9910863637924194


 80%|████████  | 4/5 [05:11<01:17, 77.70s/it]

0.965848445892334


100%|██████████| 5/5 [06:29<00:00, 77.80s/it]

1.0136642456054688





In [1]:
torch.nn.init.xavier_normal(nn.Linear(20, 20))

NameError: name 'torch' is not defined

In [4]:
import torch
from torch import nn

In [3]:
x = torch.rand(128, 768)
y = torch.rand(128, 768)

In [5]:
x = nn.Linear(768, 128)(x)
y = nn.Linear(768, 128)(y)

In [6]:
x.shape

torch.Size([128, 128])

In [7]:
k = torch.cat([x, y], dim=1)

In [8]:
k.shape

torch.Size([128, 256])

In [9]:
k = torch.matmul(k, k.t())

In [11]:
k.shape

torch.Size([128, 128])

In [12]:
torch.inverse(k)

tensor([[ 0.3045,  0.0074,  0.0055,  ...,  0.0111,  0.0035,  0.0050],
        [ 0.0074,  0.3605, -0.0142,  ..., -0.0242, -0.0330, -0.0365],
        [ 0.0055, -0.0142,  0.2974,  ..., -0.0293, -0.0055,  0.0318],
        ...,
        [ 0.0111, -0.0242, -0.0293,  ...,  0.2929,  0.0047, -0.0404],
        [ 0.0035, -0.0330, -0.0055,  ...,  0.0047,  0.3649, -0.0539],
        [ 0.0050, -0.0365,  0.0318,  ..., -0.0404, -0.0539,  0.3125]],
       grad_fn=<InverseBackward>)

In [15]:
k.shape

torch.Size([128, 128])

In [17]:
torch.eye(*k.shape)

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])

In [27]:
class m(nn.Module):
    def __init__(self):
        super(m, self).__init__()
        self.k = torch.eye(1)

In [28]:
M = m().cuda()

In [29]:
M.k.device

device(type='cpu')