In [59]:
from torch import nn
import torch

In [60]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased")
model = AutoModel.from_pretrained("dbmdz/bert-base-german-uncased")

Some weights of the model checkpoint at dbmdz/bert-base-german-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
from read_data import read_data_sets
dataset = read_data_sets()
train = dataset.train
val = dataset.validation

In [62]:
tokenizer.tokenize(train.words[3])

['schwarz', '##burg', '##under']

In [63]:
train.words[0]

'magistratsamt'

In [64]:
tokens = tokenizer.encode(train.words[3], return_tensors = "pt")

In [65]:
test = model(tokens)

In [23]:
test['last_hidden_state'].shape

torch.Size([1, 5, 768])

In [26]:
test['pooler_output'].shape

torch.Size([1, 768])

In [148]:
class GenusBertOneMatr(nn.Module):
    def __init__(self, hidden_size=768, num_heads = 1, genus_count = 3):
        super(GenusBertOneMatr, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first = True)
        self.genus_embs = nn.Parameter(nn.init.normal_(torch.empty(1, genus_count, hidden_size))) # one vector per genus
        
        self.out = nn.Linear(self.hidden_size, 1) # projecting one vector per genus to logit 
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        X, weights = self.attn(self.genus_embs, input, input) # 3 x hidden_size
        return self.out(X), weights
        
        

In [149]:
genus_model = GenusBertOneMatr()

In [150]:
genus_model(test['last_hidden_state'])

(tensor([[[ 0.0048],
          [-0.0560],
          [-0.0171]]], grad_fn=<AddBackward0>),
 tensor([[[0.1424, 0.2137, 0.2742, 0.2269, 0.1428],
          [0.2231, 0.1973, 0.1546, 0.2051, 0.2199],
          [0.1701, 0.3033, 0.1826, 0.1774, 0.1667]]], grad_fn=<DivBackward0>))

In [45]:
test['last_hidden_state'].shape

torch.Size([1, 5, 768])

In [66]:
tmp = nn.MultiheadAttention(768, 1)

In [68]:
x = test['last_hidden_state']
tmp(x,x,x)

(tensor([[[-0.6550,  0.1419,  0.2060,  ..., -0.5220,  0.0300, -0.1792],
          [ 0.2017, -0.4177, -0.2337,  ...,  0.1948,  0.2709,  0.1184],
          [ 0.3893, -0.0604,  0.1256,  ...,  0.1840,  0.3574,  0.2834],
          [-0.0193, -0.2267, -0.4263,  ...,  0.2137,  0.5133,  0.2211],
          [-0.6675,  0.1415,  0.2633,  ..., -0.5262,  0.0356, -0.1859]]],
        grad_fn=<AddBackward0>),
 tensor([[[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]]], grad_fn=<DivBackward0>))