### Outputs
- forward: embedding => encode => pooling
- len(outputs) : 3
- outputs[0]
  - last_hidden_state, shape: batch_size * seq_len * hidden_size (torch.Size([1, 36, 768]))
- outputs[1]
  - pooler_state, shape: batch_size * hidden_size (torch.Size([1, 768]))
  - the last layer hidden state of the first token of the sequence. ([CLS]
- outputs[2], only available when output_hidden_states = True or model.config.output_hidden_states = True
  - combination of embeddings outputs(1) + outputs of each hidden layer (12)
     - (1+12) * batch_size * seq_len * hidden_size (13*1*36*768)
- outputs[0] == outputs[2][-1]
- outputs[1] == model.pooler(outputs[2][-1])
- outputs[2][0] == model.embeddings(token_input['input_ids'], token_input['token_type_ids'])

In [1]:
from transformers import BertModel, BertTokenizer
model_name = 'bert-base-uncased'

In [2]:
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name, output_hidden_states = True)

In [4]:
sent1 = 'This study provides a multivariate analysis of the availability of food store outlets in the US and'\
'associations with neighborhood characteristics on race, ethnicity and socioeconomic status'

token_input = tokenizer(sent1, return_tensors='pt')
token_input

{'input_ids': tensor([[  101,  2023,  2817,  3640,  1037,  4800, 10755, 13143,  4106,  1997,
          1996, 11343,  1997,  2833,  3573, 11730,  1999,  1996,  2149,  1998,
         12054, 10085, 18963,  2015,  2007,  5101,  6459,  2006,  2679,  1010,
         18240,  1998, 17522, 23035,  3570,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [5]:
token_input['input_ids'], token_input['input_ids'].shape

(tensor([[  101,  2023,  2817,  3640,  1037,  4800, 10755, 13143,  4106,  1997,
           1996, 11343,  1997,  2833,  3573, 11730,  1999,  1996,  2149,  1998,
          12054, 10085, 18963,  2015,  2007,  5101,  6459,  2006,  2679,  1010,
          18240,  1998, 17522, 23035,  3570,   102]]),
 torch.Size([1, 36]))

In [7]:
import torch

model.eval()
with torch.no_grad():
    outputs = model(**token_input)

In [13]:
len(outputs) 

3

In [10]:
outputs[0] == outputs[2][-1]

tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]])

In [23]:
outputs[0].shape, outputs[1].shape

(torch.Size([1, 36, 768]), torch.Size([1, 768]))

In [16]:
outputs[1] == model.pooler(outputs[2][-1])

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, T

In [18]:
model.embeddings(token_input['input_ids'], token_input['token_type_ids'])

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [-0.6485,  0.6739, -0.0932,  ...,  0.4475,  0.6696,  0.1820],
         [-0.0772,  0.9152, -0.2559,  ...,  0.3321,  1.2170, -0.3592],
         ...,
         [-0.6935, -0.6749, -0.4088,  ..., -0.7283,  0.3733, -1.6357],
         [ 1.0875,  0.0683,  1.1294,  ..., -0.9309,  0.7401,  0.4236],
         [-0.0908, -0.2099,  0.0628,  ..., -0.7465,  0.4288, -0.2265]]],
       grad_fn=<NativeLayerNormBackward0>)

In [20]:
outputs[2][0] ==model.embeddings(token_input['input_ids'], token_input['token_type_ids'])

tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]])