In [49]:
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import TSNE
import seaborn as sns

In [9]:
input_sequence = "This article is about the online encyclopedia. For Wikipedia's home page, see Main Page. For the English edition, see English Wikipedia."
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
input_tokens = tokenizer(input_sequence, return_tensors="pt")

In [65]:
print(input_tokens)

{'input_ids': tensor([[    0,   713,  1566,    16,    59,     5,   804, 45975,     4,   286,
         28274,    18,   184,  1842,     6,   192,  4326,  7086,     4,   286,
             5,  2370,  5403,     6,   192,  2370, 28274,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}


In [15]:
print(len(input_tokens["input_ids"]))
print(len(input_tokens))
print(len(input_sequence))
print(input_tokens["input_ids"].shape)

1
2
136
torch.Size([1, 29])


In [16]:
config = RobertaConfig.from_pretrained("roberta-base")
config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [17]:
model = RobertaModel.from_pretrained("roberta-base")
test_model = RobertaModel(config)

# print(model(tokenizer(input)))
print(input_tokens)
print(type(input_tokens))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': tensor([[    0,   713,  1566,    16,    59,     5,   804, 45975,     4,   286,
         28274,    18,   184,  1842,     6,   192,  4326,  7086,     4,   286,
             5,  2370,  5403,     6,   192,  2370, 28274,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}
<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [18]:
output = model(**input_tokens)

In [41]:
last_hidden_state = output.last_hidden_state
print(last_hidden_state)
print(last_hidden_state.shape) # [batch_size, 문장길이, 단어벡터의 차원]
last_hidden_state_np = last_hidden_state.squeeze().detach().numpy()
print(last_hidden_state_np.shape)

tensor([[[-0.0687,  0.1148, -0.0107,  ..., -0.0608, -0.0302, -0.0069],
         [-0.0199,  0.3111,  0.1839,  ...,  0.2201,  0.1864,  0.1628],
         [-0.0253,  0.0368,  0.1066,  ..., -0.0771,  0.0607,  0.0478],
         ...,
         [-0.0151,  0.0967, -0.0219,  ...,  0.0838, -0.1042,  0.1099],
         [-0.0540,  0.1086, -0.0280,  ..., -0.0952, -0.0375, -0.0346],
         [ 0.0469,  0.1201,  0.0909,  ...,  0.0650, -0.0264,  0.0491]]],
       grad_fn=<NativeLayerNormBackward0>)
torch.Size([1, 29, 768])
(29, 768)


In [50]:
n_components=2
tsne = TSNE(n_components=n_components)

In [78]:
last_hidden_state_tsne = tsne.fit_transform(last_hidden_state_np)
print(last_hidden_state_tsne)

[[ 169.28554     62.8543   ]
 [ -65.55237    -51.58612  ]
 [  16.001173   -37.516373 ]
 [  74.11719    167.63075  ]
 [   8.458613   198.87854  ]
 [  -7.0442247  101.87071  ]
 [ -11.319729   -90.55644  ]
 [-124.7632     -82.01748  ]
 [ 125.291824   117.89238  ]
 [ 129.39835    -59.046787 ]
 [ -30.500294    51.803234 ]
 [ 121.84996     13.545472 ]
 [  82.46415     59.452553 ]
 [-144.22597    -11.623636 ]
 [-147.31456     66.97384  ]
 [ -87.263084    22.025677 ]
 [  48.475094   106.53284  ]
 [ -39.06154    151.66096  ]
 [ 108.340416  -132.92804  ]
 [ -29.243252    -4.8324723]
 [  28.659584    29.939804 ]
 [  61.484184   -83.973625 ]
 [  19.125887  -206.90794  ]
 [-105.74046    170.90448  ]
 [ -83.42256     94.89753  ]
 [ 187.00021    -19.219046 ]
 [  71.10636    -18.402475 ]
 [ -66.057785  -132.78914  ]
 [  20.543941  -145.74377  ]]
