In [None]:
# BigBird 모형 : 더 긴 입력 시퀀스를 처리할 수 있는 모형

# 2048 토큰 처리(BERT의 4배, 512x4)

In [1]:
from transformers import BigBirdTokenizer, BigBirdForMaskedLM
import torch

# 모델과 토크나이저 불러오기
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
model = BigBirdForMaskedLM.from_pretrained('google/bigbird-roberta-base')
model

Downloading spiece.model: 100%|██████████| 846k/846k [00:00<00:00, 66.8MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)cial_tokens_map.json: 100%|██████████| 775/775 [00:00<00:00, 194kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 1.02k/1.02k [00:00<00:00, 81.9kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 760/760 [00:00<00:00, 152kB/s]
Downloading pytorch_model.bin: 100%|██████████| 513M/513M [00:04<00:00, 113MB/s] 


BigBirdForMaskedLM(
  (bert): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(50358, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0-11): 12 x BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdBlockSparseAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
           

In [2]:
inputs = ["I like reading [MASK].", "I like driving a [MASK].","The world is facing with a [MASK] [MASK] crisis. We are all suffering from infectious diseases."]

answers = ["I like reading book.", "I like driving a car.", "The world is facing with a pandemic crisis. We are all suffering from infectious diseases."]

In [3]:
encoded_inputs = []
encoded_labels =  []

for i, l in zip(inputs, answers):
    encoded_inputs.append(tokenizer(i, return_tensors="pt"))
    encoded_labels.append(tokenizer(l, return_tensors="pt")["input_ids"])

encoded_inputs

[{'input_ids': tensor([[  65,  415,  689, 3656,   67,  865,   66]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[  65,  415,  689, 5160,  358,   67,  865,   66]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[   65,   484,  1096,   419,  6577,   452,   358,    67,    67,  5003,
            114,   876,   490,   578,  7296,   523, 25347, 10141,   114,    66]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}]

In [4]:
encoded_labels

[tensor([[  65,  415,  689, 3656, 1593,  114,   66]]),
 tensor([[  65,  415,  689, 5160,  358, 1198,  114,   66]]),
 tensor([[   65,   484,  1096,   419,  6577,   452,   358, 19899,  5415,  5003,
            114,   876,   490,   578,  7296,   523, 25347, 10141,   114,    66]])]

In [5]:
#추론 모드로 실행

for input, label in zip(encoded_inputs, encoded_labels):
    outputs = model(**input, labels=label)
    loss = outputs.loss
    logits = outputs.logits
    print(f"loss：{loss.item()}")
    print(f"prediction：{' '.join([tokenizer.decode(logits[0][i].argmax(-1)) for i in range(1, len(logits[0]))])}")
    print(f"answer：{tokenizer.decode(label[0][1:-1])}")
    print('\n')

Attention type 'block_sparse' is not possible if sequence_length: 7 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


loss：11.183554649353027
prediction：i like reading it . i
answer：I like reading book.


loss：8.30691146850586
prediction：its like driving a car . a
answer：I like driving a car.


loss：4.29605770111084
prediction：the world is facing with a global health crisis . we are all suffering from infectious diseases . .
answer：The world is facing with a pandemic crisis. We are all suffering from infectious diseases.




In [6]:
# Pegasus : 문장 요약에 특화된 사전 학습 모형, 구글 2020 발표

from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# 모델명
model_name = 'google/pegasus-xsum'
device = 'cpu'

# 모델 및 토크나이저 불러오기
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

Downloading (…)ve/main/spiece.model: 100%|██████████| 1.91M/1.91M [00:00<00:00, 4.90MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 65.0/65.0 [00:00<00:00, 16.3kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 87.0/87.0 [00:00<00:00, 14.5kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.39k/1.39k [00:00<00:00, 189kB/s]
Downloading pytorch_model.bin: 100%|██████████| 2.28G/2.28G [00:21<00:00, 108MB/s] 
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading (…)neration_config.json: 100%|██████████| 259/259 [00:00<00:00, 21.7kB/s]


In [7]:
inputs = [
          """
          Pretraining large neural language models, such as BERT, has led to impressive gains on many natural language processing (NLP) tasks. However, most pretraining efforts focus on general domain corpora, such as newswire and Web. A prevailing assumption is that even domain-specific pretraining can benefit by starting from general-domain language models. Recent work shows that for domains with abundant unlabeled text, such as biomedicine, pretraining language models from scratch results in substantial gains over continual pretraining of general-domain language models.
          """]

batch = tokenizer(inputs, truncation=True, padding='longest', return_tensors="pt").to(device)

In [8]:
# 요약 문장 생성

translated = model.generate(**batch)

generated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

print(generated_text[0])

Pretraining large neural language models can lead to substantial gains over continual pretraining of general-domain language models.
