In [1]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForMaskedLM, AdamW, get_scheduler

import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm
from transformers import pipeline
from kobert_tokenizer import KoBERTTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 데이터 셋 적재
raw_datasets = load_dataset('mitclinicalml/clinical-ie', 'coreference')

Downloading builder script: 100%|██████████| 3.76k/3.76k [00:00<00:00, 6.81MB/s]
Downloading readme: 100%|██████████| 5.53k/5.53k [00:00<00:00, 13.7MB/s]


Downloading and preparing dataset clinical-ie/coreference to /home/s1/serimkim/.cache/huggingface/datasets/mitclinicalml___clinical-ie/coreference/1.0.3/5a94c4c9014c297b41f864a521b81b02d896202de0171cc958e44805e0713fee...


Downloading data: 100%|██████████| 2.87k/2.87k [00:00<00:00, 7.84MB/s]
Downloading data: 100%|██████████| 54.4k/54.4k [00:00<00:00, 275kB/s]
Downloading data: 100%|██████████| 4.04k/4.04k [00:00<00:00, 13.1MB/s]
Downloading data: 100%|██████████| 70.4k/70.4k [00:00<00:00, 355kB/s]
Downloading data: 100%|██████████| 2.34k/2.34k [00:00<00:00, 6.45MB/s]
Downloading data: 100%|██████████| 44.7k/44.7k [00:00<00:00, 21.3MB/s]
Downloading data files: 100%|██████████| 6/6 [00:05<00:00,  1.05it/s]
Extracting data files: 100%|██████████| 6/6 [00:00<00:00, 779.47it/s]
                                                             

Dataset clinical-ie downloaded and prepared to /home/s1/serimkim/.cache/huggingface/datasets/mitclinicalml___clinical-ie/coreference/1.0.3/5a94c4c9014c297b41f864a521b81b02d896202de0171cc958e44805e0713fee. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 401.23it/s]


In [3]:
raw_datasets

DatasetDict({
    validation: Dataset({
        features: ['index', 'snippet', 'pronoun', 'antecedents', 'pronoun_sentence'],
        num_rows: 5
    })
    test: Dataset({
        features: ['index', 'snippet', 'pronoun', 'antecedents', 'pronoun_sentence'],
        num_rows: 100
    })
})

---
# teacher & student 모델, 토크나이저 불러오기

In [4]:
# bert 모델 및 토크나이저 불러오기
checkpoint = "bert-base-uncased"
bert_model = AutoModelForMaskedLM.from_pretrained(checkpoint)
bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_tokenizer

Downloading: 100%|██████████| 570/570 [00:00<00:00, 1.79MB/s]
Downloading: 100%|██████████| 440M/440M [00:36<00:00, 11.9MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 41.9kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 583kB/s] 
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 1.14MB/s]


PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# teacher 모델 및 토크나이저 불러오기
teacher_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
teacher_model = AutoModelForMaskedLM.from_pretrained(teacher_checkpoint)
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_checkpoint)
teacher_makes_answers_of = pipeline(task='fill-mask', model=teacher_model, tokenizer=teacher_tokenizer)
teacher_tokenizer

Downloading: 100%|██████████| 385/385 [00:00<00:00, 1.71MB/s]
Downloading: 100%|██████████| 436M/436M [00:38<00:00, 11.2MB/s]   
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 541kB/s] 


PreTrainedTokenizerFast(name_or_path='emilyalsentzer/Bio_ClinicalBERT', vocab_size=28996, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [6]:
# student 모델 및 토크나이저 불러오기
student_checkpoint = "skt/kobert-base-v1"
student_model = AutoModelForMaskedLM.from_pretrained(student_checkpoint)
student_tokenizer = KoBERTTokenizer.from_pretrained(student_checkpoint)
student_tokenizer

Downloading: 100%|██████████| 535/535 [00:00<00:00, 1.03MB/s]
Downloading: 100%|██████████| 369M/369M [00:31<00:00, 11.5MB/s] 
Some weights of BertForMaskedLM were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PreTrainedTokenizer(name_or_path='skt/kobert-base-v1', vocab_size=8002, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [7]:
# student2 모델 및 토크나이저 불러오기
stu_checkpoint = "monologg/distilkobert"
stu_model = AutoModelForMaskedLM.from_pretrained(stu_checkpoint)
stu_tokenizer = KoBERTTokenizer.from_pretrained(student_checkpoint)
stu_makes_answers_of = pipeline(task='fill-mask', model=stu_model, tokenizer=stu_tokenizer)
stu_tokenizer

PreTrainedTokenizer(name_or_path='skt/kobert-base-v1', vocab_size=8002, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [8]:
stu_model.num_parameters()/1000000
stu_tokenizer.model_max_length

1000000000000000019884624838656

In [9]:
# kmbert 모델 및 토크나이저 불러오기
km_checkpoint = "madatnlp/km-bert"
km_model = AutoModelForMaskedLM.from_pretrained(km_checkpoint)
km_tokenizer = AutoTokenizer.from_pretrained(km_checkpoint)
km_makes_answers_of = pipeline(task='fill-mask', model=km_model, tokenizer=km_tokenizer)
km_tokenizer

Downloading: 100%|██████████| 679/679 [00:00<00:00, 1.69MB/s]
Downloading: 100%|██████████| 395M/395M [00:33<00:00, 11.6MB/s] 
Some weights of BertForMaskedLM were not initialized from the model checkpoint at madatnlp/km-bert and are newly initialized: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading: 100%|██████████| 395/395 [00:00<00:00, 767kB/s]
Downloading: 100%|██████████| 104k/104k [00:00<00:00, 507kB/s] 
Downloading: 100%|██████████| 358k/358k [00:00<00:00, 902kB/s] 
Downloading: 100%|██████████| 125/125 [00:00<00:00, 509kB/s]


PreTrainedTokenizerFast(name_or_path='madatnlp/km-bert', vocab_size=16424, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

---
# soft label => student 학습용 sentence 변환

In [10]:
# 예제 생성
text = "This is a 50-year-old African American male with a history of hepatitis C."
masked_text = "This is a 50-year-old African American male with a [MASK] of hepatitis C."

In [11]:
masked_text.replace('[MASK]', '[MASK]'*3)

'This is a 50-year-old African American male with a [MASK][MASK][MASK] of hepatitis C.'

In [12]:
ta = teacher_makes_answers_of(masked_text)

In [13]:
import pandas as pd

pd.DataFrame(ta)
ta

[{'sequence': 'this is a 50 - year - old african american male with a history of hepatitis c.',
  'score': 0.9929597973823547,
  'token': 1607,
  'token_str': 'history'},
 {'sequence': 'this is a 50 - year - old african american male with a diagnosis of hepatitis c.',
  'score': 0.006644753273576498,
  'token': 12645,
  'token_str': 'diagnosis'},
 {'sequence': 'this is a 50 - year - old african american male with a type of hepatitis c.',
  'score': 0.0001249209453817457,
  'token': 2076,
  'token_str': 'type'},
 {'sequence': 'this is a 50 - year - old african american male with a background of hepatitis c.',
  'score': 4.7995472414186224e-05,
  'token': 3582,
  'token_str': 'background'},
 {'sequence': 'this is a 50 - year - old african american male with a History of hepatitis c.',
  'score': 2.931490416813176e-05,
  'token': 2892,
  'token_str': 'History'}]

In [14]:
teacher_tokenizer.convert_ids_to_tokens(teacher_tokenizer(masked_text)['input_ids'][19])   # tokenized sentence에서 19번 자리가 mask임.

'[MASK]'

In [15]:
# [MASK] token 예측
teacher_model(**teacher_tokenizer(masked_text, return_tensors='pt')).logits[0][19]

tensor([-3.2400, -4.5728, -4.8088,  ..., -3.5481, -5.4297, -5.4963],
       grad_fn=<SelectBackward0>)

In [16]:
# 각 예측값에 대한 score계산
torch.topk(torch.softmax(teacher_model(**teacher_tokenizer(masked_text, return_tensors='pt')).logits[0][19], dim=0), 5)

torch.return_types.topk(
values=tensor([9.9296e-01, 6.6448e-03, 1.2492e-04, 4.7995e-05, 2.9315e-05],
       grad_fn=<TopkBackward0>),
indices=tensor([ 1607, 12645,  2076,  3582,  2892]))

In [17]:
# top 5 token
torch.topk(teacher_model(**teacher_tokenizer(masked_text, return_tensors='pt')).logits[0][19], 5)

torch.return_types.topk(
values=tensor([19.0246, 14.0177, 10.0438,  9.0873,  8.5942], grad_fn=<TopkBackward0>),
indices=tensor([ 1607, 12645,  2076,  3582,  2892]))

In [18]:
teacher_answer = teacher_tokenizer.convert_ids_to_tokens(torch.topk(teacher_model(**teacher_tokenizer(masked_text, return_tensors='pt')).logits[0][19], 5).indices)
teacher_answer

['history', 'diagnosis', 'type', 'background', 'History']

In [19]:
[len(_) for _ in stu_tokenizer(teacher_answer)['input_ids']]

[8, 11, 7, 9, 7]

In [20]:
soft_label = teacher_makes_answers_of(masked_text)
soft_label

[{'sequence': 'this is a 50 - year - old african american male with a history of hepatitis c.',
  'score': 0.9929597973823547,
  'token': 1607,
  'token_str': 'history'},
 {'sequence': 'this is a 50 - year - old african american male with a diagnosis of hepatitis c.',
  'score': 0.006644753273576498,
  'token': 12645,
  'token_str': 'diagnosis'},
 {'sequence': 'this is a 50 - year - old african american male with a type of hepatitis c.',
  'score': 0.0001249209453817457,
  'token': 2076,
  'token_str': 'type'},
 {'sequence': 'this is a 50 - year - old african american male with a background of hepatitis c.',
  'score': 4.7995472414186224e-05,
  'token': 3582,
  'token_str': 'background'},
 {'sequence': 'this is a 50 - year - old african american male with a History of hepatitis c.',
  'score': 2.931490416813176e-05,
  'token': 2892,
  'token_str': 'History'}]

In [21]:
for i in range(len(soft_label)):


SyntaxError: unexpected EOF while parsing (2943754332.py, line 1)

In [None]:
km_tokenizer("ibuprofen")

{'input_ids': [2, 10016, 1347, 6955, 2535, 1636, 1520, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
l = ['history', 'diagnosis', 'type', 'background']

In [None]:
stu_tokenizer.convert_ids_to_tokens(stu_tokenizer(l)['input_ids'][0])

['[CLS]', '▁', 'h', 'is', 't', 'or', 'y', '[SEP]']

In [None]:
stu_makes_answers_of('this is a 50 - year - old african american male with a [MASK][MASK][MASK][MASK][MASK][MASK] of hepatitis c.')

[[{'score': 0.12830208241939545,
   'token': 517,
   'token_str': '',
   'sequence': '[CLS] this is a 50 - year - old african american male with a [MASK][MASK][MASK][MASK][MASK] of hepatitis c.[SEP]'},
  {'score': 0.018993770703673363,
   'token': 2355,
   'token_str': '보',
   'sequence': '[CLS] this is a 50 - year - old african american male with a 보[MASK][MASK][MASK][MASK][MASK] of hepatitis c.[SEP]'},
  {'score': 0.01776224933564663,
   'token': 54,
   'token_str': '.',
   'sequence': '[CLS] this is a 50 - year - old african american male with a.[MASK][MASK][MASK][MASK][MASK] of hepatitis c.[SEP]'},
  {'score': 0.014646179042756557,
   'token': 2186,
   'token_str': '바',
   'sequence': '[CLS] this is a 50 - year - old african american male with a 바[MASK][MASK][MASK][MASK][MASK] of hepatitis c.[SEP]'},
  {'score': 0.009664971381425858,
   'token': 5468,
   'token_str': '과',
   'sequence': '[CLS] this is a 50 - year - old african american male with a과[MASK][MASK][MASK][MASK][MASK] of 

In [None]:
teacher_tokenizer.convert_ids_to_tokens(teacher_tokenizer('I took a ibuprofen.')['input_ids'])

['[CLS]', 'i', 'took', 'a', 'i', '##bu', '##p', '##ro', '##fen', '.', '[SEP]']

---
# input data 가공하기

In [None]:
# 랜덤 마스킹된 예제 생성
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=teacher_tokenizer, mlm_probability=0.15, return_tensors='pt')
tokenized_ex = teacher_tokenizer(text)
example_token = data_collator([tokenized_ex])   # 토큰 형태
example = teacher_tokenizer.decode(example_token['input_ids'][0])   # [MASK] 포함된 str 형태
print(example_token,'\n마스킹전 : This is a 50-year-old African American male with a history of hepatitis C.\n 마스킹후 :', example)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  1142,  1110,   170,  1851,   118,  1214,   118,  1385,   170,
          2087,   103,  1179,  1821, 26237,  1389,  5107,  1114,   170,  1607,
          1104,  1119,  4163, 27659,   172,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]]), 'labels': tensor([[ -100,  1142,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100, 15353,  -100,  -100,  -100,  1389,  2581,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100]])} 
마스킹전 : This is a 50-year-old African American male with a history of hepatitis C.
 마스킹후 : [CLS] this is a 50 - year - old af [MASK]n american Kennedy with a history of hepatitis c. [SEP]


In [None]:
t_ans = teacher_makes_answers_of('this is a [MASK] - year - [MASK] african american male with a history of hepatitis c.')

In [None]:
stu_token = stu_tokenizer(example.replace('[CLS] ', '').replace(' [SEP]', ''))
stu_token

{'input_ids': [2, 517, 444, 412, 517, 412, 517, 367, 612, 524, 517, 458, 389, 375, 524, 517, 428, 388, 517, 367, 398, 4, 517, 425, 517, 373, 394, 406, 374, 659, 392, 425, 390, 458, 517, 455, 405, 444, 517, 367, 517, 401, 412, 442, 430, 458, 707, 517, 401, 389, 432, 377, 413, 412, 705, 54, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
stu_token['labels'] = [-100 for _ in range(len(stu_token['input_ids']))]

In [None]:
stu_token

{'input_ids': [2, 517, 444, 412, 517, 412, 517, 367, 612, 524, 517, 458, 389, 375, 524, 517, 428, 388, 517, 367, 398, 4, 517, 425, 517, 373, 394, 406, 374, 659, 392, 425, 390, 458, 517, 455, 405, 444, 517, 367, 517, 401, 412, 442, 430, 458, 707, 517, 401, 389, 432, 377, 413, 412, 705, 54, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -

In [None]:
stu_token.pop('token_type_ids')
stu_token.pop('attention_mask')

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [None]:
stu_token

{'input_ids': [2, 517, 444, 412, 517, 412, 517, 367, 612, 524, 517, 458, 389, 375, 524, 517, 428, 388, 517, 367, 398, 4, 517, 425, 517, 373, 394, 406, 374, 659, 392, 425, 390, 458, 517, 455, 405, 444, 517, 367, 517, 401, 412, 442, 430, 458, 707, 517, 401, 389, 432, 377, 413, 412, 705, 54, 3], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}

In [None]:
list(filter(lambda i:stu_token['input_ids'][i] == 4, range(len(stu_token['input_ids']))))

[21]

In [None]:
stu_dict = {
    "input_ids" : [],
    "labels" : []
}

In [None]:
masked_idx = list(filter(lambda i:stu_token['input_ids'][i] == 4, range(len(stu_token['input_ids']))))
for i in range(len(t_ans)):
    mask = masked_idx[i-1]
    for j in range(5):
        token = stu_token['input_ids']
        label = stu_token['labels']
        new_ans = stu_tokenizer(t_ans[i][j]['token_str'])['input_ids']
        del new_ans[0]
        del new_ans[-1]
        token[mask:mask] = [4, 4]
        label[mask:mask] = new_ans
        stu_token['input_ids'].append(token)
        stu_token['labels'].append(label)
print(len(stu_token['input_ids']))

87


In [None]:
stu_dict['input_ids'].append(stu_token['input_ids'])

In [None]:
len(stu_dict['input_ids'])

1

In [None]:
stu_token=practice

In [None]:
practice['input_ids'][4] = torch.Tensor(4, 4)

IndexError: index 4 is out of bounds for dimension 0 with size 1

In [None]:
torch.Tensor([[-100., -100., -100., -100., -100., -100.,]])

In [None]:
stu_ex = stu_tokenizer("[MASK] is a 50 - year - [MASK] af [MASK]n american male with a history of hepatitis c.")

In [None]:
print(stu_tokenizer.convert_ids_to_tokens(stu_ex['input_ids']))

['[CLS]', '[MASK]', '▁', 'is', '▁', 'a', '▁50', '▁-', '▁', 'y', 'e', 'ar', '▁-', '[MASK]', '▁', 'a', 'f', '[MASK]', '▁', 'n', '▁', 'am', 'er', 'ic', 'an', '▁', 'm', 'al', 'e', '▁', 'w', 'i', 'th', '▁', 'a', '▁', 'h', 'is', 't', 'or', 'y', '▁of', '▁', 'h', 'e', 'p', 'at', 'it', 'is', '▁c', '.', '[SEP]']


In [None]:
stu_tokenizer.convert_ids_to_tokens([2, 517, 405, 380, 446, 432, 439, 398, 392, 3])

['[CLS]', '▁', 'i', 'b', 'u', 'p', 'ro', 'f', 'en', '[SEP]']

In [None]:
teacher_makes_answers_of('ibu[MASK][MASK]fen')

[[{'score': 0.14180906116962433,
   'token': 2087,
   'token_str': '##f',
   'sequence': '[CLS] ibuf [MASK] fen [SEP]'},
  {'score': 0.061251960694789886,
   'token': 1643,
   'token_str': '##p',
   'sequence': '[CLS] ibup [MASK] fen [SEP]'},
  {'score': 0.04479588568210602,
   'token': 185,
   'token_str': 'p',
   'sequence': '[CLS] ibu p [MASK] fen [SEP]'},
  {'score': 0.03340088948607445,
   'token': 3361,
   'token_str': '##j',
   'sequence': '[CLS] ibuj [MASK] fen [SEP]'},
  {'score': 0.0332258976995945,
   'token': 175,
   'token_str': 'f',
   'sequence': '[CLS] ibu f [MASK] fen [SEP]'}],
 [{'score': 0.07432229071855545,
   'token': 1233,
   'token_str': '##l',
   'sequence': '[CLS] ibu [MASK]l fen [SEP]'},
  {'score': 0.05445404723286629,
   'token': 118,
   'token_str': '-',
   'sequence': '[CLS] ibu [MASK] - fen [SEP]'},
  {'score': 0.04285497963428497,
   'token': 120,
   'token_str': '/',
   'sequence': '[CLS] ibu [MASK] / fen [SEP]'},
  {'score': 0.027396807447075844,
   't

In [None]:
teacher_makes_answers_of(example)

[[{'score': 0.9501201510429382,
   'token': 1114,
   'token_str': 'with',
   'sequence': '[CLS] [CLS] this is a 50 - year - old african american male with [MASK] history of he [MASK] titis c [MASK] [SEP] [SEP]'},
  {'score': 0.022348498925566673,
   'token': 192,
   'token_str': 'w',
   'sequence': '[CLS] [CLS] this is a 50 - year - old african american male w [MASK] history of he [MASK] titis c [MASK] [SEP] [SEP]'},
  {'score': 0.009296688251197338,
   'token': 117,
   'token_str': ',',
   'sequence': '[CLS] [CLS] this is a 50 - year - old african american male, [MASK] history of he [MASK] titis c [MASK] [SEP] [SEP]'},
  {'score': 0.0044685062021017075,
   'token': 172,
   'token_str': 'c',
   'sequence': '[CLS] [CLS] this is a 50 - year - old african american male c [MASK] history of he [MASK] titis c [MASK] [SEP] [SEP]'},
  {'score': 0.0025676703080534935,
   'token': 1150,
   'token_str': 'who',
   'sequence': '[CLS] [CLS] this is a 50 - year - old african american male who [MASK] 

In [None]:
print(km_tokenizer.convert_ids_to_tokens(km_tokenizer(text)['input_ids']), '\n')
print(km_tokenizer.convert_ids_to_tokens(km_tokenizer(masked_text)['input_ids']))

['[CLS]', 'Th', '##is', 'i', '##s', 'a', '50', '-', 'y', '##ea', '##r', '-', 'o', '##ld', 'A', '##f', '##ri', '##c', '##an', 'A', '##me', '##ri', '##c', '##an', 'm', '##al', '##e', 'w', '##it', '##h', 'a', 'h', '##ist', '##or', '##y', 'of', 'h', '##e', '##p', '##at', '##it', '##is', 'C', '.', '[SEP]'] 

['[CLS]', 'Th', '##is', 'i', '##s', 'a', '50', '-', 'y', '##ea', '##r', '-', 'o', '##ld', '[MASK]', 'A', '##me', '##ri', '##c', '##an', 'm', '##al', '##e', 'w', '##it', '##h', 'a', '[MASK]', 'of', 'h', '##e', '##p', '##at', '##it', '##is', 'C', '.', '[SEP]']


In [None]:
print(teacher_tokenizer.convert_ids_to_tokens(teacher_tokenizer(text)['input_ids']), '\n')
print(teacher_tokenizer.convert_ids_to_tokens(teacher_tokenizer(masked_text)['input_ids']))

['[CLS]', 'this', 'is', 'a', '50', '-', 'year', '-', 'old', 'a', '##f', '##rica', '##n', 'am', '##eric', '##an', 'male', 'with', 'a', 'history', 'of', 'he', '##pa', '##titis', 'c', '.', '[SEP]'] 

['[CLS]', 'this', 'is', 'a', '50', '-', 'year', '-', 'old', '[MASK]', 'am', '##eric', '##an', 'male', 'with', 'a', '[MASK]', 'of', 'he', '##pa', '##titis', 'c', '.', '[SEP]']


In [None]:
print(student_tokenizer.convert_ids_to_tokens(student_tokenizer(text)['input_ids']), '\n')
print(student_tokenizer.convert_ids_to_tokens(student_tokenizer(masked_text)['input_ids']))

['[CLS]', '▁T', 'h', 'is', '▁', 'is', '▁', 'a', '▁50', '-', 'y', 'e', 'ar', '-', 'ol', 'd', '▁A', 'f', 'ri', 'c', 'an', '▁A', 'm', 'er', 'ic', 'an', '▁', 'm', 'al', 'e', '▁', 'w', 'i', 'th', '▁', 'a', '▁', 'h', 'is', 't', 'or', 'y', '▁of', '▁', 'h', 'e', 'p', 'at', 'it', 'is', '▁C', '.', '[SEP]'] 

['[CLS]', '▁T', 'h', 'is', '▁', 'is', '▁', 'a', '▁50', '-', 'y', 'e', 'ar', '-', 'ol', 'd', '[MASK]', '▁A', 'm', 'er', 'ic', 'an', '▁', 'm', 'al', 'e', '▁', 'w', 'i', 'th', '▁', 'a', '[MASK]', '▁of', '▁', 'h', 'e', 'p', 'at', 'it', 'is', '▁C', '.', '[SEP]']


In [None]:
stu_token = stu_tokenizer('this is a 50 - year - old african american male [MASK] [MASK] history of he [MASK]titis')
print(stu_token)

{'input_ids': [2, 517, 444, 412, 517, 412, 517, 367, 612, 524, 517, 458, 389, 375, 524, 517, 428, 388, 517, 367, 398, 438, 382, 374, 517, 373, 394, 406, 374, 517, 423, 371, 389, 4, 4, 517, 401, 412, 442, 430, 458, 707, 517, 401, 389, 4, 517, 442, 413, 412, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
mask_list = list(filter(lambda x: stu_token['input_ids'][x] == 4, range(len(stu_token['input_ids']))))
print(mask_list)

[33, 34, 45]


In [None]:
stu_token['input_ids'] == [103,]

False

In [None]:
torch.topk(teacher_model(**teacher_tokenizer(text, return_tensors="pt"))['logits'][0][9], k=5, dim =0)

torch.return_types.topk(
values=tensor([8.6365, 7.0715, 7.0263, 6.8887, 6.7199], grad_fn=<TopkBackward0>),
indices=tensor([ 117, 2581, 1653, 3120, 1938]))

---
# 데이터 전처리

In [None]:
tokenized_datasets = teacher_tokenizer(raw_datasets['test']['snippet'])
tokenized_datasets_eval = teacher_tokenizer(raw_datasets['validation']['snippet'])

In [None]:
tokenized_datasets

{'input_ids': [[101, 12398, 23897, 131, 122, 119, 15242, 4412, 15684, 17713, 185, 119, 184, 119, 171, 119, 178, 119, 173, 119, 123, 119, 3073, 22834, 7614, 1162, 126, 17713, 185, 119, 184, 119, 186, 119, 170, 119, 182, 119, 124, 119, 3073, 22834, 7614, 1162, 128, 119, 126, 17713, 185, 119, 184, 119, 186, 13282, 125, 119, 2608, 2087, 2260, 17713, 185, 119, 184, 119, 171, 119, 178, 119, 173, 119, 126, 119, 1139, 18389, 11708, 189, 2180, 4386, 122, 185, 119, 184, 119, 186, 119, 178, 119, 173, 119, 127, 119, 171, 11179, 10205, 188, 1116, 122, 27629, 1830, 185, 119, 184, 119, 186, 1285, 128, 119, 176, 1389, 6617, 1665, 14185, 3161, 2260, 17713, 185, 119, 184, 119, 189, 119, 178, 119, 173, 119, 129, 119, 1231, 4027, 1320, 1405, 17713, 185, 119, 184, 119, 186, 119, 177, 119, 188, 119, 130, 119, 172, 11194, 20192, 1476, 17713, 185, 119, 184, 119, 186, 1285, 1275, 119, 1884, 17510, 1851, 17713, 185, 119, 184, 119, 186, 1285, 185, 119, 187, 119, 183, 119, 14255, 2050, 9717, 1891, 2812, 1146, 131

In [None]:
#전체 텍스트를 chunk_size로 쪼갬

chunk_size = 128

def group_texts(examples):
    # 모든 텍스트들을 결합한다.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # 결합된 텍스트들에 대한 길이를 구한다.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    print('total_length의 길이 :',total_length)
    # `chunk_size`보다 작은 경우 마지막 청크를 삭제
    total_length = (total_length // chunk_size) * chunk_size
    # max_len 길이를 가지는 chunk 단위로 슬라이스
    result = {
        k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # 새로운 레이블 컬럼을 생성
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = group_texts(tokenized_datasets)
lm_datasets

total_length의 길이 : 10326


{'input_ids': [[101,
   12398,
   23897,
   131,
   122,
   119,
   15242,
   4412,
   15684,
   17713,
   185,
   119,
   184,
   119,
   171,
   119,
   178,
   119,
   173,
   119,
   123,
   119,
   3073,
   22834,
   7614,
   1162,
   126,
   17713,
   185,
   119,
   184,
   119,
   186,
   119,
   170,
   119,
   182,
   119,
   124,
   119,
   3073,
   22834,
   7614,
   1162,
   128,
   119,
   126,
   17713,
   185,
   119,
   184,
   119,
   186,
   13282,
   125,
   119,
   2608,
   2087,
   2260,
   17713,
   185,
   119,
   184,
   119,
   171,
   119,
   178,
   119,
   173,
   119,
   126,
   119,
   1139,
   18389,
   11708,
   189,
   2180,
   4386,
   122,
   185,
   119,
   184,
   119,
   186,
   119,
   178,
   119,
   173,
   119,
   127,
   119,
   171,
   11179,
   10205,
   188,
   1116,
   122,
   27629,
   1830,
   185,
   119,
   184,
   119,
   186,
   1285,
   128,
   119,
   176,
   1389,
   6617,
   1665,
   14185,
   3161,
   2260,
   17713,
   185,
  

In [None]:
lm_datasets_eval = group_texts((tokenized_datasets_eval))

total_length의 길이 : 553


In [None]:
pad_collator = DataCollatorWithPadding(tokenizer=teacher_tokenizer)

In [None]:
pad_collator(tokenized_datasets,)

{'input_ids': tensor([[  101, 12398, 23897,  ...,     0,     0,     0],
        [  101,  1119,  2886,  ...,     0,     0,     0],
        [  101,   123,   119,  ...,     0,     0,     0],
        ...,
        [  101,  3463,  2781,  ...,     0,     0,     0],
        [  101,  3463,  2781,  ...,     0,     0,     0],
        [  101, 16679,   117,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
from datasets import Dataset
import datasets

dataset_dict = datasets.DatasetDict({"train":Dataset.from_dict(lm_datasets), "test":Dataset.from_dict(lm_datasets_eval)})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 80
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4
    })
})

---
# dataset dict 만들기

In [None]:
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

dataset_dict = datasets.DatasetDict({"train":Dataset.from_dict(lm_datasets), "test":Dataset.from_dict(lm_datasets_eval)})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 80
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4
    })
})

---
# model 훈련 준비

In [None]:
# 각 종류별 데이터 로더 생성
train_dataloader = DataLoader(dataset_dict["train"],
                              shuffle=True,
                              batch_size=8,
                              collate_fn=data_collator)
eval_dataloader = DataLoader(dataset_dict["test"],
                             shuffle=True,
                             batch_size=8,
                             collate_fn=data_collator)

In [None]:
dataset_dict = dataset_dict.rename_column('token_type_ids', 'masked_sentence')

In [None]:
data_collator([dataset_dict['train'][0]['input_ids']])

{'input_ids': tensor([[  101, 12398,   103,   131,   122,   119, 15242,  4412, 15684, 17713,
            185,   119,   103,   103,   103,   119,   103,   119,   173,   119,
            123,   119,  3073, 22834,   103,  1162,   126, 17713,   185,   119,
            184,   119,   186,   119,   170,   119,   182,   119,   124,   119,
            103,   103,  7614,  1162,   128,   119,   126, 17713,   185,   119,
            103,   119,   186, 13282,   125,   119,  2608,  2087,  2260, 17713,
            103,   119,   184,   119,   171,   119,   178,   119,   173,   119,
            126,   119,  1139, 18389,   103,   189,  2180, 25237,   122,   185,
            119,   184,   103,   186,   103,   178,   119,   173,   103,   103,
            103,   171, 11179,   103,   188,  1116,   122, 27629, 23565,   185,
            119,   103,   119,   186,  1285,   128,   119,   103,  1389,  6617,
           1665, 14185,  3161,  2260, 17713,   185,   119,   184,   119,   189,
            103,   178,   1

In [None]:
def random_masking(example):
    example['input_ids'] = data_collator([example['input_ids']])['input_ids']
    example['masked_sentence'] = teacher_tokenizer.decode(example['input_ids'])

In [None]:
masked_dict = dataset_dict['train'].map(random_masking)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [None]:
masked_dict

NameError: name 'masked_dict' is not defined

In [None]:
masked_dict['train'][0]

{'input_ids': [101,
  12398,
  23897,
  131,
  122,
  119,
  15242,
  4412,
  15684,
  17713,
  185,
  119,
  184,
  119,
  171,
  119,
  178,
  119,
  173,
  119,
  123,
  119,
  3073,
  22834,
  7614,
  1162,
  126,
  17713,
  185,
  119,
  184,
  119,
  186,
  119,
  170,
  119,
  182,
  119,
  124,
  119,
  3073,
  22834,
  7614,
  1162,
  128,
  119,
  126,
  17713,
  185,
  119,
  184,
  119,
  186,
  13282,
  125,
  119,
  2608,
  2087,
  2260,
  17713,
  185,
  119,
  184,
  119,
  171,
  119,
  178,
  119,
  173,
  119,
  126,
  119,
  1139,
  18389,
  11708,
  189,
  2180,
  4386,
  122,
  185,
  119,
  184,
  119,
  186,
  119,
  178,
  119,
  173,
  119,
  127,
  119,
  171,
  11179,
  10205,
  188,
  1116,
  122,
  27629,
  1830,
  185,
  119,
  184,
  119,
  186,
  1285,
  128,
  119,
  176,
  1389,
  6617,
  1665,
  14185,
  3161,
  2260,
  17713,
  185,
  119,
  184,
  119,
  189,
  119,
  178,
  119,
  173,
  119,
  129,
  119,
  1231],
 'masked_sentence': [0,
  0,
  0

In [None]:
for batch in eval_dataloader:
    for i in range(4):
        print(teacher_makes_answers_of(batch['input_ids'][0]))

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x160291940>

In [None]:
# 최적화 함수 정의
optimizer = AdamW(student_model.parameters(), lr=5e-5)
# 에포크 개수 설정
num_epochs = 3
# 학습 스텝 수 계산
num_training_steps = num_epochs * len(train_dataloader)
# 학습 스케쥴러 설정
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# GPU로 모델을 이동
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
student_model.to(device)

# 진행 상황바 정의
progress_bar = tqdm(range(num_training_steps))



  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# 모델을 학습 모드로 전환
student_model.train()
# 학습 루프 시작
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # 현재 배치 중에서 입력값을 모두 GPU로 이동.
        batch = {k: v.to(device) for k, v in batch.items()}
        # 모델 실행
        outputs = student_model(**batch)
        # 손실값 가져오기
        loss = outputs.loss   # loss값에 score곱하기
        # 역전파 수행
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# 평가 메트릭 가져오기
metric = load_metric('mitclinicalml/clinical-ie', 'coreference')
# 모델을 평가 모드로 전환
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

In [None]:
# 평가 결과 계산 및 출력
metric.compute()

In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')