# 모델 및 토크나이저 정의

In [1]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForMaskedLM, AdamW, get_scheduler
import datasets
import torch
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline
from kobert_tokenizer import KoBERTTokenizer
from transformers import TrainingArguments
from transformers import Trainer
import collections
import numpy as np
from transformers import default_data_collator

In [2]:
# Load the student model and its tokenizer.
# NOTE(review): the model weights come from "monologg/distilkobert" while the
# tokenizer is loaded from "skt/kobert-base-v1". Presumably both share the same
# vocabulary -- confirm, otherwise token ids will not match the model's embeddings.
stu_checkpoint = "monologg/distilkobert"
stu_model = AutoModelForMaskedLM.from_pretrained(stu_checkpoint)
stu_tokenizer = KoBERTTokenizer.from_pretrained("skt/kobert-base-v1")
# Fill-mask pipeline built from the student model and tokenizer loaded above.
stu_makes_answers_of = pipeline(task='fill-mask', model=stu_model, tokenizer=stu_tokenizer)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


# 데이터셋 로드

In [3]:
# Load the data used for fine-tuning.
# NOTE(review): `load_dataset` is already imported in the first cell, so this
# repeated import is redundant.
from datasets import load_dataset

clinical_dataset = load_dataset("starmpcc/Asclepius-Synthetic-Clinical-Notes")
# Display the DatasetDict summary.
clinical_dataset

Found cached dataset csv (/Users/serimkim/.cache/huggingface/datasets/starmpcc___csv/starmpcc--Asclepius-Synthetic-Clinical-Notes-819afb51239148d3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'note', 'question', 'answer', 'task'],
        num_rows: 158114
    })
})

# 텍스트 길이 조정

In [4]:
# Frame each note with sentence-boundary markers, then break it into a word list.
def into_list(example):
    """Return the row's note as whitespace-separated words framed by "[CLS]" and "[SEP]".

    Args:
        example: a mapping with a string stored under the "note" key.

    Returns:
        A dict with a single "note" key holding the list of words; the first
        element is "[CLS]" and the last element is "[SEP]".
    """
    framed_note = " ".join(("[CLS]", example["note"], "[SEP]"))
    words = framed_note.split()
    return {"note": words}

# Apply into_list to every row and drop the columns that are not used afterwards.
# NOTE(review): this rebinds `clinical_dataset`, so re-running this cell without
# reloading the data would pass already-split notes back into into_list.
clinical_dataset = clinical_dataset.map(into_list, remove_columns=["patient_id", "question", "answer", "task"])
# Display the resulting DatasetDict summary.
clinical_dataset

Loading cached processed dataset at /Users/serimkim/.cache/huggingface/datasets/starmpcc___csv/starmpcc--Asclepius-Synthetic-Clinical-Notes-819afb51239148d3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-06fc11057ad6cd70.arrow


DatasetDict({
    train: Dataset({
        features: ['note'],
        num_rows: 158114
    })
})

In [5]:
# Number of words per chunk when the notes are re-split.
chunk_size = 32


def split_texts(examples):
    """Concatenate a batch of word lists and cut the result into fixed-size chunks.

    Args:
        examples: a batch mapping each column name to a list of lists
            (one inner list per row).

    Returns:
        A dict with the same keys. Each value is the list of consecutive
        `chunk_size`-long slices of that column's concatenated rows; a trailing
        remainder shorter than `chunk_size` is dropped. A batch with no columns
        yields an empty dict.
    """
    # Nothing to chunk when the batch has no columns.
    if not examples:
        return {}
    # Flatten each column in linear time; `sum(lists, [])` would copy the
    # growing accumulator on every addition.
    concatenated_examples = {
        k: [item for row in examples[k] for item in row] for k in examples.keys()
    }
    # The total length is measured on the first column.
    total_length = len(concatenated_examples[next(iter(concatenated_examples))])
    # Round down to a multiple of `chunk_size`, discarding the short last chunk.
    total_length = (total_length // chunk_size) * chunk_size
    # Slice every column into `chunk_size`-long pieces.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    return result

# Re-chunk the word lists. With batched=True, split_texts receives a batch of rows,
# so rows are concatenated and the short remainder is dropped once per batch.
new_datasets = clinical_dataset.map(split_texts, batched=True)
# Display the resulting DatasetDict summary.
new_datasets

Loading cached processed dataset at /Users/serimkim/.cache/huggingface/datasets/starmpcc___csv/starmpcc--Asclepius-Synthetic-Clinical-Notes-819afb51239148d3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-62b2ff216bedaac8.arrow


DatasetDict({
    train: Dataset({
        features: ['note'],
        num_rows: 1380484
    })
})

# 토큰화

In [6]:
def tokenize_function(examples):
    """Tokenize one row's list of words and tag every sub-token with its word index.

    The list stored under `examples['note']` is handed to the tokenizer as a
    batch of separate texts, so each entry of the tokenizer output corresponds
    to one word of the row.

    Args:
        examples: a single row whose 'note' value is a list of word strings.

    Returns:
        The tokenizer output with an added 'word_ids' entry: one list per word,
        containing that word's position repeated once per sub-token.
    """
    result = stu_tokenizer(examples['note'], add_special_tokens=False)
    # Shape of the tokenizer output:
    # {'input_ids': [[ids_of_text_1], [ids_of_text_2], ...],
    #  'attention_mask': [[mask_of_text_1], [mask_of_text_2], ...]}
    # Take the word count from the tokenizer output rather than a hard-coded 32,
    # so rows of any length get exactly one `word_ids` entry per word.
    result['word_ids'] = [
        [word_index] * len(token_ids)
        for word_index, token_ids in enumerate(result['input_ids'])
    ]
    return result

# map() is called without batched=True here, so tokenize_function receives one row
# at a time (the previous comment said batched=True was specified, but the call
# does not pass it).
tokenized_dataset = new_datasets.map(
    tokenize_function
)
# Display the resulting DatasetDict summary.
tokenized_dataset

Map:   0%|          | 0/1380484 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['note', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 1380484
    })
})

In [None]:
# Inspect the first tokenized row of the train split.
tokenized_dataset['train'][0]

In [7]:
def concat_texts(tokenized_dataset):
    """Flatten the per-word token lists of one row into single flat lists.

    The row is modified in place: 'input_ids', 'token_type_ids',
    'attention_mask' and 'word_ids' are each replaced by the concatenation of
    their sub-lists, and 'labels' is set to the flattened 'input_ids' list
    (the same list object, not a copy).

    Args:
        tokenized_dataset: a single row holding list-of-lists values under the
            four keys named above.

    Returns:
        The same mapping that was passed in, after modification.
    """
    for column in ('input_ids', 'token_type_ids', 'attention_mask', 'word_ids'):
        flattened = []
        for per_word in tokenized_dataset[column]:
            flattened.extend(per_word)
        tokenized_dataset[column] = flattened
    # Labels share the flattened input ids.
    tokenized_dataset['labels'] = tokenized_dataset['input_ids']
    return tokenized_dataset

# Flatten every row with concat_texts and drop the raw "note" column.
concated_dataset = tokenized_dataset.map(
    concat_texts, remove_columns=["note"]
)
# Display the resulting DatasetDict summary.
concated_dataset

Map:   0%|          | 0/1380484 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1380484
    })
})

In [8]:
# Inspect the first flattened row of the train split (displays the row's full lists).
concated_dataset['train'][0]

{'input_ids': [2,
  644,
  412,
  383,
  375,
  399,
  389,
  687,
  448,
  423,
  375,
  458,
  249,
  681,
  377,
  405,
  393,
  249,
  618,
  47,
  458,
  389,
  375,
  47,
  428,
  388,
  517,
  423,
  371,
  389,
  517,
  455,
  405,
  444,
  517,
  423,
  427,
  388,
  394,
  377,
  389,
  517,
  267,
  278,
  343,
  517,
  398,
  439,
  423,
  638,
  329,
  357,
  296,
  278,
  47,
  116,
  652,
  427,
  440,
  432,
  413,
  371,
  638,
  427,
  450,
  440,
  389,
  249,
  517,
  355,
  517,
  432,
  377,
  405,
  393,
  517,
  455,
  376,
  517,
  370,
  423,
  413,
  442,
  390,
  709,
  708,
  517,
  403,
  440,
  432,
  413,
  371,
  517,
  455,
  405,
  444,
  517,
  440,
  458,
  423,
  432,
  442,
  427,
  423,
  440,
  707,
  517,
  398,
  389,
  454,
  46,
  517,
  388,
  435,
  458,
  517,
  385,
  446,
  399,
  401,
  46,
  704,
  517,
  388,
  458,
  440,
  432,
  425,
  389,
  367,
  54,
  644,
  450,
  410,
  517,
  432,
  401,
  458,
  440,
  406,
  371,
  708,
 

# 데이터셋 저장

In [9]:
# Persist the processed dataset to disk.
# NOTE(review): the "32" suffix in the path presumably mirrors the chunk_size
# value set earlier in the notebook -- keep the two in sync when changing either.
concated_dataset.save_to_disk("./datasets_32")  # change the save path name as needed

Saving the dataset (0/17 shards):   0%|          | 0/1380484 [00:00<?, ? examples/s]