In [1]:
import datasets
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForMaskedLM, AdamW, get_scheduler

import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm
from transformers import pipeline
import random


In [175]:
ds = datasets.load_from_disk('datasets_by_teacher_1m')

In [10]:
import pandas as pd
df = pd.read_csv('./ds/dataset_0.3m.csv')
df

Unnamed: 0,new_text,mask_idx,score
0,[CLS] Discharge Summary: Patient: 60-year-old ...,9,0.378695
1,[CLS] Discharge Summary: Patient: 60-year-old ...,9,0.252060
2,COVID-19 Hospital Course: The patient was admi...,6,0.825758
3,COVID-19 Hospital Course: The patient was take...,6,0.082363
4,"with symptoms of fever, with cough, and dyspne...",4,0.157553
...,...,...,...
599815,artery from an assistant port. The thrombus wa...,2,0.027182
599816,tightly adhered to inferior vena cava wall. It...,3,0.472514
599817,tightly adhered to superior vena cava wall. It...,3,0.353571
599818,dissected free from the IVC wall with local sp...,6,0.418808


# text를 길이(단어 기준) 10으로 자르기

In [2]:
# fine-tuning용 데이터 로드
clinical_dataset = datasets.load_dataset("starmpcc/Asclepius-Synthetic-Clinical-Notes")
clinical_dataset

Found cached dataset csv (/Users/serimkim/.cache/huggingface/datasets/starmpcc___csv/starmpcc--Asclepius-Synthetic-Clinical-Notes-819afb51239148d3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'note', 'question', 'answer', 'task'],
        num_rows: 158114
    })
})

In [3]:
clinical_dataset['train'][0]['note']

"Discharge Summary:\n\nPatient: 60-year-old male with moderate ARDS from COVID-19\n\nHospital Course:\n\nThe patient was admitted to the hospital with symptoms of fever, dry cough, and dyspnea. During physical therapy on the acute ward, the patient experienced coughing attacks that induced oxygen desaturation and dyspnea with any change of position or deep breathing. To avoid rapid deterioration and respiratory failure, a step-by-step approach was used for position changes. The breathing exercises were adapted to avoid prolonged coughing and oxygen desaturation, and with close monitoring, the patient managed to perform strength and walking exercises at a low level. Exercise progression was low initially but increased daily until hospital discharge to a rehabilitation clinic on day 10.\n\nClinical Outcome:\n\nThe patient was discharged on day 10 to a rehabilitation clinic making satisfactory progress with all symptoms resolved.\n\nFollow-up:\n\nThe patient will receive follow-up care at

In [88]:
# 문장 앞 뒤 토큰 추가 후, 텍스트 리스트화
def into_list(example):
  return {"note": ("[CLS] "+example["note"]+" [SEP]").split()}

clinical_dataset = clinical_dataset.map(into_list, remove_columns=["patient_id", "question", "answer", "task"])
clinical_dataset

Loading cached processed dataset at /Users/serimkim/.cache/huggingface/datasets/starmpcc___csv/starmpcc--Asclepius-Synthetic-Clinical-Notes-819afb51239148d3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-06fc11057ad6cd70.arrow


DatasetDict({
    train: Dataset({
        features: ['note'],
        num_rows: 158114
    })
})

In [89]:
# 길이 10으로 재구성
chunk_size = 10
def split_texts(examples):
    # 모든 텍스트들을 결합한다.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # 결합된 텍스트들에 대한 길이를 구한다.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # `chunk_size`보다 작은 경우 마지막 청크를 삭제
    total_length = (total_length // chunk_size) * chunk_size
    # max_len 길이를 가지는 chunk 단위로 슬라이스
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    return result

datasets_len10 = clinical_dataset.map(split_texts, batched=True)
datasets_len10

Loading cached processed dataset at /Users/serimkim/.cache/huggingface/datasets/starmpcc___csv/starmpcc--Asclepius-Synthetic-Clinical-Notes-819afb51239148d3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-3eb4b2f90b04ca8c.arrow


DatasetDict({
    train: Dataset({
        features: ['note'],
        num_rows: 4417713
    })
})

In [None]:
ds

In [137]:
len(datasets_len10['train'])

4417713

# 단어 자체가 토큰인 것 masking

In [90]:
# teacher 모델 및 토크나이저 불러오기
teacher_checkpoint = "medicalai/ClinicalBERT"
t_model = AutoModelForMaskedLM.from_pretrained(teacher_checkpoint)
t_tokenizer = AutoTokenizer.from_pretrained(teacher_checkpoint)
t_pipeline = pipeline(task='fill-mask', model=t_model, tokenizer=t_tokenizer)

KeyboardInterrupt: 

In [None]:
ex = datasets_len10['train'][0]
ex

In [None]:
result = t_tokenizer(ex['note'], add_special_tokens=False)
result

In [None]:
def masking(examples):
    examples['input_ids'] = t_tokenizer(examples['note'], add_special_tokens=False)['input_ids']
    examples['word_token_idx'] = [idx for idx, tokens in enumerate(examples['input_ids']) if len(tokens)==1]
    examples['mask_idx'] = random.choice(examples['word_token_idx']) if len(examples['word_token_idx'])>0 else -1
    if examples['mask_idx'] != -1:
        examples['note'][examples['mask_idx']] = '[MASK]'
    examples['text'] = ' '.join(examples['note'])
    return examples

In [None]:
input_data = [[11170, 32194, 118, 10270], [18141], [15348, 131], [10105], [38607], [10134], [40345], [10114], [10105], [18141]]
indices_of_len_one = [idx for idx, tokens in enumerate(input_data) if len(tokens)==1]

print(indices_of_len_one)

In [None]:
ex['note'][3]

In [None]:
sample = datasets_len10['train'].select(list(range(3)))
sample.map(masking)

In [None]:
processed = sample.map(masking)

In [None]:
processed[0]

In [None]:
datasets_len10_mask = datasets_len10['train'].map(masking)

# 데이터셋 저장

In [None]:
datasets_len10_mask.save_to_disk("./datasets_len10_mask")  # 저장 경로명 변경

In [4]:
ds = datasets.load_from_disk('datasets_len10_mask')

In [9]:
text = 'COVID-19 Hospital Course: The patient was admitted to the hospital'
text

'COVID-19 Hospital Course: The patient was admitted to the hospital'

In [6]:
ds[1]

{'note': ['COVID-19',
  'Hospital',
  'Course:',
  'The',
  'patient',
  'was',
  '[MASK]',
  'to',
  'the',
  'hospital'],
 'input_ids': [[11170, 32194, 118, 10270],
  [18141],
  [15348, 131],
  [10105],
  [38607],
  [10134],
  [40345],
  [10114],
  [10105],
  [18141]],
 'word_token_idx': [1, 3, 4, 5, 6, 7, 8, 9],
 'mask_idx': 6,
 'text': 'COVID-19 Hospital Course: The patient was [MASK] to the hospital'}

# teacher 모델의 답변으로 데이터셋 만들기

### 1) pipeline 통한 답변생성 -> 느림

In [None]:
def make_answer(examples):
    if examples['mask_idx'] != -1:
        answers = t_pipeline(examples['text'])
        for ans in answers:
            ans['mask_idx'] = examples['mask_idx']
        examples['answers'] = answers
    return examples

In [None]:
datasets = processed.map(make_answer)

## 2) model 통한 답변생성

In [177]:
dataset = ds.to_pandas()
dataset

Unnamed: 0,mask_idx,text
0,9,[CLS] Discharge Summary: Patient: 60-year-old ...
1,6,COVID-19 Hospital Course: The patient was [MAS...
2,4,"with symptoms of fever, [MASK] cough, and dysp..."
3,3,"therapy on the [MASK] ward, the patient experi..."
4,8,that induced oxygen desaturation and dyspnea w...
...,...,...
999995,0,[MASK] of Admission: Date of Discharge: Hospit...
999996,5,was admitted with a cellulitis-like [MASK] on ...
999997,7,"thigh that rapidly extended, resulting in seve..."
999998,4,the lower right leg. [MASK] Doppler ultrasound...


In [186]:
import pandas as pd

In [241]:
import time

start_time = time.time()
for i in range(100):
    if ds.loc[i,'mask_idx'] != -1:
        # MASK 위치 인덱스 가져오기
        tokenized = t_tokenizer(ds.loc[i,'text'], return_tensors='pt', add_special_tokens=False)
        mask = torch.where(tokenized['input_ids'][0] == 103)

        # top5 예측의 score, index
        top5 = torch.topk(torch.softmax(t_model(**tokenized).logits[0][mask], dim=1), 5)
        scores = top5.values[0].detach().cpu().numpy()
        indices = top5.indices[0]

        # top5 token decode
        answers = t_tokenizer.convert_ids_to_tokens(indices)

        for j in range(5):
            s_col = 'score'+str(j)
            a_col = 'answer'+str(j)
            ds.loc[i,s_col] = scores[j]
            ds.loc[i,a_col] = answers[j]

    if i%1000 == 0:
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds for {i} rows")

        if i%10000 == 0:
            # GPU 캐시 비우기
            torch.cuda.empty_cache()

        # 중간저장
        if i%200000==0 and i>0:
            file = 'datasets_by_teacher_'+str(i+1000000)+'.csv'
            dataset.to_csv(file, index=False)

# 데이터셋 저장
dataset.to_csv('datasets_by_teacher_1000000.csv', index=False)

Execution time: 0.05159282684326172 seconds for 0 rows


In [19]:
dataset = pd.read_csv('datasets_by_teacher_1000000.csv')
dataset[1:2]

  dataset = pd.read_csv('datasets_by_teacher_1000000.csv')


Unnamed: 0,mask_idx,text,scores,score0,answer0,score1,answer1,score2,answer2,score3,answer3,score4,answer4
1,6,COVID-19 Hospital Course: The patient was [MAS...,,0.825775,admitted,0.082366,taken,0.018949,brought,0.017375,transferred,0.006739,sent


In [243]:
print(dataset[:10])

   mask_idx                                               text  scores  \
0         9  [CLS] Discharge Summary: Patient: 60-year-old ...     NaN   
1         6  COVID-19 Hospital Course: The patient was [MAS...     NaN   
2         4  with symptoms of fever, [MASK] cough, and dysp...     NaN   
3         3  therapy on the [MASK] ward, the patient experi...     NaN   
4         8  that induced oxygen desaturation and dyspnea w...     NaN   
5         8  position or deep breathing. To avoid rapid det...     NaN   
6         6  failure, a step-by-step approach was used [MAS...     NaN   
7         4  breathing exercises were adapted [MASK] avoid ...     NaN   
8         2  desaturation, and [MASK] close monitoring, the...     NaN   
9         6  strength and walking exercises at a [MASK] lev...     NaN   

     score0   answer0    score1     answer1    score2    answer2    score3  \
0  0.378700       and  0.252071           .  0.190173          ,  0.028065   
1  0.825775  admitted  0.0823

In [132]:
def make_answers(examples):
    if examples['mask_idx'] != -1:
        # MASK 위치 인덱스 가져오기
        mask = torch.where(t_tokenizer(examples['text'], return_tensors='pt', add_special_tokens=False)['input_ids'][0] == 103)

        # top5 예측의 score, index
        top5 = torch.topk(torch.softmax(t_model(**t_tokenizer(examples['text'], return_tensors='pt', add_special_tokens=False)).logits[0][mask], dim=1), 3)
        examples['scores'] = top5.values[0]
        indices = top5.indices[0]

        # top3 token decode
        examples['answers'] = t_tokenizer.convert_ids_to_tokens(indices)

    else:
        examples['scores'] = []
        examples['answers'] = []
    return examples

In [119]:
# 불필요한 컬럼 삭제
datasets_mask = datasets_len10_mask.remove_columns(['note', 'input_ids', 'word_token_idx'])

In [133]:
# 데이터셋 생성
datasets_by_teacher = datasets_mask.map(make_answers, num_proc=4)

Map (num_proc=4):   0%|          | 0/4417713 [00:00<?, ? examples/s]

TimeoutError: 

In [125]:
# 데이터셋 저장
datasets_mask.save_to_disk("./datasets_for_teacher")  # 저장 경로명 변경

Saving the dataset (0/1 shards):   0%|          | 0/4417713 [00:00<?, ? examples/s]

# df->dataset 재구성

In [248]:
df = pd.read_csv('datasets_by_teacher_1m.csv')
df

Unnamed: 0,mask_idx,text,score0,answer0,score1,answer1,score2,answer2,score3,answer3,score4,answer4
0,9,[CLS] Discharge Summary: Patient: 60-year-old ...,0.378695,and,0.252060,.,0.190167,",",0.028064,on,0.024408,with
1,6,COVID-19 Hospital Course: The patient was [MAS...,0.825758,admitted,0.082363,taken,0.018948,brought,0.017375,transferred,0.006738,sent
2,4,"with symptoms of fever, [MASK] cough, and dysp...",0.157553,with,0.156345,",",0.080044,or,0.052322,a,0.034958,and
3,3,"therapy on the [MASK] ward, the patient experi...",0.147441,left,0.056515,second,0.043400,right,0.035449,ward,0.034946,first
4,8,that induced oxygen desaturation and dyspnea w...,0.326227,evidence,0.306204,signs,0.076941,symptoms,0.042288,history,0.036518,episodes
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,[MASK] of Admission: Date of Discharge: Hospit...,0.790553,.,0.019301,:,0.014404,the,0.004306,",",0.004152,was
999996,5,was admitted with a cellulitis-like [MASK] on ...,0.275740,mass,0.257761,infection,0.033955,plaque,0.030711,pressure,0.027997,area
999997,7,"thigh that rapidly extended, resulting in seve...",0.143695,back,0.115882,chest,0.093451,severe,0.064571,chronic,0.057187,leg
999998,4,the lower right leg. [MASK] Doppler ultrasound...,0.114812,:,0.099318,leg,0.095888,le,0.080828,right,0.077498,rue


In [258]:
columns = ['new_text', 'mask_idx', 'score']  # 열 이름 리스트
new_df = pd.DataFrame(columns=columns)

start_time = time.time()
for i in range(len(df)):
    if df.loc[i, 'mask_idx'] != -1 :
        # 새로운 행 추가 - append() 메서드 사용
        new_data= pd.DataFrame({'new_text': [df.loc[i, 'text'].replace('[MASK]', str(df.loc[i, 'answer0'])),
                                               df.loc[i, 'text'].replace('[MASK]', str(df.loc[i, 'answer1']))],
                    'mask_idx': [df.loc[i, 'mask_idx'],
                                 df.loc[i, 'mask_idx']],
                     'score': [df.loc[i, 'score0'],
                               df.loc[i, 'score1']]})
        new_df = pd.concat([new_df, new_data])
    else:
        data = pd.DataFrame({'new_text': [df.loc[i, 'text']],
                    'mask_idx': [-1],
                     'score': [1]})
        new_df = pd.concat([new_df, data])
    if i%100000 == 0:
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds for {i} rows")
new_df

Execution time: 0.001013040542602539 seconds for 0 rows
Execution time: 118.27883791923523 seconds for 100000 rows
Execution time: 525.9422867298126 seconds for 200000 rows
Execution time: 1243.1712789535522 seconds for 300000 rows


KeyboardInterrupt: 

In [None]:
# 데이터셋 저장
new_df.to_csv('dataset_1m.csv', index=False)

# 2번 set

In [3]:
import pandas as pd
import time
df = pd.read_csv('./ds/datasets_by_teacher_4.4m.csv')
df

Unnamed: 0,mask_idx,text,score0,answer0,score1,answer1,score2,answer2,score3,answer3,score4,answer4
0,3,and drooping of [MASK] right eyelid for 15 day...,0.954090,the,0.013937,his,0.005939,a,0.005516,her,0.004411,your
1,3,"patient did not [MASK] significant headache, v...",0.265444,have,0.232882,report,0.209363,show,0.087093,feel,0.017529,describe
2,1,Diagnosis [MASK] Treatment: A neuro-ophthalmol...,0.175667,and,0.088278,.,0.078149,:,0.058106,received,0.045106,the
3,4,a complete palpebral ptosis [MASK] exotropy in...,0.216419,or,0.202470,with,0.111909,and,0.080687,.,0.042019,:
4,1,"and [MASK] severe limitation of supraductions,...",0.269346,:,0.106502,with,0.047375,.,0.044775,to,0.035465,-
...,...,...,...,...,...,...,...,...,...,...,...,...
417708,7,developed a recurrence. Instructions: No speci...,0.825241,were,0.057743,was,0.027482,being,0.025993,.,0.016457,:
417709,0,[MASK] patient at the time of discharge. Follo...,0.613106,.,0.039369,:,0.010836,the,0.010803,",",0.010693,。
417710,0,[MASK] was not scheduled for any follow-up app...,0.759907,.,0.014353,:,0.006382,",",0.006075,。,0.003577,;
417711,6,patient was advised to consult his [MASK] care...,0.982037,primary,0.003331,health,0.000982,next,0.000766,third,0.000681,first


In [4]:
columns = ['new_text', 'mask_idx', 'score']  # 열 이름 리스트
new_df2 = pd.DataFrame(columns=columns)
l = [200000]
start_time = time.time()

for i in range(len(df)):
    if i%50000 == 0:
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds for {i} rows")
        if i in l:
            file = 'dataset_4.2m.csv'
            new_df2.to_csv(file, index=False)
            columns = ['new_text', 'mask_idx', 'score']  # 열 이름 리스트
            new_df2 = pd.DataFrame(columns=columns)
    if df.loc[i, 'mask_idx'] != -1 :
        # 새로운 행 추가 - append() 메서드 사용
        new_data= pd.DataFrame({'new_text': [df.loc[i, 'text'].replace('[MASK]', str(df.loc[i, 'answer0'])),
                                               df.loc[i, 'text'].replace('[MASK]', str(df.loc[i, 'answer1']))],
                    'mask_idx': [df.loc[i, 'mask_idx'],
                                 df.loc[i, 'mask_idx']],
                     'score': [df.loc[i, 'score0'],
                               df.loc[i, 'score1']]})
        new_df2 = pd.concat([new_df2, new_data])
    else:
        data = pd.DataFrame({'new_text': [df.loc[i, 'text']],
                    'mask_idx': [-1],
                     'score': [1]})
        new_df2 = pd.concat([new_df2, data])
new_df2
# 데이터셋 저장
new_df2.to_csv('dataset_4.4m.csv', index=False)

Execution time: 0.00019407272338867188 seconds for 0 rows
Execution time: 29.54005002975464 seconds for 50000 rows
Execution time: 128.19266510009766 seconds for 100000 rows
Execution time: 312.4938871860504 seconds for 150000 rows
Execution time: 581.35422706604 seconds for 200000 rows
Execution time: 613.023521900177 seconds for 250000 rows
Execution time: 716.5208170413971 seconds for 300000 rows
Execution time: 907.316260099411 seconds for 350000 rows
Execution time: 1175.179306268692 seconds for 400000 rows


# 3번 셋

In [None]:
df = pd.read_csv('datasets_by_teacher_3m.csv')

In [None]:

columns = ['new_text', 'mask_idx', 'score']  # 열 이름 리스트
new_df3 = pd.DataFrame(columns=columns)

start_time = time.time()
for i in range(len(df)):
    if df.loc[i, 'mask_idx'] != -1 :
        # 새로운 행 추가 - append() 메서드 사용
        new_data= pd.DataFrame({'new_text': [df.loc[i, 'text'].replace('[MASK]', str(df.loc[i, 'answer0'])),
                                               df.loc[i, 'text'].replace('[MASK]', str(df.loc[i, 'answer1']))],
                    'mask_idx': [df.loc[i, 'mask_idx'],
                                 df.loc[i, 'mask_idx']],
                     'score': [df.loc[i, 'score0'],
                               df.loc[i, 'score1']]})
        new_df3 = pd.concat([new_df3, new_data])
    else:
        data = pd.DataFrame({'new_text': [df.loc[i, 'text']],
                    'mask_idx': [-1],
                     'score': [1]})
        new_df3 = pd.concat([new_df3, data])
new_df3
# 데이터셋 저장
new_df3.to_csv('dataset_3m.csv', index=False)

# 4번 set

In [None]:
df = pd.read_csv('datasets_by_teacher_4m.csv')

In [None]:

columns = ['new_text', 'mask_idx', 'score']  # 열 이름 리스트
new_df4 = pd.DataFrame(columns=columns)

start_time = time.time()
for i in range(len(df)):
    if df.loc[i, 'mask_idx'] != -1 :
        # 새로운 행 추가 - append() 메서드 사용
        new_data= pd.DataFrame({'new_text': [df.loc[i, 'text'].replace('[MASK]', str(df.loc[i, 'answer0'])),
                                               df.loc[i, 'text'].replace('[MASK]', str(df.loc[i, 'answer1']))],
                    'mask_idx': [df.loc[i, 'mask_idx'],
                                 df.loc[i, 'mask_idx']],
                     'score': [df.loc[i, 'score0'],
                               df.loc[i, 'score1']]})
        new_df4 = pd.concat([new_df4, new_data])
    else:
        data = pd.DataFrame({'new_text': [df.loc[i, 'text']],
                    'mask_idx': [-1],
                     'score': [1]})
        new_df4 = pd.concat([new_df4, data])
new_df4
# 데이터셋 저장
new_df4.to_csv('dataset_4m.csv', index=False)