# Preprocessing for Korean/English Medical Term Embedding

In [1]:
import random
import itertools
import pandas as pd
from tqdm.auto import tqdm

### (1) KOSTOM Preprocessing

In [2]:
# KOSMOTM 데이터를 불러들인다. 
with open('kostom.txt', 'r', encoding='cp949') as f:
    file = f.readlines()

In [3]:
print('Data Size: ', len(file))
print('Data Sample: ', file[1])

Data Size:  339200
Data Sample:  H00000018	H00000018	A Ab:Pr:Pt:Ser/Plas:Ord	A 항체:존재:검사시점:혈청/혈장:순위척도	C0482065			817-7					



In [4]:
# 영어 데이터와 한국어 데이터를 구분하여 리스트에 담는다. 
cleaned_kostom_ENG, cleaned_kostom_KOR, cleaned_kostom_none = [], [], []

for i in tqdm(file[1:]):  # file[0] is a header.
    lst = i.rstrip("\n").split("\t")
    cui = lst[4]
    eng = lst[2].replace('\"', '')
    kor = lst[3].replace('\"', '')
        
    if cui == '': # umls코드가 부여되지 않은 경우 별도의 리스트에 담는다.
        row = eng.lower() + "||" + kor
        cleaned_kostom_none.append(row)
        continue

    row_eng = cui + "||" + eng.lower()
    row_kor = cui + "||" + kor
    
    cleaned_kostom_ENG.append(row_eng) 
    cleaned_kostom_KOR.append(row_kor)
    
print('ENG: ', len(cleaned_kostom_ENG))
print('KOR: ', len(cleaned_kostom_KOR))
print('NO_CODE: ', len(cleaned_kostom_none))

  0%|          | 0/339199 [00:00<?, ?it/s]

ENG:  207835
KOR:  207835
NO_CODE:  131364


In [5]:
# 중복 데이터를 제거한다. 
cleaned_dedup_kostom_ENG = list(set(cleaned_kostom_ENG))
cleaned_dedup_kostom_KOR = list(set(cleaned_kostom_KOR))
cleaned_dedup_kostom_none = list(set(cleaned_kostom_none))

print(len(cleaned_dedup_kostom_ENG))
print(len(cleaned_dedup_kostom_KOR))
print(len(cleaned_dedup_kostom_none))

173267
166566
131308


In [6]:
print(cleaned_dedup_kostom_ENG[0])
print(cleaned_dedup_kostom_KOR[0])
print(cleaned_dedup_kostom_none[0])

C0263151||cellulitis of perineum
C0546910||분지
spleen and stomach dampness-heat pattern||비위습열증


### (2) Positive Pairs Generation

In [7]:
umls_ENG_KOR = cleaned_dedup_kostom_ENG + cleaned_dedup_kostom_KOR

In [8]:
# cui를 기준으로 딕셔너리를 생성한다.
umls_dict = {}
for line in tqdm(umls_ENG_KOR):
    cui, name = line.split("||")
    if cui in umls_dict:
        umls_dict[cui].append(name)
    else:
        umls_dict[cui] = [name]

  0%|          | 0/339833 [00:00<?, ?it/s]

In [9]:
def gen_pairs(input_list):
    return list(itertools.combinations(input_list, r=2))

In [10]:
# 동일한 cui의 병명을 pari로 구성한다.
random.seed(42)

pos_pairs = []
for k,v in tqdm(umls_dict.items()):
    pairs = gen_pairs(v)
    if len(pairs) > 50: # if >50 pairs, then trim to 50 pairs
        pairs = random.sample(pairs, 50)
    for p in pairs:
        p_0 = p[0].replace(',', '').replace('\'', '').replace('\"', '')
        p_1 = p[1].replace(',', '').replace('\'', '').replace('\"', '')
        # additional preprocessing
        keywords = ['n/a', 'nan', 'null', '']
        if p_0 not in keywords and p_1 not in keywords:
            line = [p_0, p_1, str(k)]
            pos_pairs.append(line)

  0%|          | 0/117830 [00:00<?, ?it/s]

In [11]:
print(len(pos_pairs))

421914


In [12]:
pos_pairs[0]

['cellulitis of perineum', '회음의 연조직염', 'C0263151']

### (3) Save File

In [13]:
# Dataframe으로 변환 후 csv 파일로 저장.
random.shuffle(pos_pairs)
df = pd.DataFrame(pos_pairs, columns=['sent0', 'sent1', 'label'])
df.head()

Unnamed: 0,sent0,sent1,label
0,도약맥박,센맥박,C0425574
1,요관 결찰 제거,요관 묶기 제거,C0177019
2,synotia,합이증,C0266677
3,novolin:mcnc:pt:ser/plas:qn,velosulin:mcnc:pt:ser/plas:qn,C0800147
4,피막절개(술),수정체낭절개(술),C3701588


In [14]:
df.to_csv('kostom_pair.csv', sep='\t', index=False)