## ES Check

In [1]:
import os

from elasticsearch import Elasticsearch

ES_SERVER_HOST = os.getenv("ES_SERVER_HOST", "3.35.110.161")
ES_SERVER_PORT = int(os.getenv("ES_SERVER_PORT", "9200"))
ES_SERVER_USERNAME = os.getenv("ES_SERVER_USERNAME", "elastic")
ES_SERVER_PASSWORD = os.getenv("ES_SERVER_PASSWORD", "snomed")


es_client = Elasticsearch(
    f"http://{ES_SERVER_HOST}:{ES_SERVER_PORT}",
    basic_auth=(ES_SERVER_USERNAME, ES_SERVER_PASSWORD),
    request_timeout=60,
)

In [2]:
info = es_client.info()
print("Cluster name:", info.get("cluster_name"))
print("Version:", info.get("version", {}).get("number"))

OMOP_INDEX_NAMES = ["concept-small", "concept-relationship", "concept-synonym"]
indices = [name for name in OMOP_INDEX_NAMES if es_client.indices.exists(index=name)]
print(f"\nOMOP 인덱스 수: {len(indices)}")
print("인덱스:", indices)

Cluster name: docker-cluster
Version: 9.0.0

OMOP 인덱스 수: 3
인덱스: ['concept-small', 'concept-relationship', 'concept-synonym']


In [3]:
import pandas as pd

OMOP_INDEX_NAMES = ["concept-small", "concept-relationship", "concept-synonym"]
rows = []
for index_name in OMOP_INDEX_NAMES:
    try:
        r = es_client.cat.indices(
            index=index_name,
            format="json",
            s="index",
            bytes="mb",
        )
        rows.extend(r)
    except Exception:
        rows.append({
            "index": index_name,
            "pri": "-",
            "rep": "-",
            "docs.count": "(429)",
            "store.size": "(circuit_breaker)",
        })
if not rows:
    df = pd.DataFrame(columns=["index_name", "shards", "replicas", "docs", "store_mb"])
else:
    df = pd.DataFrame(rows)[["index", "pri", "rep", "docs.count", "store.size"]]
    df.rename(
        columns={
            "index": "index_name",
            "pri": "shards",
            "rep": "replicas",
            "docs.count": "docs",
            "store.size": "store_mb",
        },
        inplace=True,
    )
df

Unnamed: 0,index_name,shards,replicas,docs,store_mb
0,concept-small,1,0,2049978,7996
1,concept-relationship,1,0,0,0
2,concept-synonym,1,0,460999,67


## CONCEPT

In [1]:
import pandas as pd

df = pd.read_csv("/Users/rose/omop-mapper/CONCEPT.csv", sep="\t")
df.head()

  df = pd.read_csv("/Users/rose/omop-mapper/CONCEPT.csv", sep="\t")


Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,45756805,Pediatric Cardiology,Provider,ABMS,Physician Specialty,S,OMOP4821938,19700101,20991231,
1,45756804,Pediatric Anesthesiology,Provider,ABMS,Physician Specialty,S,OMOP4821939,19700101,20991231,
2,45756803,Pathology-Anatomic / Pathology-Clinical,Provider,ABMS,Physician Specialty,S,OMOP4821940,19700101,20991231,
3,45756802,Pathology - Pediatric,Provider,ABMS,Physician Specialty,S,OMOP4821941,19700101,20991231,
4,45756801,Pathology - Molecular Genetic,Provider,ABMS,Physician Specialty,S,OMOP4821942,19700101,20991231,


In [2]:
df['standard_concept'].unique()

array(['S', nan, 'C'], dtype=object)

In [3]:
df['standard_concept'].isna().sum()

5442908

In [5]:
df[df['concept_id'] == 4314450]

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
7745376,4314450,Decompression,Procedure,SNOMED,Procedure,S,424128006,20070131,20991231,


In [4]:
len(df)

13436329

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9922358 entries, 0 to 9922357
Data columns (total 10 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   concept_id        int64 
 1   concept_name      object
 2   domain_id         object
 3   vocabulary_id     object
 4   concept_class_id  object
 5   standard_concept  object
 6   concept_code      object
 7   valid_start_date  int64 
 8   valid_end_date    int64 
 9   invalid_reason    object
dtypes: int64(3), object(7)
memory usage: 757.0+ MB


In [7]:
df[df["concept_id"] == 1177353]

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
1861536,1177353,Posterior openbite,Meas Value,LOINC,Answer,S,LA29512-3,19700101,20991231,


In [4]:
len(df)

9922358

In [12]:
df[df["concept_name"] == "Mineralocorticoid Receptor Antagonists"]

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
2283216,19388264,Mineralocorticoid Receptor Antagonists,Drug,MeSH,Main Heading,,D000451,20230501,20991231,
3704223,4324562,Mineralocorticoid Receptor Antagonists,Drug,NDFRT,Mechanism of Action,,N0000000139,19700101,20991231,


In [7]:
df[df["concept_name"] == "April 1, 2025"]

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason


In [6]:
# 도메인별 데이터 수 계산
domain_counts = df["domain_id"].value_counts()

# 결과 출력
print("도메인별 데이터 수:")
for domain, count in domain_counts.items():
    print(f"{domain}: {count:,}개")

도메인별 데이터 수:
Drug: 4,772,484개
Observation: 376,134개
Condition: 268,040개
Device: 237,173개
Geography: 204,037개
Measurement: 190,196개
Procedure: 101,081개
Spec Anatomic Site: 41,129개
Meas Value: 25,681개
Metadata: 4,820개
Note: 4,153개
Type Concept: 3,807개
Unit: 2,490개
Specimen: 2,096개
Provider: 1,562개
Language: 879개
Visit: 718개
Revenue Code: 545개
Race: 519개
Relationship: 419개
Route: 218개
Currency: 180개
Payer: 168개
Cost: 51개
Condition Status: 22개
Episode: 18개
Gender: 15개
Plan Stop Reason: 13개
Plan: 11개
Condition/Meas: 8개
Meas Value Operator: 7개
Sponsor: 6개
Ethnicity: 2개
Condition/Device: 1개


In [4]:
from elasticsearch import Elasticsearch

# --- Elasticsearch 연결 정보 수정 ---
ES_HOST = "http://3.35.110.161:9200"
ES_USER = "elastic"
ES_PASSWORD = "snomed"
# ------------------------------------

try:
    # Elasticsearch 클라이언트 생성
    es = Elasticsearch([ES_HOST], basic_auth=(ES_USER, ES_PASSWORD), request_timeout=30)

    # 클러스터의 모든 노드 정보 가져오기
    # 'settings'와 'roles' 정보만 필터링하여 요청
    nodes_info = es.nodes.info(metric=["settings", "roles"])

    print("--- Cluster Node Roles ---")

    ml_node_found = False
    for node_id, info in nodes_info["nodes"].items():
        node_name = info["name"]
        roles = info["roles"]

        print(f"Node Name: {node_name}")
        print(f"  - Roles: {roles}")

        if "ml" in roles:
            ml_node_found = True
            print("  - ✅ This node is an ML node.")
        else:
            print("  - ❌ This node is NOT an ML node.")
        print("-" * 20)

    if not ml_node_found:
        print("\n[CRITICAL] No ML nodes found in the cluster!")
        print(
            "Please edit 'elasticsearch.yml' on at least one node to add the 'ml' role and restart it."
        )
    else:
        print("\n[INFO] ML node(s) found in the cluster.")

except Exception as e:
    print(f"An error occurred: {e}")

--- Cluster Node Roles ---

[CRITICAL] No ML nodes found in the cluster!
Please edit 'elasticsearch.yml' on at least one node to add the 'ml' role and restart it.


In [26]:
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"
)
model = AutoModel.from_pretrained(
    "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"
).cuda()

# replace with your own list of entity names
all_names = [
    "atherosclerotic cardiovascular disease",
    "arteriosclerotic vascular disease",
    "coronary arteriosclerosis",
    "coronary atherosclerosis",
]

bs = 128  # batch size during inference
all_embs = []
for i in tqdm(np.arange(0, len(all_names), bs)):
    toks = tokenizer.batch_encode_plus(
        all_names[i : i + bs],
        padding="max_length",
        max_length=25,
        truncation=True,
        return_tensors="pt",
    )
    toks_cuda = {}
    for k, v in toks.items():
        toks_cuda[k] = v.cuda()
    cls_rep = model(**toks_cuda)[0][:, 0, :]  # use CLS representation as the embedding
    all_embs.append(cls_rep.cpu().detach().numpy())

all_embs = np.concatenate(all_embs, axis=0)

100%|██████████| 1/1 [00:00<00:00, 87.01it/s]


In [27]:
all_embs

array([[-0.05868842,  1.675866  , -0.80016625, ..., -0.31863979,
         1.2234459 ,  0.191829  ],
       [ 0.02161197,  1.1546385 , -0.94404846, ..., -0.3482951 ,
         0.79622304,  0.53705436],
       [-0.03819424,  0.9151199 , -0.35215873, ..., -0.11694469,
         1.6491382 ,  0.062584  ],
       [-0.02391003,  1.3203456 , -0.30787063, ..., -0.23993327,
         1.7730023 ,  0.00394338]], shape=(4, 768), dtype=float32)

In [28]:
from numpy import dot
from numpy.linalg import norm

base = all_embs[0]
other_embs = all_embs[1:]

cos_similarities = []
for emb in other_embs:
    sim = dot(base, emb) / (norm(base) * norm(emb))
    cos_similarities.append(sim)

cos_similarities  # 리스트 형태로 출력

[np.float32(0.79835176), np.float32(0.7449248), np.float32(0.86204785)]

In [16]:
def normalize_text_for_similarity(text: str) -> str:
    import re

    # 하이픈, 언더스코어를 공백으로 변환
    text = text.replace("-", " ").replace("_", " ")
    # 연속된 공백을 하나로 통합
    text = re.sub(r"\s+", " ", text)
    # 앞뒤 공백 제거
    text = text.strip()
    return text

In [17]:
def get_ngrams(text: str, n: int = 3) -> set:
    if len(text) < n:
        return {text}

    ngrams = set()
    for i in range(len(text) - n + 1):
        ngrams.add(text[i : i + n])

    return ngrams

In [18]:
def calculate_text_similarity(entity_name: str, concept_name: str) -> float:
    if not entity_name or not concept_name:
        return 0.0

    # 대소문자 정규화
    entity_name_lower = entity_name.lower()
    concept_name_lower = concept_name.lower()

    # 특수문자 정규화 (하이픈, 언더스코어를 공백으로)
    entity_name_normalized = normalize_text_for_similarity(entity_name_lower)
    concept_name_normalized = normalize_text_for_similarity(concept_name_lower)

    # N-gram 3으로 분할
    entity_ngrams = get_ngrams(entity_name_normalized, n=3)
    concept_ngrams = get_ngrams(concept_name_normalized, n=3)

    if not entity_ngrams or not concept_ngrams:
        return 0.0

    # concept_name과의 Jaccard 유사도 계산
    intersection = entity_ngrams.intersection(concept_ngrams)
    union = entity_ngrams.union(concept_ngrams)
    similarity = len(intersection) / len(union) if union else 0.0

    return similarity

In [None]:
calculate_text_similarity("acute coronary syndrome", "acute coronary syndromes")

0.9545454545454546

## CONCEPT_RELATIONSHIP

In [1]:
import pandas as pd

df = pd.read_csv("/Users/rose/omop-mapper/data/omop-cdm/CONCEPT_RELATIONSHIP.csv", sep="\t")
df

Unnamed: 0,concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
0,42073232,42073232,Mapped from,20190221,20991231,
1,42073232,42073232,Maps to,20190221,20991231,
2,42073236,42073236,Mapped from,20190221,20991231,
3,42073236,42073236,Maps to,20190221,20991231,
4,42073241,42073241,Mapped from,20190221,20991231,
...,...,...,...,...,...,...
54749163,42073208,42073208,Maps to,20190221,20991231,
54749164,42073211,42073211,Mapped from,20190221,20991231,
54749165,42073211,42073211,Maps to,20190221,20991231,
54749166,42073223,42073223,Mapped from,20190221,20991231,


In [2]:
target = [
    'Maps to',
    'Concept alt_to to',
    'Concept poss_eq to',
    'Concept same_as to',
    'Marketed form of',
    'Tradename of',
    'Box of',
    'Has quantified form'
]

df = df[df['relationship_id'].isin(target)]
print(len(df))

11462395


In [4]:
df.to_csv(
    "/Users/rose/omop-mapper/CONCEPT_RELATIONSHIP.csv", index=False
)

In [63]:
len(df[df['relationship_id'] == 'Has quantified form'])

367789

In [4]:
df[df["relationship_id"] == "Concept poss_eq to"]

Unnamed: 0,concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
10064,42538256,35615059,Concept poss_eq to,20190131,20991231,
11550,42575018,42573798,Concept poss_eq to,20181001,20991231,
310691,4002296,4272500,Concept poss_eq to,20210930,20991231,
310693,4007953,37152719,Concept poss_eq to,20220228,20991231,
310694,3571729,4232397,Concept poss_eq to,20230927,20991231,
...,...,...,...,...,...,...
54450884,40643366,4308141,Concept poss_eq to,20170731,20991231,
54450886,40646314,4030364,Concept poss_eq to,20170731,20991231,
54450902,40649995,201801,Concept poss_eq to,20170731,20991231,
54450905,40650869,4181356,Concept poss_eq to,20170731,20991231,


In [5]:
len(df)

54749168

## CONCEPT_SYNONYMS

In [47]:
import pandas as pd

df = pd.read_csv("/Users/rose/omop-mapper/data/omop-cdm/CONCEPT_SYNONYM.csv", sep="\t")
df.head()

Unnamed: 0,concept_id,concept_synonym_name,language_concept_id
0,3019950,CELL MARKERS; Immune globulin A; Immunoglobuli...,4180186
1,3019950,Lymphs IgA/lymph NFr Spec,4180186
2,3019951,Finding; Findings; nursing; Ordinal; Point in ...,4180186
3,3019955,pCO2 Bld Postductal TcCo2,4180186
4,3019955,Bld Postductal; Bld.foot; BldA.foot; Blood; Ca...,4180186


In [48]:
len(df)

3940029

In [25]:
df[df["concept_id"] == 40481919]

Unnamed: 0,concept_id,concept_synonym_name,language_concept_id
3177777,40481919,Atherosclerosis of coronary artery,4180186
3177778,40481919,Atherosclerosis of coronary artery (disorder),4180186
3177779,40481919,Atherosclerosis of native coronary artery,4180186


## SNOMED CT dataset

In [9]:
import pandas as pd

note = pd.read_csv(
    "/home/work/skku/hyo/MapOMOP/data/snomed-ct-entity-challenge/mimic-iv_notes_test_set.csv"
)
ann = pd.read_csv(
    "/home/work/skku/hyo/MapOMOP/data/snomed-ct-entity-challenge/test_annotations.csv"
)

In [10]:
note

Unnamed: 0,note_id,text
0,10043750-DS-6,\nName: ___ Unit No: ___...
1,10371195-DS-9,\nName: ___ Unit No: ___\n ...
2,10794068-DS-18,\nName: ___ Unit No: ___\...
3,10839721-DS-9,\nName: ___ Unit No: ...
4,10848570-DS-12,\nName: ___ Unit No: ___\n...
...,...,...
63,18454624-DS-16,\nName: ___ Unit No: ___\n \...
64,18570237-DS-10,\nName: ___ Unit No: ___\...
65,18752997-DS-9,\nName: ___ Unit No: ___\n...
66,18847983-DS-6,\nName: ___ Unit No: ___\n \n...


In [11]:
ann

Unnamed: 0,note_id,concept_id,start,end
0,10043750-DS-6,609328004,180,198
1,10043750-DS-6,419511003,201,223
2,10043750-DS-6,363406005,263,275
3,10043750-DS-6,359571009,321,336
4,10043750-DS-6,737492002,393,399
...,...,...,...,...
23229,19297319-DS-11,711133000,12046,12052
23230,19297319-DS-11,116865006,12057,12064
23231,19297319-DS-11,182809008,12157,12172
23232,19297319-DS-11,182837001,12190,12197


In [12]:
import os

import pandas as pd


def create_mapping_test_set():
    """
    SNOMED-CT Entity Challenge의 train/test 데이터를 모두 로드하고 병합하여
    매핑 모듈 테스트를 위한 최종 데이터프레임을 생성합니다.

    최종 컬럼: [note_id, note_text, entity_name, entity_code]
    """

    # 1. 파일 경로 정의 (MapOMOP 루트 기준)
    base_path = "/home/work/skku/hyo/MapOMOP/data/snomed-ct-entity-challenge"
    files = {
        "train_notes": os.path.join(base_path, "mimic-iv_notes_training_set.csv"),
        "train_annots": os.path.join(base_path, "train_annotations.csv"),
        "test_notes": os.path.join(base_path, "mimic-iv_notes_test_set.csv"),
        "test_annots": os.path.join(base_path, "test_annotations.csv"),
    }

    print("--- 1. 원본 데이터 로드 중 ---")
    try:
        # 각 CSV 파일 로드
        train_notes_df = pd.read_csv(files["train_notes"])
        train_annots_df = pd.read_csv(files["train_annots"])
        test_notes_df = pd.read_csv(files["test_notes"])
        test_annots_df = pd.read_csv(files["test_annots"])

        print(
            f"로드 완료: Train (Notes: {len(train_notes_df)}, Annots: {len(train_annots_df)})"
        )
        print(
            f"로드 완료: Test (Notes: {len(test_notes_df)}, Annots: {len(test_annots_df)})"
        )

    except FileNotFoundError as e:
        print(f" [오류] 파일을 찾을 수 없습니다. ")
        print(f"경로를 확인하세요: {e.filename}")
        print("이 스크립트를 'MapOMOP' 루트 디렉터리에서 실행해야 합니다.")
        return None
    except Exception as e:
        print(f"파일 로드 중 오류 발생: {e}")
        return None

    # 2. Train / Test 데이터셋 하나로 합치기
    print("\n--- 2. Train/Test 데이터 병합 ---")
    all_notes_df = pd.concat([train_notes_df, test_notes_df], ignore_index=True)
    all_annots_df = pd.concat([train_annots_df, test_annots_df], ignore_index=True)

    # (중요) note_id 타입을 통일 (merge 시 오류 방지)
    all_notes_df["note_id"] = all_notes_df["note_id"].astype(str)
    all_annots_df["note_id"] = all_annots_df["note_id"].astype(str)

    # 중복 노트 제거 (필요시)
    all_notes_df = all_notes_df.drop_duplicates(subset=["note_id"])

    print(f"총 노트 수 (고유): {len(all_notes_df)}")
    print(f"총 어노테이션 수: {len(all_annots_df)}")

    # 3. 노트와 어노테이션 병합
    print("\n--- 3. 노트와 어노테이션 병합 (on='note_id') ---")
    # 'inner' 조인을 통해 어노테이션이 있는 노트만 남깁니다.
    merged_df = pd.merge(all_notes_df, all_annots_df, on="note_id", how="inner")
    print(f"병합된 총 어노테이션 수: {len(merged_df)}")

    # 4. 'entity_name' 추출 (텍스트 스팬)
    print("\n--- 4. 'entity_name' (텍스트 스팬) 추출 중 ---")

    def extract_entity_name(row):
        """row['text']에서 [start:end] 범위의 텍스트를 추출합니다."""
        try:
            return row["text"][row["start"] : row["end"]]
        except TypeError:
            # text가 NaN이거나 start/end가 숫자가 아닐 경우 대비
            return None
        except IndexError:
            # start/end 인덱스가 텍스트 범위를 벗어날 경우 대비
            print(
                f"인덱스 오류: note_id={row['note_id']}, start={row['start']}, end={row['end']}"
            )
            return None

    # .apply()를 사용하여 각 행에 함수 적용
    merged_df["entity_name"] = merged_df.apply(extract_entity_name, axis=1)

    # 5. 최종 데이터프레임 정리
    print("\n--- 5. 최종 데이터프레임 생성 ---")

    # 컬럼 이름 변경 (요청사항에 맞게)
    final_df = merged_df.rename(
        columns={"text": "note_text", "concept_id": "entity_code"}
    )

    # 최종적으로 필요한 컬럼만 선택 및 순서 변경
    required_columns = ["note_id", "note_text", "entity_name", "entity_code"]
    final_test_set_df = final_df[required_columns]

    print(f"성공! 최종 테스트셋 생성 완료. (총 {len(final_test_set_df)}건)")

    return final_test_set_df


# --- 스크립트 실행 ---
if __name__ == "__main__":

    # 함수를 실행하여 데이터프레임 생성
    mapping_test_df = create_mapping_test_set()

    if mapping_test_df is not None:
        print("\n\n--- 최종 데이터프레임 (상위 5개) ---")
        print(mapping_test_df.head())

--- 1. 원본 데이터 로드 중 ---
로드 완료: Train (Notes: 204, Annots: 51574)
로드 완료: Test (Notes: 68, Annots: 23234)

--- 2. Train/Test 데이터 병합 ---
총 노트 수 (고유): 272
총 어노테이션 수: 74808

--- 3. 노트와 어노테이션 병합 (on='note_id') ---
병합된 총 어노테이션 수: 74808

--- 4. 'entity_name' (텍스트 스팬) 추출 중 ---

--- 5. 최종 데이터프레임 생성 ---
성공! 최종 테스트셋 생성 완료. (총 74808건)


--- 최종 데이터프레임 (상위 5개) ---
         note_id                                          note_text  \
0  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
1  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
2  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
3  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
4  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   

                    entity_name  entity_code  
0                   Penicillins     91936005  
1          Biliary pancreatitis     95563007  
2  Laparoscopic cholecystectomy     45595009  
3        biliary \npancreatitis   

In [13]:
mapping_test_df

Unnamed: 0,note_id,note_text,entity_name,entity_code
0,10060142-DS-9,\nName: ___ Unit No: ___\...,Penicillins,91936005
1,10060142-DS-9,\nName: ___ Unit No: ___\...,Biliary pancreatitis,95563007
2,10060142-DS-9,\nName: ___ Unit No: ___\...,Laparoscopic cholecystectomy,45595009
3,10060142-DS-9,\nName: ___ Unit No: ___\...,biliary \npancreatitis,95563007
4,10060142-DS-9,\nName: ___ Unit No: ___\...,pancreatic necrosis,1835003
...,...,...,...,...
74803,19297319-DS-11,\nName: ___ Unit No: ___\n...,fluids,711133000
74804,19297319-DS-11,\nName: ___ Unit No: ___\n...,albumin,116865006
74805,19297319-DS-11,\nName: ___ Unit No: ___\n...,kidney function,182809008
74806,19297319-DS-11,\nName: ___ Unit No: ___\n...,restart,182837001


In [31]:
import os
import warnings

import pandas as pd
from tqdm import tqdm

from elasticsearch import Elasticsearch


def map_snomed_to_omop_es(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    DataFrame의 'entity_code'를 사용하여 Elasticsearch 'concept-small' 인덱스를 조회하고,
    OMOP concept_id, concept_name, domain_id를 병합합니다.

    Args:
        input_df (pd.DataFrame): 'note_id', 'note_text', 'entity_name', 'entity_code' 포함

    Returns:
        pd.DataFrame: 'concept_id', 'concept_name', 'domain_id'가 추가된 DataFrame
    """

    # 2. 고유한 entity_code 추출 (ES 조회의 효율성을 위해)
    # 'entity_code'를 문자열로 변환 (ES term 쿼리 및 merge를 위해)
    try:
        input_df["entity_code"] = input_df["entity_code"].astype(str)
    except Exception as e:
        print(f"[오류] 'entity_code'를 문자열로 변환 중 오류: {e}")
        # NaN이나 비정상 값이 있는지 확인 필요
        input_df = input_df.dropna(subset=["entity_code"])
        input_df["entity_code"] = input_df["entity_code"].astype(int).astype(str)

    unique_codes = input_df["entity_code"].unique()
    print(f"\n--- 2. 고유 entity_code 추출 완료 ---")
    print(f"원본 {len(input_df)}건 -> 고유 코드 {len(unique_codes)}건")

    # 3. ES 조회 및 매핑 테이블 생성
    print(f"\n--- 3. ES 인덱스 'concept-small' 조회 중 ---")
    snomed_to_omop_map = {}

    for code in tqdm(unique_codes, desc="Mapping SNOMED codes"):
        # ES 쿼리 정의
        query_body = {
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"vocabulary_id": "SNOMED"}},
                        {"term": {"concept_code": code}},
                        # 'concept_code'가 'keyword' 타입으로 매핑되어 있어야 함
                    ]
                }
            },
            "_source": [
                "concept_id",
                "concept_name",
                "domain_id",
                "standard_concept",
            ],  # 표준 여부도 확인
            "size": 1,  # concept_code + vocabulary_id는 고유해야 함
        }

        try:
            response = es_client.search(index="concept-small", body=query_body)
            hits = response["hits"]["hits"]

            if hits:
                # 직접 매핑된 컨셉 (Standard 또는 Non-Standard일 수 있음)
                source_concept = hits[0]["_source"]

                # 여기서는 사용자의 요청(직접 조회)을 그대로 따릅니다.
                snomed_to_omop_map[code] = {
                    "concept_id": source_concept.get("concept_id"),
                    "concept_name": source_concept.get("concept_name"),
                    "domain_id": source_concept.get("domain_id"),
                }
            else:
                # ES에서 코드를 찾지 못한 경우
                snomed_to_omop_map[code] = {
                    "concept_id": None,
                    "concept_name": None,
                    "domain_id": None,
                }
        except Exception as e:
            print(f"ES 조회 중 오류 (Code: {code}): {e}")
            snomed_to_omop_map[code] = {
                "concept_id": None,
                "concept_name": None,
                "domain_id": None,
            }

    print("ES 조회 및 매핑 테이블 생성 완료.")

    # 4. 매핑 딕셔너리를 DataFrame으로 변환
    print("\n--- 4. 매핑 테이블 -> DataFrame 변환 ---")
    omop_df = pd.DataFrame.from_dict(snomed_to_omop_map, orient="index")
    omop_df = omop_df.reset_index().rename(columns={"index": "entity_code"})

    # entity_code를 merge를 위해 문자열 타입으로 유지
    omop_df["entity_code"] = omop_df["entity_code"].astype(str)

    # 5. 원본 DataFrame과 매핑된 OMOP 정보 병합
    print("\n--- 5. 원본 DataFrame과 병합 (Left Join) ---")
    final_df = pd.merge(input_df, omop_df, on="entity_code", how="left")

    print("병합 완료! 최종 데이터프레임 생성.")
    return final_df


# --- 스크립트 실행 ---
if __name__ == "__main__":
    if "mapping_test_df" in locals() or "mapping_test_df" in globals():
        print("--- 'mapping_test_df' 변수 확인됨 ---")
        final_mapped_df = map_snomed_to_omop_es(mapping_test_df)

        print("\n\n--- 최종 매핑 결과 (상위 10개) ---")
        print(final_mapped_df.head(10))

        print("\n\n--- 매핑 실패 (None) 건수 확인 ---")
        print(
            final_mapped_df["concept_id"].isna().sum(),
            "건의 매핑 실패 (ES에서 찾을 수 없음)",
        )

    else:
        print("[주의] 'mapping_test_df' 변수가 존재하지 않습니다.")
        print("이 스크립트를 단독으로 실행한 경우, 테스트용 더미 데이터를 사용하거나")
        print("이전 단계의 'create_mapping_test_set()' 함수를 먼저 실행해야 합니다.")

--- 'mapping_test_df' 변수 확인됨 ---

--- 2. 고유 entity_code 추출 완료 ---
원본 74808건 -> 고유 코드 6624건

--- 3. ES 인덱스 'concept-small' 조회 중 ---


Mapping SNOMED codes:   0%|          | 0/6624 [00:00<?, ?it/s]

Mapping SNOMED codes: 100%|██████████| 6624/6624 [00:47<00:00, 139.79it/s]

ES 조회 및 매핑 테이블 생성 완료.

--- 4. 매핑 테이블 -> DataFrame 변환 ---

--- 5. 원본 DataFrame과 병합 (Left Join) ---
병합 완료! 최종 데이터프레임 생성.


--- 최종 매핑 결과 (상위 10개) ---
         note_id                                          note_text  \
0  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
1  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
2  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
3  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
4  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
5  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
6  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
7  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
8  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   
9  10060142-DS-9   \nName:  ___                  Unit No:   ___\...   

                    entity_name entity_code concept_id  \
0            




In [32]:
final_mapped_df

Unnamed: 0,note_id,note_text,entity_name,entity_code,concept_id,concept_name,domain_id
0,10060142-DS-9,\nName: ___ Unit No: ___\...,Penicillins,91936005,4240903,allergy to penicillin,Observation
1,10060142-DS-9,\nName: ___ Unit No: ___\...,Biliary pancreatitis,95563007,4319446,gallstone pancreatitis,Condition
2,10060142-DS-9,\nName: ___ Unit No: ___\...,Laparoscopic cholecystectomy,45595009,4163971,laparoscopic cholecystectomy,Procedure
3,10060142-DS-9,\nName: ___ Unit No: ___\...,biliary \npancreatitis,95563007,4319446,gallstone pancreatitis,Condition
4,10060142-DS-9,\nName: ___ Unit No: ___\...,pancreatic necrosis,1835003,4080069,necrosis of pancreas,Condition
...,...,...,...,...,...,...,...
74803,19297319-DS-11,\nName: ___ Unit No: ___\n...,fluids,711133000,46272719,administration of fluid therapy,Procedure
74804,19297319-DS-11,\nName: ___ Unit No: ___\n...,albumin,116865006,4022175,administration of albumin,Procedure
74805,19297319-DS-11,\nName: ___ Unit No: ___\n...,kidney function,182809008,4056943,renal function monitoring,Observation
74806,19297319-DS-11,\nName: ___ Unit No: ___\n...,restart,182837001,4082385,restart of medication,Procedure


In [33]:
output_filename = "/home/work/skku/hyo/MapOMOP/data/mapping_test_snomed.csv"
final_mapped_df.to_csv(output_filename, index=False)
print(f"\n최종 파일 저장 완료: {output_filename}")


최종 파일 저장 완료: /home/work/skku/hyo/MapOMOP/data/mapping_test_snomed.csv


In [None]:
import pandas as pd

df = pd.read_csv("/home/work/skku/hyo/MapOMOP/data/mapping_test_snomed.csv")
df

Unnamed: 0,note_id,note_text,entity_name,entity_code,concept_id,concept_name,domain_id
0,10060142-DS-9,\nName: ___ Unit No: ___\...,Penicillins,91936005,4240903,allergy to penicillin,Observation
1,10060142-DS-9,\nName: ___ Unit No: ___\...,Biliary pancreatitis,95563007,4319446,gallstone pancreatitis,Condition
2,10060142-DS-9,\nName: ___ Unit No: ___\...,Laparoscopic cholecystectomy,45595009,4163971,laparoscopic cholecystectomy,Procedure
3,10060142-DS-9,\nName: ___ Unit No: ___\...,biliary \npancreatitis,95563007,4319446,gallstone pancreatitis,Condition
4,10060142-DS-9,\nName: ___ Unit No: ___\...,pancreatic necrosis,1835003,4080069,necrosis of pancreas,Condition
...,...,...,...,...,...,...,...
74803,19297319-DS-11,\nName: ___ Unit No: ___\n...,fluids,711133000,46272719,administration of fluid therapy,Procedure
74804,19297319-DS-11,\nName: ___ Unit No: ___\n...,albumin,116865006,4022175,administration of albumin,Procedure
74805,19297319-DS-11,\nName: ___ Unit No: ___\n...,kidney function,182809008,4056943,renal function monitoring,Observation
74806,19297319-DS-11,\nName: ___ Unit No: ___\n...,restart,182837001,4082385,restart of medication,Procedure


In [2]:
df = df.drop(columns=["note_text"])
df

Unnamed: 0,note_id,entity_name,entity_code,concept_id,concept_name,domain_id
0,10060142-DS-9,Penicillins,91936005,4240903,allergy to penicillin,Observation
1,10060142-DS-9,Biliary pancreatitis,95563007,4319446,gallstone pancreatitis,Condition
2,10060142-DS-9,Laparoscopic cholecystectomy,45595009,4163971,laparoscopic cholecystectomy,Procedure
3,10060142-DS-9,biliary \npancreatitis,95563007,4319446,gallstone pancreatitis,Condition
4,10060142-DS-9,pancreatic necrosis,1835003,4080069,necrosis of pancreas,Condition
...,...,...,...,...,...,...
74803,19297319-DS-11,fluids,711133000,46272719,administration of fluid therapy,Procedure
74804,19297319-DS-11,albumin,116865006,4022175,administration of albumin,Procedure
74805,19297319-DS-11,kidney function,182809008,4056943,renal function monitoring,Observation
74806,19297319-DS-11,restart,182837001,4082385,restart of medication,Procedure


In [3]:
df.to_csv(
    "/home/work/skku/hyo/MapOMOP/data/mapping_test_snomed_no_note.csv", index=False
)

## SNUH data preprocessing

In [4]:
import pandas as pd

df = pd.read_csv("/home/work/skku/hyo/omop-mapper/data/MapOMOP_test_data_sh_lens_mapomop_test_data.csv")
df

Unnamed: 0,record_count,domain_id,source_value,concept_id
0,86177788,Measurement,SBP,3004249
1,86177788,Measurement,"difficulty, weaning",3004249
2,86140151,Measurement,"difficulty, writing",3012888
3,86140151,Measurement,DBP,3012888
4,67004705,Measurement,BT,3020891
...,...,...,...,...
4224,3343,Measurement,CD4/CD8 ratio (BAL),3045450
4225,3229,Measurement,Free Testosterone,3016049
4226,3229,Measurement,Factor assay (XI),3001850
4227,3222,Measurement,"(검사중단)Kappa Free Light Chain, urine",3034597


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4229 entries, 0 to 4228
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   record_count  4229 non-null   int64 
 1   domain_id     4229 non-null   object
 2   source_value  4229 non-null   object
 3   concept_id    4229 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 132.3+ KB


In [7]:
import re

def preprocess_source_value(df):
    """
    source_value 전처리:
    1. 모두 한글이면 해당 행 삭제
    2. 한글/영문 섞여 있으면 영문만 남기고 한글 제거
    3. 영어만 있으면 유지
    4. 모두 소문자로 변환
    """
    df = df.copy()
    
    # 한글 패턴
    korean_pattern = re.compile('[가-힣]+')
    # 영문 패턴 (알파벳)
    english_pattern = re.compile('[a-zA-Z]+')
    
    def process_text(text):
        if pd.isna(text):
            return text
        
        text = str(text)
        
        has_korean = bool(korean_pattern.search(text))
        has_english = bool(english_pattern.search(text))
        
        # 모두 한글만 있으면 None 반환 (나중에 삭제)
        if has_korean and not has_english:
            return None
        
        # 한글/영문 섞여 있으면 한글 제거
        if has_korean and has_english:
            # 한글 제거
            text = korean_pattern.sub('', text)
            # 연속된 공백 정리
            text = re.sub(r'\s+', ' ', text).strip()
            # 앞뒤 특수문자 정리
            text = re.sub(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$', '', text)
        
        return text
    
    # source_value 처리
    df['source_value'] = df['source_value'].apply(process_text)
    
    # None인 행 삭제 (모두 한글이었던 행)
    df = df.dropna(subset=['source_value'])
    
    # 빈 문자열인 행도 삭제
    df = df[df['source_value'].str.strip() != '']
    
    # 소문자 변환
    df['source_value'] = df['source_value'].str.lower()
    
    # 인덱스 리셋
    df = df.reset_index(drop=True)
    
    return df

In [8]:
df_cleaned = preprocess_source_value(df)
df_cleaned

Unnamed: 0,record_count,domain_id,source_value,concept_id
0,86177788,Measurement,sbp,3004249
1,86177788,Measurement,"difficulty, weaning",3004249
2,86140151,Measurement,"difficulty, writing",3012888
3,86140151,Measurement,dbp,3012888
4,67004705,Measurement,bt,3020891
...,...,...,...,...
4124,3343,Measurement,cd4/cd8 ratio (bal),3045450
4125,3229,Measurement,free testosterone,3016049
4126,3229,Measurement,factor assay (xi),3001850
4127,3222,Measurement,"kappa free light chain, urine",3034597


In [8]:
non_numeric_mask = pd.to_numeric(df_cleaned["omop_concept_id"], errors="coerce").isna()
non_numeric_rows = df_cleaned[non_numeric_mask]
non_numeric_rows

Unnamed: 0,snuh_id,omop_concept_id,vocabulary,source_name,domain,mappedby,mappingstatus,primarychecker,pc_dept,secondarychecker,modifieddate,note,remark
231,1007,,,memo,Measurement,장인순,9.0,SCDM실무회의(2019.04.05),SCDM실무회의(2019.04.05),,2019-05-03 12:10:00,,
250,1037,,,cent cvp,Measurement,장인순,9.0,SCDM실무회의(2019.04.05),SCDM실무회의(2019.04.05),,2019-05-03 12:10:00,,
252,1038,,,sg cvp,Measurement,장인순,9.0,SCDM실무회의(2019.04.05),SCDM실무회의(2019.04.05),,2019-05-03 12:10:00,,
254,1040,,,cvp,Measurement,장인순,9.0,SCDM실무회의(2019.04.05),SCDM실무회의(2019.04.05),,2019-05-03 12:10:00,,
255,1043,,,systolic pap,Measurement,장인순,9.0,SCDM실무회의(2019.04.05),SCDM실무회의(2019.04.05),,2019-05-03 12:10:00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,2441,,,static resistance,Measurement,장인순,9.0,SCDM실무회의(2019.04.05),SCDM실무회의(2019.04.05),,2019-05-03 12:10:00,,
1109,2444,,,auto peep,Measurement,장인순,9.0,SCDM실무회의(2019.04.05),SCDM실무회의(2019.04.05),,2019-05-03 12:10:00,,
1110,2445,,,enf,Measurement,장인순,9.0,SCDM실무회의(2019.04.05),SCDM실무회의(2019.04.05),,2019-05-03 12:10:00,,
1111,2446,,,cvvhdf replacement rate,Measurement,장인순,9.0,SCDM실무회의(2019.04.05),SCDM실무회의(2019.04.05),,2019-05-03 12:10:00,,


In [9]:
df_cleaned = df_cleaned[df_cleaned["omop_concept_id"].notna()]
df_cleaned["omop_concept_id"] = df_cleaned["omop_concept_id"].astype(int)
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53116 entries, 0 to 53550
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   snuh_id           53116 non-null  object        
 1   omop_concept_id   53116 non-null  int64         
 2   vocabulary        53106 non-null  object        
 3   source_name       53116 non-null  object        
 4   domain            53116 non-null  object        
 5   mappedby          53116 non-null  object        
 6   mappingstatus     53116 non-null  float64       
 7   primarychecker    137 non-null    object        
 8   pc_dept           137 non-null    object        
 9   secondarychecker  65 non-null     object        
 10  modifieddate      53116 non-null  datetime64[ns]
 11  note              13350 non-null  object        
 12  remark            31317 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(10)
memory usage: 5.7+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['omop_concept_id'] = df_cleaned['omop_concept_id'].astype(int)


In [11]:
df_cleaned = df_cleaned[df_cleaned["snuh_id"].notna()]
df_cleaned["snuh_id"] = df_cleaned["snuh_id"].astype(int)
df_cleaned

Unnamed: 0,snuh_id,omop_concept_id,vocabulary,source_name,domain,mappedby,mappingstatus,primarychecker,pc_dept,secondarychecker,modifieddate,note,remark
0,115,4240605,SNOMED,abortion,Condition,분당,1.0,,,,2019-07-29 15:31:00.000,,
1,124,4219871,SNOMED,abrasion,Condition,백승희,1.0,,,,2018-11-14 11:04:00.000,,
2,127,444202,SNOMED,abscess,Condition,분당,1.0,,,,2019-07-29 15:31:00.000,,
3,161,4199933,SNOMED,academic deterioration,Observation,분당,1.0,,,,2019-07-29 15:31:00.000,,
4,162,4091060,SNOMED,acalculia,Condition,분당,1.0,,,,2019-07-29 15:31:00.000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53546,20006364,4178668,SNOMED,branchial arch anomaly,Condition,박혜진,1.0,,,,2021-08-26 10:52:35.093,기존4294415.동일진단명의 통합코드D00016463의 concept id로 매핑,요청84
53547,20006364,45582639,ICD10,branchial arch anomaly,Condition,박혜진,1.0,,,,2020-08-27 12:34:26.459,,요청61
53548,20006728,4032774,SNOMED,medical thoracoscopy,Procedure,박혜진,1.0,,,,2020-10-28 10:37:17.394,수술코드 마스터파일의 concept id로 매핑,요청64
53549,20006729,4094651,SNOMED,medical thoracoscopic biopsy,Procedure,박혜진,1.0,,,,2020-10-28 10:37:17.394,수술코드 마스터파일의 concept id로 매핑,요청64


In [12]:
df_cleaned = df_cleaned.drop(
    columns=[
        "mappedby",
        "mappingstatus",
        "primarychecker",
        "pc_dept",
        "secondarychecker",
        "modifieddate",
        "note",
        "remark",
    ]
)
df_cleaned

Unnamed: 0,snuh_id,omop_concept_id,vocabulary,source_name,domain
0,115,4240605,SNOMED,abortion,Condition
1,124,4219871,SNOMED,abrasion,Condition
2,127,444202,SNOMED,abscess,Condition
3,161,4199933,SNOMED,academic deterioration,Observation
4,162,4091060,SNOMED,acalculia,Condition
...,...,...,...,...,...
53546,20006364,4178668,SNOMED,branchial arch anomaly,Condition
53547,20006364,45582639,ICD10,branchial arch anomaly,Condition
53548,20006728,4032774,SNOMED,medical thoracoscopy,Procedure
53549,20006729,4094651,SNOMED,medical thoracoscopic biopsy,Procedure


In [13]:
df_cleaned.to_csv("/home/work/skku/hyo/MapOMOP/data/mapping_test_snuh.csv", index=False)

In [1]:
import pandas as pd

df_cleaned = pd.read_csv("/home/work/skku/hyo/MapOMOP/data/mapping_test_snuh.csv")
df_cleaned

Unnamed: 0,snuh_id,omop_concept_id,vocabulary,source_name,domain
0,115,4240605,SNOMED,abortion,Condition
1,124,4219871,SNOMED,abrasion,Condition
2,127,444202,SNOMED,abscess,Condition
3,161,4199933,SNOMED,academic deterioration,Observation
4,162,4091060,SNOMED,acalculia,Condition
...,...,...,...,...,...
53111,20006364,4178668,SNOMED,branchial arch anomaly,Condition
53112,20006364,45582639,ICD10,branchial arch anomaly,Condition
53113,20006728,4032774,SNOMED,medical thoracoscopy,Procedure
53114,20006729,4094651,SNOMED,medical thoracoscopic biopsy,Procedure


In [2]:
df_cleaned["domain"].value_counts()

domain
Condition       45900
Procedure        6637
Observation       459
Measurement       112
Device              7
Relationship        1
Name: count, dtype: int64

In [3]:
import pandas as pd

df = pd.read_csv("/home/work/skku/hyo/MapOMOP/data/mapping_test_snomed_no_note.csv")
df

Unnamed: 0,note_id,entity_name,entity_code,concept_id,concept_name,domain_id
0,10060142-DS-9,Penicillins,91936005,4240903,allergy to penicillin,Observation
1,10060142-DS-9,Biliary pancreatitis,95563007,4319446,gallstone pancreatitis,Condition
2,10060142-DS-9,Laparoscopic cholecystectomy,45595009,4163971,laparoscopic cholecystectomy,Procedure
3,10060142-DS-9,biliary \npancreatitis,95563007,4319446,gallstone pancreatitis,Condition
4,10060142-DS-9,pancreatic necrosis,1835003,4080069,necrosis of pancreas,Condition
...,...,...,...,...,...,...
74803,19297319-DS-11,fluids,711133000,46272719,administration of fluid therapy,Procedure
74804,19297319-DS-11,albumin,116865006,4022175,administration of albumin,Procedure
74805,19297319-DS-11,kidney function,182809008,4056943,renal function monitoring,Observation
74806,19297319-DS-11,restart,182837001,4082385,restart of medication,Procedure


In [4]:
df["domain_id"].value_counts()

domain_id
Condition             30813
Procedure             12708
Measurement           12423
Observation           11811
Spec Anatomic Site     7053
Name: count, dtype: int64

In [5]:
top_10000_concepts_ids = df_cleaned["omop_concept_id"].value_counts().head(10000).index
df_top_10000 = df_cleaned[
    df_cleaned["omop_concept_id"].isin(top_10000_concepts_ids)
].copy()

print(f"상위 10,000개 OMOP Concept ID에 해당하는 총 행의 수: {len(df_top_10000)}")
print("새로 저장된 데이터프레임의 상위 5개 행:")
print(df_top_10000.head())

상위 10,000개 OMOP Concept ID에 해당하는 총 행의 수: 41572
새로 저장된 데이터프레임의 상위 5개 행:
   snuh_id  omop_concept_id vocabulary       source_name     domain
0      115          4240605     SNOMED          abortion  Condition
1      124          4219871     SNOMED          abrasion  Condition
2      127           444202     SNOMED           abscess  Condition
4      162          4091060     SNOMED         acalculia  Condition
5      163          4056354     SNOMED  accessory breast  Condition


In [6]:
df_top_10k_unique = df_top_10000.drop_duplicates(
    subset=["omop_concept_id"], keep="first"
).copy()

# 2. 결과 확인
print(f"중복 제거 후 최종 행의 수: {len(df_top_10k_unique)}")
print("새로 저장된 데이터프레임의 상위 5개 행:")
print(df_top_10k_unique.head())

중복 제거 후 최종 행의 수: 10000
새로 저장된 데이터프레임의 상위 5개 행:
   snuh_id  omop_concept_id vocabulary       source_name     domain
0      115          4240605     SNOMED          abortion  Condition
1      124          4219871     SNOMED          abrasion  Condition
2      127           444202     SNOMED           abscess  Condition
4      162          4091060     SNOMED         acalculia  Condition
5      163          4056354     SNOMED  accessory breast  Condition


In [7]:
# 'omop_concept_id'별 빈도수(Count)를 계산합니다.
concept_counts = df_cleaned["omop_concept_id"].value_counts().reset_index()
concept_counts.columns = ["omop_concept_id", "count"]

# df_top_10k_unique 데이터프레임에 빈도수 정보를 병합합니다.
df_sorted = pd.merge(
    df_top_10k_unique, concept_counts, on="omop_concept_id", how="left"
)

# 'count' 열을 기준으로 내림차순 정렬합니다.
df_sorted = df_sorted.sort_values(by="count", ascending=False)

# 결과 확인
print("빈도수 순으로 명확하게 정렬된 데이터프레임의 상위 5개 행:")
print(df_sorted.head())

빈도수 순으로 명확하게 정렬된 데이터프레임의 상위 5개 행:
      snuh_id  omop_concept_id vocabulary  \
3554    71571         45542614      ICD10   
781      5023           432851     SNOMED   
3237    71066         45586089      ICD10   
6029    76782         45538401      ICD10   
3252    71082         45542611      ICD10   

                              source_name     domain  count  
3554                       ovarian cancer  Condition    157  
781                      krukenberg tumor  Condition    133  
3237                         glomus tumor  Condition    110  
6029         unruptured cerebral aneurysm  Condition    106  
3252  connective and soft tissue neoplasm  Condition    103  


In [8]:
df_sorted

Unnamed: 0,snuh_id,omop_concept_id,vocabulary,source_name,domain,count
3554,71571,45542614,ICD10,ovarian cancer,Condition,157
781,5023,432851,SNOMED,krukenberg tumor,Condition,133
3237,71066,45586089,ICD10,glomus tumor,Condition,110
6029,76782,45538401,ICD10,unruptured cerebral aneurysm,Condition,106
3252,71082,45542611,ICD10,connective and soft tissue neoplasm,Condition,103
...,...,...,...,...,...,...
9989,20006024,435696,SNOMED,other early complications of trauma,Condition,2
9990,20006033,45594882,ICD10,rh incompatibility reaction,Condition,2
9991,20006033,437195,SNOMED,rh incompatibility reaction,Condition,2
3,162,4091060,SNOMED,acalculia,Condition,2


In [9]:
df_sorted.to_csv(
    "/home/work/skku/hyo/MapOMOP/data/mapping_test_snuh_top10k.csv", index=False
)

In [2]:
import pandas as pd

relation = pd.read_csv(
    "/home/work/skku/hyo/MapOMOP/data/omop-cdm/CONCEPT_RELATIONSHIP.csv", sep="\t"
)
relation

Unnamed: 0,concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
0,42073232,42073232,Mapped from,20190221,20991231,
1,42073232,42073232,Maps to,20190221,20991231,
2,42073236,42073236,Mapped from,20190221,20991231,
3,42073236,42073236,Maps to,20190221,20991231,
4,42073241,42073241,Mapped from,20190221,20991231,
...,...,...,...,...,...,...
54749163,42073208,42073208,Maps to,20190221,20991231,
54749164,42073211,42073211,Mapped from,20190221,20991231,
54749165,42073211,42073211,Maps to,20190221,20991231,
54749166,42073223,42073223,Mapped from,20190221,20991231,


In [12]:
relation["relationship_id"].unique()

array(['Mapped from', 'Maps to', 'Is a', 'Subsumes',
       'RxNorm has dose form', 'Status of', 'RxNorm inverse is a',
       'Tradename of', 'Drug has drug class', 'Has tradename',
       'Marketed form of', 'Consists of', 'Has supplier',
       'Has marketed form', 'Has quantified form', 'Available as box',
       'RxNorm is a', 'RxNorm - ATC', 'Concept replaces',
       'RxNorm has ing', 'Has brand name', 'Quantified form of',
       'Constitutes', 'Module of', 'Box of', 'Contained in',
       'Brand name of', 'Maps to value', 'SPL - RxNorm',
       'Histology of ICDO', 'Has Histology ICDO', 'Has Topography ICDO',
       'Has finding site', 'ICDO to Schema', 'Has asso morph',
       'ICDO to Chapter', 'Has kind', 'Has property', 'Has time aspect',
       'Has subject matter', 'Has scale type', 'Has role',
       'Has component', 'Has system', 'Has method', 'Has type of service',
       'Panel contains', 'Has Answer', 'Has setting',
       'Contained in panel', 'System of', 'Compone

In [9]:
relation["relationship_id"].value_counts().head(10)

relationship_id
Mapped from             7299489
Maps to                 7299489
Is a                    3441303
Subsumes                3441303
Marketed form of        2020882
Has marketed form       2020882
RxNorm has dose form    1797987
RxNorm dose form of     1797987
Has brand name          1478520
Brand name of           1478520
Name: count, dtype: int64

In [1]:
import pandas as pd

ancestor = pd.read_csv(
    "/home/work/skku/hyo/MapOMOP/data/omop-cdm/CONCEPT_ANCESTOR.csv", sep="\t"
)
ancestor

Unnamed: 0,ancestor_concept_id,descendant_concept_id,min_levels_of_separation,max_levels_of_separation
0,2723957,2724035,4,4
1,2801516,2880942,5,5
2,2801516,2854897,5,5
3,2801516,2752144,6,6
4,2801516,2835202,5,5
...,...,...,...,...
85502413,2804005,2756199,5,5
85502414,1979971,44089940,2,2
85502415,2053871,2028157,0,0
85502416,2809473,2788400,3,3


In [2]:
from transformers import AutoModel, AutoTokenizer

# 1. Hugging Face에서 모델 및 토크나이저 불러오기
model_name = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# 2. 로컬 디렉토리에 저장 (이 폴더를 나중에 병원망으로 옮깁니다)
save_path = "./sapbert_model_files"
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)

print(f"모델 파일이 {save_path}에 저장되었습니다.")

  from .autonotebook import tqdm as notebook_tqdm


모델 파일이 ./sapbert_model_files에 저장되었습니다.
