In [10]:
import os
ES_SERVER_HOST = os.getenv("ES_SERVER_HOST", "3.35.110.161")
ES_SERVER_PORT = int(os.getenv("ES_SERVER_PORT", "9200"))
ES_SERVER_USERNAME = os.getenv("ES_SERVER_USERNAME", "elastic")
ES_SERVER_PASSWORD = os.getenv("ES_SERVER_PASSWORD", "snomed")

from elasticsearch import Elasticsearch

# ES 8.x 문법 (기본 권장)
es = Elasticsearch(
    f"http://{ES_SERVER_HOST}:{ES_SERVER_PORT}",
    basic_auth=(ES_SERVER_USERNAME, ES_SERVER_PASSWORD),
    request_timeout=60,
)

In [11]:
# 연결 확인
info = es.info()
print("Cluster name:", info.get('cluster_name'))
print("Version:", info.get('version', {}).get('number'))

# 인덱스 확인
indices = sorted(es.indices.get(index="*").keys())
print(f"\n총 인덱스 수: {len(indices)}")

Cluster name: docker-cluster
Version: 9.0.0

총 인덱스 수: 3


In [12]:
import pandas as pd

rows = es.cat.indices(format="json", s="index", bytes="mb")
df = pd.DataFrame(rows)[["index", "pri", "rep", "docs.count", "store.size"]]
df.rename(columns={"index":"index_name", 
                     "pri":"shards", 
                     "rep":"replicas",
                     "docs.count":"docs", 
                     "store.size":"store_mb"}, inplace=True)
df

Unnamed: 0,index_name,shards,replicas,docs,store_mb
0,concept-relationship,3,5,54749168,5358
1,concept-small,3,5,9922324,151731
2,concept-synonym,3,5,3937455,60894


In [4]:
# concepts 인덱스에서 첫번째 문서 확인
doc = es.get(index="concept-small", id=4030206)
doc["_source"]

{'concept_id': '4030206',
 'concept_name': "adrenal cushing's syndrome",
 'domain_id': 'Condition',
 'vocabulary_id': 'SNOMED',
 'concept_class_id': 'Disorder',
 'standard_concept': 'S',
 'concept_code': '237735008',
 'valid_start_date': '20020131',
 'valid_end_date': '20991231',
 'invalid_reason': None,
 'concept_embedding': [-0.34629061818122864,
  0.22591178119182587,
  -0.295034259557724,
  -1.0008443593978882,
  -0.9234474897384644,
  -0.0806569755077362,
  -0.21006150543689728,
  1.235754370689392,
  0.4572747051715851,
  0.08569078147411346,
  -0.05736774206161499,
  -0.5105459690093994,
  -0.1300671547651291,
  -0.2243945598602295,
  -0.47067737579345703,
  0.8610857725143433,
  1.3031164407730103,
  0.05049440637230873,
  -0.20178045332431793,
  -0.2509421706199646,
  -0.2773750126361847,
  -0.40466660261154175,
  -0.7616651058197021,
  -0.845181405544281,
  -0.7562867403030396,
  0.03564050421118736,
  -0.43268564343452454,
  -0.7261186838150024,
  -0.20699049532413483,
  -0.

In [4]:
import pandas as pd

df = pd.read_csv('/home/work/skku/hyo/omop-mapper/data/CONCEPT.csv', sep='\t')
df.head()

  df = pd.read_csv('/home/work/skku/hyo/omop-mapper/data/CONCEPT.csv', sep='\t')


Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,45756805,Pediatric Cardiology,Provider,ABMS,Physician Specialty,S,OMOP4821938,19700101,20991231,
1,45756804,Pediatric Anesthesiology,Provider,ABMS,Physician Specialty,S,OMOP4821939,19700101,20991231,
2,45756803,Pathology-Anatomic / Pathology-Clinical,Provider,ABMS,Physician Specialty,S,OMOP4821940,19700101,20991231,
3,45756802,Pathology - Pediatric,Provider,ABMS,Physician Specialty,S,OMOP4821941,19700101,20991231,
4,45756801,Pathology - Molecular Genetic,Provider,ABMS,Physician Specialty,S,OMOP4821942,19700101,20991231,


In [4]:
len(df)

9922358

In [12]:
df[df['concept_name'] == 'Mineralocorticoid Receptor Antagonists']

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
2283216,19388264,Mineralocorticoid Receptor Antagonists,Drug,MeSH,Main Heading,,D000451,20230501,20991231,
3704223,4324562,Mineralocorticoid Receptor Antagonists,Drug,NDFRT,Mechanism of Action,,N0000000139,19700101,20991231,


In [7]:
df[df['concept_name'] == 'April 1, 2025']

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason


In [6]:
# 도메인별 데이터 수 계산
domain_counts = df['domain_id'].value_counts()

# 결과 출력
print("도메인별 데이터 수:")
for domain, count in domain_counts.items():
    print(f"{domain}: {count:,}개")

도메인별 데이터 수:
Drug: 4,772,484개
Observation: 376,134개
Condition: 268,040개
Device: 237,173개
Geography: 204,037개
Measurement: 190,196개
Procedure: 101,081개
Spec Anatomic Site: 41,129개
Meas Value: 25,681개
Metadata: 4,820개
Note: 4,153개
Type Concept: 3,807개
Unit: 2,490개
Specimen: 2,096개
Provider: 1,562개
Language: 879개
Visit: 718개
Revenue Code: 545개
Race: 519개
Relationship: 419개
Route: 218개
Currency: 180개
Payer: 168개
Cost: 51개
Condition Status: 22개
Episode: 18개
Gender: 15개
Plan Stop Reason: 13개
Plan: 11개
Condition/Meas: 8개
Meas Value Operator: 7개
Sponsor: 6개
Ethnicity: 2개
Condition/Device: 1개


In [4]:
from elasticsearch import Elasticsearch

# --- Elasticsearch 연결 정보 수정 ---
ES_HOST = "http://3.35.110.161:9200"
ES_USER = "elastic"
ES_PASSWORD = "snomed"
# ------------------------------------

try:
    # Elasticsearch 클라이언트 생성
    es = Elasticsearch(
        [ES_HOST],
        basic_auth=(ES_USER, ES_PASSWORD),
        request_timeout=30
    )

    # 클러스터의 모든 노드 정보 가져오기
    # 'settings'와 'roles' 정보만 필터링하여 요청
    nodes_info = es.nodes.info(metric=["settings", "roles"])

    print("--- Cluster Node Roles ---")

    ml_node_found = False
    for node_id, info in nodes_info["nodes"].items():
        node_name = info["name"]
        roles = info["roles"]
        
        print(f"Node Name: {node_name}")
        print(f"  - Roles: {roles}")
        
        if "ml" in roles:
            ml_node_found = True
            print("  - ✅ This node is an ML node.")
        else:
            print("  - ❌ This node is NOT an ML node.")
        print("-" * 20)

    if not ml_node_found:
        print("\n[CRITICAL] No ML nodes found in the cluster!")
        print("Please edit 'elasticsearch.yml' on at least one node to add the 'ml' role and restart it.")
    else:
        print("\n[INFO] ML node(s) found in the cluster.")

except Exception as e:
    print(f"An error occurred: {e}")

--- Cluster Node Roles ---

[CRITICAL] No ML nodes found in the cluster!
Please edit 'elasticsearch.yml' on at least one node to add the 'ml' role and restart it.


In [13]:
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel  

tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")  
model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext").cuda()

# replace with your own list of entity names
all_names = ["glucocorticoid-remediable aldosteronism", "familial hyperaldosteronism type 1", "hypercortisolism", "primary aldosteronism", "idiopathic hyperaldosteronism"] 

bs = 128 # batch size during inference
all_embs = []
for i in tqdm(np.arange(0, len(all_names), bs)):
    toks = tokenizer.batch_encode_plus(all_names[i:i+bs], 
                                       padding="max_length", 
                                       max_length=25, 
                                       truncation=True,
                                       return_tensors="pt")
    toks_cuda = {}
    for k,v in toks.items():
        toks_cuda[k] = v.cuda()
    cls_rep = model(**toks_cuda)[0][:,0,:] # use CLS representation as the embedding
    all_embs.append(cls_rep.cpu().detach().numpy())

all_embs = np.concatenate(all_embs, axis=0)

100%|██████████| 1/1 [00:00<00:00, 53.15it/s]


In [14]:
all_embs

array([[-0.13947393,  0.43695697,  0.08738231, ..., -1.0600768 ,
         1.3542985 ,  0.28444815],
       [-0.23367757, -0.27389506, -0.7405265 , ..., -1.0965482 ,
         0.8681837 ,  0.24001615],
       [ 0.12354765, -0.00634034, -0.2362405 , ...,  0.20356396,
         0.36005285,  0.10963675],
       [-0.01222848,  0.16024525, -0.05518519, ..., -0.902581  ,
         1.2322679 , -0.00839768],
       [-0.24608356,  0.23881717, -0.48191535, ..., -0.8994233 ,
         0.68611145,  0.301503  ]], shape=(5, 768), dtype=float32)

In [15]:
from numpy import dot
from numpy.linalg import norm

base = all_embs[0]
other_embs = all_embs[1:]

cos_similarities = []
for emb in other_embs:
    sim = dot(base, emb) / (norm(base) * norm(emb))
    cos_similarities.append(sim)

cos_similarities  # 리스트 형태로 출력

[np.float32(0.7941228),
 np.float32(0.53870785),
 np.float32(0.8737862),
 np.float32(0.79532963)]

In [4]:
def normalize_text_for_similarity(text: str) -> str:
    import re
    # 하이픈, 언더스코어를 공백으로 변환
    text = text.replace('-', ' ').replace('_', ' ')
    # 연속된 공백을 하나로 통합
    text = re.sub(r'\s+', ' ', text)
    # 앞뒤 공백 제거
    text = text.strip()
    return text

In [5]:
def get_ngrams(text: str, n: int = 3) -> set:
    if len(text) < n:
        return {text}
    
    ngrams = set()
    for i in range(len(text) - n + 1):
        ngrams.add(text[i:i + n])
    
    return ngrams

In [6]:
def calculate_text_similarity(entity_name: str, concept_name: str) -> float:
    if not entity_name or not concept_name:
        return 0.0
    
    # 대소문자 정규화
    entity_name_lower = entity_name.lower()
    concept_name_lower = concept_name.lower()

    # 특수문자 정규화 (하이픈, 언더스코어를 공백으로)
    entity_name_normalized = normalize_text_for_similarity(entity_name_lower)
    concept_name_normalized = normalize_text_for_similarity(concept_name_lower)
    
    # N-gram 3으로 분할
    entity_ngrams = get_ngrams(entity_name_normalized, n=3)
    concept_ngrams = get_ngrams(concept_name_normalized, n=3)
    
    if not entity_ngrams or not concept_ngrams:
        return 0.0
    
    # concept_name과의 Jaccard 유사도 계산
    intersection = entity_ngrams.intersection(concept_ngrams)
    union = entity_ngrams.union(concept_ngrams)
    similarity = len(intersection) / len(union) if union else 0.0
    
    return similarity

In [7]:
calculate_text_similarity("glucagon-like peptide-1", "glucagon-like peptide 1")

1.0

In [5]:
import pandas as pd

df = pd.read_csv('/home/work/skku/hyo/omop-mapper/data/CONCEPT_RELATIONSHIP.csv', sep='\t')
df.head()

Unnamed: 0,concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
0,42073232,42073232,Mapped from,20190221,20991231,
1,42073232,42073232,Maps to,20190221,20991231,
2,42073236,42073236,Mapped from,20190221,20991231,
3,42073236,42073236,Maps to,20190221,20991231,
4,42073241,42073241,Mapped from,20190221,20991231,


In [8]:
df[df['relationship_id'] == 'Is a']

Unnamed: 0,concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
14,42073277,42056272,Is a,19700101,20991231,
47,42073397,42057535,Is a,19700101,20991231,
56,42073422,42056834,Is a,19700101,20991231,
59,42073426,42057234,Is a,19700101,20991231,
118,42073690,42055362,Is a,19700101,20991231,
...,...,...,...,...,...,...
54749065,42072826,42056673,Is a,19700101,20991231,
54749076,42072872,42056762,Is a,19700101,20991231,
54749101,42072963,42054154,Is a,19700101,20991231,
54749126,42073075,42055713,Is a,19700101,20991231,


In [3]:
len(df)

54749168

In [9]:
import pandas as pd

df = pd.read_csv('/home/work/skku/hyo/omop-mapper/data/CONCEPT_SYNONYM.csv', sep='\t')
df.head()

Unnamed: 0,concept_id,concept_synonym_name,language_concept_id
0,3019950,CELL MARKERS; Immune globulin A; Immunoglobuli...,4180186
1,3019950,Lymphs IgA/lymph NFr Spec,4180186
2,3019951,Finding; Findings; nursing; Ordinal; Point in ...,4180186
3,3019955,pCO2 Bld Postductal TcCo2,4180186
4,3019955,Bld Postductal; Bld.foot; BldA.foot; Blood; Ca...,4180186
