In [1]:
import os
ES_SERVER_HOST = os.getenv("ES_SERVER_HOST", "3.35.110.161")
ES_SERVER_PORT = int(os.getenv("ES_SERVER_PORT", "9200"))
ES_SERVER_USERNAME = os.getenv("ES_SERVER_USERNAME", "elastic")
ES_SERVER_PASSWORD = os.getenv("ES_SERVER_PASSWORD", "snomed")

from elasticsearch import Elasticsearch

# ES 8.x 문법 (기본 권장)
es = Elasticsearch(
    f"http://{ES_SERVER_HOST}:{ES_SERVER_PORT}",
    basic_auth=(ES_SERVER_USERNAME, ES_SERVER_PASSWORD),
    request_timeout=60,
)

In [2]:
# 연결 확인
info = es.info()
print("Cluster name:", info.get('cluster_name'))
print("Version:", info.get('version', {}).get('number'))

# 인덱스 확인
indices = sorted(es.indices.get(index="*").keys())
print(f"\n총 인덱스 수: {len(indices)}")

Cluster name: docker-cluster
Version: 9.0.0

총 인덱스 수: 3


In [3]:
import pandas as pd

rows = es.cat.indices(format="json", s="index", bytes="mb")
df = pd.DataFrame(rows)[["index", "pri", "rep", "docs.count", "store.size"]]
df.rename(columns={"index":"index_name", 
                     "pri":"shards", 
                     "rep":"replicas",
                     "docs.count":"docs", 
                     "store.size":"store_mb"}, inplace=True)
df

Unnamed: 0,index_name,shards,replicas,docs,store_mb
0,concept,3,5,80000,1231
1,concept-relationship,1,0,38924726,2937
2,concept-small,3,5,65564,1010


In [6]:
# concepts 인덱스에서 첫번째 문서 확인
doc = es.get(index="concept", id=4030206)
doc["_source"]

{'concept_id': '4030206',
 'concept_name': "Adrenal Cushing's syndrome",
 'domain_id': 'Condition',
 'vocabulary_id': 'SNOMED',
 'concept_class_id': 'Disorder',
 'standard_concept': 'S',
 'concept_code': '237735008',
 'valid_start_date': '20020131',
 'valid_end_date': '20991231',
 'invalid_reason': None}

In [6]:
import pandas as pd

df = pd.read_csv('/home/work/skku/hyo/omop-mapper/data/CONCEPT.csv', sep='\t')
df.head()

  df = pd.read_csv('/home/work/skku/hyo/omop-mapper/data/CONCEPT.csv', sep='\t')


Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,45756805,Pediatric Cardiology,Provider,ABMS,Physician Specialty,S,OMOP4821938,19700101,20991231,
1,45756804,Pediatric Anesthesiology,Provider,ABMS,Physician Specialty,S,OMOP4821939,19700101,20991231,
2,45756803,Pathology-Anatomic / Pathology-Clinical,Provider,ABMS,Physician Specialty,S,OMOP4821940,19700101,20991231,
3,45756802,Pathology - Pediatric,Provider,ABMS,Physician Specialty,S,OMOP4821941,19700101,20991231,
4,45756801,Pathology - Molecular Genetic,Provider,ABMS,Physician Specialty,S,OMOP4821942,19700101,20991231,


In [7]:
df['standard_concept'].unique()

array(['S', nan, 'C'], dtype=object)

In [4]:
from elasticsearch import Elasticsearch

# --- Elasticsearch 연결 정보 수정 ---
ES_HOST = "http://3.35.110.161:9200"
ES_USER = "elastic"
ES_PASSWORD = "snomed"
# ------------------------------------

try:
    # Elasticsearch 클라이언트 생성
    es = Elasticsearch(
        [ES_HOST],
        basic_auth=(ES_USER, ES_PASSWORD),
        request_timeout=30
    )

    # 클러스터의 모든 노드 정보 가져오기
    # 'settings'와 'roles' 정보만 필터링하여 요청
    nodes_info = es.nodes.info(metric=["settings", "roles"])

    print("--- Cluster Node Roles ---")

    ml_node_found = False
    for node_id, info in nodes_info["nodes"].items():
        node_name = info["name"]
        roles = info["roles"]
        
        print(f"Node Name: {node_name}")
        print(f"  - Roles: {roles}")
        
        if "ml" in roles:
            ml_node_found = True
            print("  - ✅ This node is an ML node.")
        else:
            print("  - ❌ This node is NOT an ML node.")
        print("-" * 20)

    if not ml_node_found:
        print("\n[CRITICAL] No ML nodes found in the cluster!")
        print("Please edit 'elasticsearch.yml' on at least one node to add the 'ml' role and restart it.")
    else:
        print("\n[INFO] ML node(s) found in the cluster.")

except Exception as e:
    print(f"An error occurred: {e}")

--- Cluster Node Roles ---

[CRITICAL] No ML nodes found in the cluster!
Please edit 'elasticsearch.yml' on at least one node to add the 'ml' role and restart it.


In [4]:
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel  

tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")  
model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext").cuda()

# replace with your own list of entity names
all_names = ["covid-19", "Coronavirus infection", "high fever", "Tumor of posterior wall of oropharynx"] 

bs = 128 # batch size during inference
all_embs = []
for i in tqdm(np.arange(0, len(all_names), bs)):
    toks = tokenizer.batch_encode_plus(all_names[i:i+bs], 
                                       padding="max_length", 
                                       max_length=25, 
                                       truncation=True,
                                       return_tensors="pt")
    toks_cuda = {}
    for k,v in toks.items():
        toks_cuda[k] = v.cuda()
    cls_rep = model(**toks_cuda)[0][:,0,:] # use CLS representation as the embedding
    all_embs.append(cls_rep.cpu().detach().numpy())

all_embs = np.concatenate(all_embs, axis=0)

100%|██████████| 1/1 [00:00<00:00,  2.22it/s]


In [5]:
all_embs

array([[-0.6451716 , -0.38790604, -0.2130785 , ...,  0.24005862,
         0.80429304,  0.42398137],
       [-1.1887959 , -0.27441087,  0.2855798 , ..., -0.12620574,
         0.80619425,  0.06771377],
       [-0.13085003,  0.43954894, -0.14286613, ..., -0.31815034,
         0.37861803,  0.1544096 ],
       [-0.8676109 ,  0.00507731, -0.38106015, ...,  0.18329258,
         0.8961128 , -0.29551512]], shape=(4, 768), dtype=float32)

In [3]:
from transformers import AutoTokenizer, AutoModel  
tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")  
model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext") #.cuda(1)

In [2]:
import sys, torch
print(sys.version)
print(torch.__version__)
from transformers import AutoTokenizer, AutoModel

3.12.11 | packaged by conda-forge | (main, Jun  4 2025, 14:45:31) [GCC 13.3.0]
2.8.0+cu128


In [1]:
from transformers.models.bert import BertModel, BertTokenizer
m = BertModel.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"Project root '{project_root}' added to sys.path.")

Project root '/home/work/skku/hyo/omop-mapper' added to sys.path.


In [2]:
from src.omop_mapper.entity_mapping_api import map_single_entity_debug, EntityMappingAPI

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import logging

logger = logging.getLogger(__name__)

In [24]:
out = map_single_entity_debug(
    entity_name="Adrenal Cushing's syndrome",
    entity_type="diagnostic",
    domain_id="condition"
)

print("Best:", out["result"])
for c in out["candidates"][:5]:
    print(c["concept_id"], c["concept_name"],
          f"text={c['text_similarity']:.3f}",
          f"sem={c['semantic_similarity']:.3f}",
          f"final={c['final_score']:.3f}")

✅ URL 방식으로 Elasticsearch 클라이언트 생성 성공
✅ Elasticsearch 연결 성공: http://3.35.110.161:9200
✅ ElasticsearchClient 초기화 완료 - Elasticsearch 클라이언트 사용 가능 (3.35.110.161:9200)
Best: {'concept_id': '4030206', 'concept_name': "Adrenal Cushing's syndrome", 'score': 0.39999999999999997, 'confidence': 'very_low', 'method': 'direct_standard'}
4030206 Adrenal Cushing's syndrome text=1.000 sem=0.000 final=0.400
195212 Hypercortisolism text=0.185 sem=0.000 final=0.074


In [3]:
api = EntityMappingAPI()
info = api.debug_dump_sample_embedding(index="concept", size=1)
print(info)

✅ URL 방식으로 Elasticsearch 클라이언트 생성 성공
✅ Elasticsearch 연결 성공: http://3.35.110.161:9200
✅ ElasticsearchClient 초기화 완료 - Elasticsearch 클라이언트 사용 가능 (3.35.110.161:9200)
{'id': '45756805', 'has_embedding': True, 'embedding_len': 768, 'known_embedding_keys': ['concept_embedding', 'embedding', 'sapbert_embedding', 'embedding_vector', 'embedding_values', 'vector', 'values', 'data'], 'keys': ['concept_id', 'concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code', 'valid_start_date', 'valid_end_date', 'invalid_reason', 'concept_embedding']}


In [4]:
api.debug_compute_semantic_similarity("Cushing syndrome", concept_id="4030206", index="concept")

조회된 문서 키 수: 10, 일부: ['concept_id', 'concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code', 'valid_start_date', 'valid_end_date', 'invalid_reason']
해당 문서에서 임베딩을 찾지 못했습니다
