## Embeddings


In [1]:
from dotenv import load_dotenv

In [4]:
import json

from langchain.schema import Document
from langchain.vectorstores import FAISS
import json
from langchain.schema import Document
import pandas as pd
import numpy as np
import duckdb
from langchain_openai import OpenAIEmbeddings

In [5]:
# Load variables from a local .env file (if one exists) into the process
# environment before the client is built. `load_dotenv` is imported at the top
# of the notebook but was never called in the visible cells, so on a fresh
# kernel the API key would only be available if it was exported in the shell.
# By default load_dotenv() does not override variables that are already set.
load_dotenv()

# Embedding client used to embed the code list and, through the vector store,
# the query text.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [6]:
# Read the procedure-code list: a JSON Lines file with one JSON object per line.
# Each record is stored as the string form of the parsed dict (not as raw JSON),
# because a later cell parses these strings back with ast.literal_eval.
medical_codes = []
with open("data/ciselniky/vykon.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (e.g. an extra newline at the end of the file) are not
        # valid JSON and would make json.loads raise; skip them.
        if not line.strip():
            continue
        medical_codes.append(str(json.loads(line)))

In [7]:
# Embed every code string in medical_codes with the embedding client.
# Presumably returns one vector per input text, in input order — confirm
# against the embed_documents documentation.
code_embeddings = embedding_model.embed_documents(medical_codes)

In [8]:
# Wrap medical codes in Documents (kept so any cell that references code_docs
# still works; the index below is built from the raw texts and their vectors).
code_docs = [Document(page_content=code) for code in medical_codes]

# Create the FAISS index from the vectors already computed in code_embeddings.
# FAISS.from_documents would embed every text again, i.e. a second full round
# of embedding API calls for the same data.
if len(code_embeddings) != len(medical_codes):
    # zip() would silently truncate; fail loudly if the two lists are out of
    # sync (e.g. medical_codes was reloaded without re-running the embedding cell).
    raise ValueError(
        f"code_embeddings has {len(code_embeddings)} vectors but "
        f"medical_codes has {len(medical_codes)} texts; re-run the embedding cell"
    )
vector_store = FAISS.from_embeddings(
    text_embeddings=list(zip(medical_codes, code_embeddings)),
    embedding=embedding_model,  # still needed to embed the query at search time
)

In [9]:
# Query parameters: `zprava` is the free-text report to match against the code
# list, `k` is the number of nearest codes to retrieve.
# NOTE(review): the report text contains a physician name and clinical details —
# confirm it is synthetic or anonymized before sharing this notebook.
zprava = """
MUDr. Kryštofová Dominika
C163 
Pacientka se dostavila na 2.lůžkovou stanici k podání transfuzí - na Hb 80g/l. V čase od 11.25 do 13.00 podány 2 TU ERY, krevní skupiny AB Rh + - bez komplikací. 
Pacientka odchází bez známek dechové či oběhové nedostatečnosti.
"""
# Alternative short query (an exact code name) for a quick retrieval check:
# zprava = "ODBĚR KRVE Z ARTERIE"

k = 5

In [10]:
# Retrieve the codes whose embeddings are closest to the report text.
relevant_docs = vector_store.similarity_search(zprava, k=k)

# Iterate over what was actually returned instead of range(k): indexing
# relevant_docs[i] raises IndexError if the search returns fewer than k hits.
docs = [match.page_content for match in relevant_docs]

print(f"Zprava: {zprava}")
print("Matched Codes:")
for doc in docs:
    print(doc)

Zprava: 
MUDr. Kryštofová Dominika
C163 
Pacientka se dostavila na 2.lůžkovou stanici k podání transfuzí - na Hb 80g/l. V čase od 11.25 do 13.00 podány 2 TU ERY, krevní skupiny AB Rh + - bez komplikací. 
Pacientka odchází bez známek dechové či oběhové nedostatečnosti.

Matched Codes:
{'code': 9113, 'name': 'ODBĚR KRVE Z ARTERIE', 'description': 'U pacientů s respirační insuficiencí před indikací DDOT lze vykázat 6/1 den.', 'odbornost': '999'}
{'code': 9119, 'name': 'ODBĚR KRVE ZE ŽÍLY U DOSPĚLÉHO NEBO DÍTĚTE NAD 10 LET', 'description': 'Jde o odběr krve ze žíly vpichem. Krev slouží k diagnostickým vyšetřením v laboratoři. Jen vyjímečně může jít o výkon léčebný. V případě kombinace s výkonem č. 81443 (oGTT) lze vykázat 3/1 den.', 'odbornost': '999'}
{'code': 22365, 'name': 'ODBĚR PERIFERNÍCH KMENOVÝCH BUŇEK', 'description': 'Odběr periferních kmenových hemopoetických buněk (pro jejich transplantaci) technikou průtokové separace na separátoru krevních elementů.', 'odbornost': '222'}
{'co

In [11]:
import ast

# Each entry of `docs` is the string form of a dict (see the printed matches);
# parse every entry back into a dict and tabulate them, one row per match.
docs_dict = list(map(ast.literal_eval, docs))
df_docs = pd.DataFrame(data=docs_dict)


In [12]:
# Load the reported procedures from a semicolon-separated CSV. The cells below
# use its CDOKL and KOD columns.
# NOTE(review): the file is decoded as windows-1252; Czech text is more commonly
# windows-1250, so text columns may have garbled diacritics — verify.
vykony = pd.read_csv("data/vykazy/vyk_23_vykony_new.csv", encoding="windows-1252", sep=";")

## Co-occurrence stats

In [42]:
# One row per CDOKL value, one column per KOD value; each cell counts how many
# rows of `vykony` carry that KOD for that CDOKL.
kod_indicators = pd.get_dummies(vykony.set_index('CDOKL')['KOD'])
vykony_pivot = kod_indicators.groupby('CDOKL').sum()

# (codes x CDOKL) @ (CDOKL x codes) -> codes x codes matrix of summed count
# products. The diagonal (a code paired with itself) is zeroed out.
counts = vykony_pivot.to_numpy()
co_occurrence_matrix = counts.T @ counts
np.fill_diagonal(co_occurrence_matrix, 0)

# Label both axes with the codes so the matrix can be looked up by code.
code_labels = vykony_pivot.columns
co_occurrence_df = pd.DataFrame(
    co_occurrence_matrix,
    index=code_labels,
    columns=code_labels,
)

In [None]:
# Min-max scale each column of the co-occurrence counts to [0, 1]. A column whose
# max equals its min divides by zero and yields NaN; those cells become 0.
# The frame is built in a single chain instead of being mutated with inplace=True.
column_min = co_occurrence_df.min()
column_span = co_occurrence_df.max() - column_min
co_occurrence_df_normalized = (
    ((co_occurrence_df - column_min) / column_span)
    .fillna(0)
    .reset_index()                      # move the code labels into a column ...
    .rename(columns={'index': 'kod'})   # ... and name that column 'kod'
)

# Only these codes are considered as co-occurring candidates, and only when their
# normalized score with a matched code reaches the threshold.
CANDIDATE_KODS = [42022, 9543]
MIN_NORMALIZED_SCORE = 0.6  # some threshold

dfs = []
for code in df_docs["code"].tolist():
    # Matched codes come from the full code list, but the co-occurrence frame only
    # has columns for codes present in `vykony`; selecting a missing column would
    # raise KeyError, so skip codes with no co-occurrence data.
    if code not in co_occurrence_df_normalized.columns:
        continue
    df = co_occurrence_df_normalized[["kod", code]]
    df = df[df["kod"].isin(CANDIDATE_KODS)]
    df = df[df[code] >= MIN_NORMALIZED_SCORE]
    df = df.sort_values(by=code, ascending=False)
    df = df.reset_index(drop=True)
    if len(df) > 0:
        dfs.append(df)
        display(df)

Unnamed: 0,kod,9113
0,9543,0.615385


Unnamed: 0,kod,9119
0,42022,1.0
1,9543,0.691608


Unnamed: 0,kod,9117
0,42022,1.0


Unnamed: 0,kod,51811
0,9543,1.0
1,42022,1.0


In [51]:
# Rank every code by its normalized co-occurrence score with one chosen code.
selected_col = 9223
ranking = co_occurrence_df_normalized.loc[:, ["kod", selected_col]]
display(ranking.sort_values(by=selected_col, ascending=False))

Unnamed: 0,kod,9223
48,42520,1.000000
31,9543,0.996774
43,42022,0.724698
30,9541,0.556817
15,9219,0.375433
...,...,...
56,63531,0.000000
57,63532,0.000000
58,76211,0.000000
59,76213,0.000000


In [52]:
# Show the normalized co-occurrence table as the cell's output (the rendered
# view is truncated with "..." for the hidden rows and columns).
co_occurrence_df_normalized

Unnamed: 0,kod,41,42,6134,9111,9113,9115,9117,9119,9125,...,99878,99879,99880,99881,99882,99883,99893,99895,99976,99991
0,41,0.000000,0.0,0.016949,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.001726,0.000000
1,42,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000189,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.000000,0.000000
2,6134,0.176471,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.137255,0.022222,0.010753,0.0,0.0,0.0,0.000,0.007249,0.000000
3,9111,0.000000,0.0,0.000000,0.0,0.0,0.041401,0.000627,0.000568,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.000000,0.000000
4,9113,0.000000,0.0,0.000000,0.0,0.0,0.003185,0.000000,0.000758,0.003196,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,99883,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.125,0.000000,0.000000
83,99893,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.000000,0.000000
84,99895,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.1,0.0,0.000,0.000000,0.000000
85,99976,0.294118,0.0,0.118644,0.0,0.0,0.000000,0.000000,0.183368,0.000000,...,0.0,0.039216,0.033333,0.107527,0.0,0.0,0.0,0.000,0.000000,0.001513
