## Embeddings

In [1]:
from dotenv import load_dotenv

In [2]:
import json

from langchain.schema import Document
from langchain.vectorstores import FAISS
import json
from langchain.schema import Document
import pandas as pd
import numpy as np
import duckdb
from langchain_openai import OpenAIEmbeddings

In [3]:
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [4]:
medical_codes = []
with open("data/ciselniky/vykon.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        data = str(json.loads(line))
        medical_codes.append(data)

In [5]:
code_embeddings = embedding_model.embed_documents(medical_codes)

In [6]:
# Wrap medical codes in Documents
code_docs = [Document(page_content=code) for code in medical_codes]

# Create FAISS index
vector_store = FAISS.from_documents(code_docs, embedding_model)

vector_store.save_local("faiss_index")

In [7]:
# Params
zprava = """
MUDr. Čmejlová Vlastimila
C504 
Z511 
KOntrola: 

subj. mezi svátky prodělala chřipku a zápal plic, horečky měla, kašel velký. Nyní již trochu lepší stav.  Přála by si znovu nasadit X-gevu, kt. jsm,e vysadili pro osteonekrozu čelisti. 
obj. afebrilní, WHO 1

Kontrolní staging: 
6/19 , 7/20, 12/20, 4/21, 11/21  - MRI CNS bez nové patologie - kontroly na LGN
6/19, 11/19. 2/20, 5/20, 10/20, 5/21    PET/CT bez nových ložisek, regrese nálezu.

PET/CT VFN 4/22 : stabilizace onemocní, nová ložiska neprokázána, někde i hodnoceno jako parc. regrese. 

PET/CT VFN 12/2022 morfologická a metabol. progrese ve skeletu, ale žádná nová ložiska nejsou

Markery CA 15-3 akt. 152  ( v létě nad 200  )

dg: Ca mammae l.dx. cT2NXM0
Stp. biopsii tumoru 2006  Histologie: invazivní duktální ca G2
Stp. 6ti cyklech chemoterapie FAC 2006
Stp. parciální mastektomii s exenterací axily 2006
Histologie: ypT1cpN0 invazivní karcinom, ER a PR +
Stp. adjuvantní radioterapii na oblast prsu 2006
Hormonální terapie Tamoxifen, pro intoleranci Arimidex do 2012
Progrese onemocnění : elevace markerů + meta ve skeletu mnohoč., pleura a mediastin. LU, solit. meta v CNS 1/17
Stp. RT LGN na oblast meta v CNS 1/17
Stp. biopsii meta v oblasti kalvy 1/17
Histologie: osteolytická meta karcinomu ml. žlázy, ER 40%, PR 2%, ki67 20%. HER2 negativní
Stp. 4 cyklech 1.linie paliativní CHT paklitaxel + bevacizumab do 5/17
- dle PET/CT 5/17- 4/18 parciální remise onemocnění, normalizace markerů
Progrese ve skeletu a LU + elevace makerů 8/18 
Stp. 6 cyklech cisDDP mono do 12/18
Progrese ve skeletu mírně a elevace markerů 11/18
Paliativní CHT  XENA  do 8/21
Progrese ve skeletu dle scinti, elevace markerů 8/21
- patolog. fraktura fibuly vpravo, ošetřena ortopedem FNM
- Myocet + CFA in curzu
- 3/22 kardiální selhání s EF 40% v.s. následkem kardiotoxické CHT ( Myocet + CFA ) 
- PR dle PET/CT a normalizace markerů 4/22 
- stp. 1 cyklu Eribulinu - intolerance
- Stp. 1 cyklu CBDCA + gemcitabin - hematotoxicita, stp. 1 cyklu CBDCA mono - hematotoxicita
- HRT in curzu - inhibitory aromatázy
Progrese ve skeletu dle PETu ve 12/2022 

Dop. vzhledem ke stavu pacientky nadále anastrozol. Pokračujem X-geva - dnes aplikace, 3x vydáno domů. aplikuje sama. 

Pacient odchází z ambulance bez známek dechové a oběhové nedostatečnosti.
"""
# zprava = "ODBĚR KRVE Z ARTERIE"

k = 5

In [8]:
relevant_docs = vector_store.similarity_search(zprava, k=k)
docs = []
print(f"Zprava: {zprava}")
print(f"Matched Codes:")
for i in range(k):
    doc = relevant_docs[i].page_content
    docs.append(relevant_docs[i].page_content)
    print(doc)

Zprava: 
MUDr. Čmejlová Vlastimila
C504 
Z511 
KOntrola: 

subj. mezi svátky prodělala chřipku a zápal plic, horečky měla, kašel velký. Nyní již trochu lepší stav.  Přála by si znovu nasadit X-gevu, kt. jsm,e vysadili pro osteonekrozu čelisti. 
obj. afebrilní, WHO 1

Kontrolní staging: 
6/19 , 7/20, 12/20, 4/21, 11/21  - MRI CNS bez nové patologie - kontroly na LGN
6/19, 11/19. 2/20, 5/20, 10/20, 5/21    PET/CT bez nových ložisek, regrese nálezu.

PET/CT VFN 4/22 : stabilizace onemocní, nová ložiska neprokázána, někde i hodnoceno jako parc. regrese. 

PET/CT VFN 12/2022 morfologická a metabol. progrese ve skeletu, ale žádná nová ložiska nejsou

Markery CA 15-3 akt. 152  ( v létě nad 200  )

dg: Ca mammae l.dx. cT2NXM0
Stp. biopsii tumoru 2006  Histologie: invazivní duktální ca G2
Stp. 6ti cyklech chemoterapie FAC 2006
Stp. parciální mastektomii s exenterací axily 2006
Histologie: ypT1cpN0 invazivní karcinom, ER a PR +
Stp. adjuvantní radioterapii na oblast prsu 2006
Hormonální terapie 

In [9]:
import ast
docs_dict = [ast.literal_eval(item) for item in docs]
df_docs = pd.DataFrame(docs_dict)


## Occurences stats

In [None]:
from functools import lru_cache

vykony = pd.read_csv("data/vykazy/vyk_23_vykony_new.csv", encoding="windows-1252", sep=";")
@lru_cache(maxsize=None)
def get_df_normalized_co_occurrence() -> pd.DataFrame:
    vykony_pivot = pd.get_dummies(vykony.set_index('CDOKL')['KOD']).groupby('CDOKL').sum()
    co_occurrence_matrix = np.dot(vykony_pivot.T, vykony_pivot)
    np.fill_diagonal(co_occurrence_matrix, 0)
    co_occurrence_df = pd.DataFrame(co_occurrence_matrix, index=vykony_pivot.columns, columns=vykony_pivot.columns)
    df_normalized_co_occurrence = (co_occurrence_df - co_occurrence_df.min()) / (co_occurrence_df.max() - co_occurrence_df.min())
    df_normalized_co_occurrence.fillna(0, inplace=True)
    df_normalized_co_occurrence.reset_index(inplace=True)
    df_normalized_co_occurrence.rename(columns={'index': 'kod'}, inplace=True)
    return df_normalized_co_occurrence

In [12]:
df_normalized_co_occurrence = get_df_normalized_co_occurrence()

res = []
for code in df_docs["code"].tolist():
    df = df_normalized_co_occurrence[["kod", code]]
    df = df[df["kod"].isin([42022, 9543])]
    df = df[df[code] >= 0.6]  # some threshold
    df = df.sort_values(by=code, ascending=False)
    df = df.reset_index(drop=True)
    if len(df) > 0:
        res.extend(df["kod"].tolist())
        
list(set(res))

[42022, 9543]

In [13]:
selected_col = 9223
display(df_normalized_co_occurrence[["kod", selected_col]].sort_values(by=selected_col, ascending=False))

Unnamed: 0,kod,9223
48,42520,1.000000
31,9543,0.996774
43,42022,0.724698
30,9541,0.556817
15,9219,0.375433
...,...,...
56,63531,0.000000
57,63532,0.000000
58,76211,0.000000
59,76213,0.000000


In [14]:
df_normalized_co_occurrence

Unnamed: 0,kod,41,42,6134,9111,9113,9115,9117,9119,9125,...,99878,99879,99880,99881,99882,99883,99893,99895,99976,99991
0,41,0.000000,0.0,0.016949,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.001726,0.000000
1,42,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000189,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.000000,0.000000
2,6134,0.176471,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.137255,0.022222,0.010753,0.0,0.0,0.0,0.000,0.007249,0.000000
3,9111,0.000000,0.0,0.000000,0.0,0.0,0.041401,0.000627,0.000568,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.000000,0.000000
4,9113,0.000000,0.0,0.000000,0.0,0.0,0.003185,0.000000,0.000758,0.003196,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,99883,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.125,0.000000,0.000000
83,99893,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000,0.000000,0.000000
84,99895,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.1,0.0,0.000,0.000000,0.000000
85,99976,0.294118,0.0,0.118644,0.0,0.0,0.000000,0.000000,0.183368,0.000000,...,0.0,0.039216,0.033333,0.107527,0.0,0.0,0.0,0.000,0.000000,0.001513


In [15]:
# Convert to DataFrame for easier visualization
import plotly.express as px

fig = px.imshow(df_normalized_co_occurrence, 
                labels=dict(x="Code 1", y="Code 2", color="Co-occurrence Count"),
                title="Code Co-occurrence Matrix",
                color_continuous_scale="YlGnBu")

# Show the plot
fig.show()


In [127]:
df_codes_relationships = duckdb.sql(
    """
    with occurrences as (
        select 
            v1.cdokl,
            v1.kod as kod_1,
            v2.kod as kod_2,
            least(kod_1) as kod1,
            greatest(kod_2) as kod2,
        from vykony v1
            full join vykony v2 using (cdokl)
        where 
            true
            -- and cdokl in (98806, 271592)
            and kod_1 > kod_2
        order by cdokl, kod_1
    )
    select 
        kod1::text as kod1,
        kod2::text as kod2,
        count(*) as occurences_in_cdokl_cnt
    from occurrences
    group by 
        kod1,
        kod2
    order by kod1
    """
).df()
df_codes_relationships

Unnamed: 0,kod1,kod2,occurences_in_cdokl_cnt
0,22114,9555,1
1,22114,9541,1
2,22114,9215,1
3,22114,9219,1
4,22114,9115,1
...,...,...,...
834,99991,99842,2
835,99991,9561,585
836,99991,99880,1
837,99991,99874,8


In [132]:
heatmap_data = df_codes_relationships.pivot(index='kod1', columns='kod2', values='occurences_in_cdokl_cnt')

# Take log of values (base 10)
log_data = np.log10(heatmap_data)

# Plot heatmap
fig = px.imshow(log_data,
                text_auto=True,
                color_continuous_scale='Viridis',
                labels={'x': 'Kod 1', 'y': 'Kod 2', 'color': 'log₁₀(Počet výskytů ve výkonech)'},
                title='Vztah kódů ve výkonech')
fig.update_traces(text=heatmap_data.values.astype(str))  # optional: format nicely with .2f
fig.update_layout(width=1200, height=900)  # or any size you want
fig.show()

In [130]:
import plotly.graph_objects as go


# Pivot for heatmap
matrix = df_codes_relationships.pivot(index='kod2', columns='kod1', values='occurences_in_cdokl_cnt')
log_matrix = np.log10(matrix)

# Custom hover text showing original values
hover_text = [[f"Kód 1: {k1}<br>Kód 2: {k2}<br>Počet výskytů ve výkonech: {matrix.loc[k2, k1]}"
               for k1 in matrix.columns] for k2 in matrix.index]

# Plot heatmap
fig = go.Figure(data=go.Heatmap(
    z=log_matrix.values,
    x=matrix.columns,
    y=matrix.index,
    text=hover_text,
    hoverinfo='text',
    colorscale='Viridis',
    colorbar=dict(title='log₁₀(Počet výskytů ve výkonech)')
))

fig.update_layout(
    title='Vztah kódů ve výkonech',
    xaxis_title='kod1',
    yaxis_title='kod2',
    width=800,
    height=600
)

fig.show()
