## Embeddings

In [None]:
from dotenv import load_dotenv

In [None]:
import json

from langchain.schema import Document
from langchain.vectorstores import FAISS
import json
from langchain.schema import Document
import pandas as pd
import numpy as np
import duckdb
from langchain_openai import OpenAIEmbeddings

In [None]:
# Load variables from a local .env file (if one exists) before the client is
# built. `load_dotenv` is imported at the top of the notebook but was never
# called, so the file was silently ignored. Variables that are already set in
# the process environment are left untouched by default.
# NOTE(review): presumably this is where the OpenAI API key lives — confirm.
load_dotenv()

# Embedding client used for both indexing the code list and embedding queries.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
# Load the code list: one JSON object per line of the JSONL file.
# Each record is stored as the Python repr of the parsed dict, because that
# exact text is what gets embedded and what is later parsed back with
# `ast.literal_eval`.
with open("data/ciselniky/vykon.jsonl", "r", encoding="utf-8") as f:
    medical_codes = [
        str(json.loads(line))
        for line in f
        # Skip blank lines (e.g. stray empty lines at the end of the file)
        # instead of failing with JSONDecodeError.
        if line.strip()
    ]

In [None]:
# Embed every code record from `medical_codes` with the shared embedding model.
code_embeddings = embedding_model.embed_documents(medical_codes)

In [None]:
# Wrap medical codes in Documents
code_docs = [Document(page_content=code) for code in medical_codes]

# Guard against stale notebook state: the vectors in `code_embeddings` must
# line up one-to-one with `medical_codes`, otherwise zip() below would silently
# pair texts with the wrong vectors or drop records.
if len(code_embeddings) != len(medical_codes):
    raise ValueError(
        f"Expected one embedding per code, got {len(code_embeddings)} embeddings "
        f"for {len(medical_codes)} codes; re-run the embedding cell."
    )

# Create FAISS index from the vectors that were already computed into
# `code_embeddings`. `FAISS.from_documents` would send the whole code list to
# the embedding API a second time; `embedding_model` is still passed so the
# store can embed queries at search time.
vector_store = FAISS.from_embeddings(
    text_embeddings=list(zip(medical_codes, code_embeddings)),
    embedding=embedding_model,
)

# Persist the index under the local "faiss_index" path.
vector_store.save_local("faiss_index")

In [None]:
# Params
# Free-text query that is matched against the indexed code records.
zprava = "ODBĚR KRVE Z ARTERIE"

# Number of nearest code records to request from the vector store.
k = 5

In [None]:
# Ask the vector store for the (up to) k code records closest to the query.
relevant_docs = vector_store.similarity_search(zprava, k=k)

print(f"Zprava: {zprava}")
print("Matched Codes:")

# Iterate over what was actually returned rather than over range(k): when the
# search yields fewer than k hits, positional indexing would raise IndexError.
docs = [match.page_content for match in relevant_docs]
for doc in docs:
    print(doc)

In [None]:
import ast

# Every entry of `docs` is expected to be the text of a Python literal (a dict);
# parse each one back into an object without evaluating arbitrary code, then
# tabulate the parsed records with one row per matched document.
docs_dict = list(map(ast.literal_eval, docs))
df_docs = pd.DataFrame(data=docs_dict)


## Occurrences stats

In [None]:
from functools import lru_cache

# Semicolon-separated report rows; the CDOKL and KOD columns are the ones the
# co-occurrence computation below relies on.
# NOTE(review): the file is decoded as windows-1252 — confirm this matches the
# export's real encoding.
vykony = pd.read_csv(
    "data/vykazy/vyk_23_vykony_new.csv",
    sep=";",
    encoding="windows-1252",
)
@lru_cache(maxsize=None)
def get_df_normalized_co_occurrence() -> pd.DataFrame:
    """Build the column-wise min-max normalised code co-occurrence table.

    Reads the module-level ``vykony`` frame (columns ``CDOKL`` and ``KOD``),
    counts per ``CDOKL`` how often each ``KOD`` value appears, and multiplies
    that count matrix with its own transpose to obtain a code-by-code
    co-occurrence matrix. The diagonal is zeroed before scaling.

    Returns:
        A DataFrame whose first column ``kod`` holds the code of each row,
        followed by one column per code. Every code column is min-max scaled
        on its own; cells that come out as NaN (a column whose maximum equals
        its minimum) are set to 0.

    The result is cached: the computation runs once and later calls return the
    very same DataFrame object, so callers should not modify it in place.
    """
    # One indicator column per code, summed per CDOKL -> count matrix.
    indicators = pd.get_dummies(vykony.set_index('CDOKL')['KOD'])
    per_document = indicators.groupby('CDOKL').sum()
    codes = per_document.columns

    # (codes x documents) . (documents x codes) -> codes x codes.
    counts = np.dot(per_document.T, per_document)
    np.fill_diagonal(counts, 0)  # a code paired with itself is not of interest
    counts_df = pd.DataFrame(counts, index=codes, columns=codes)

    # Min-max scale each column independently.
    col_min = counts_df.min()
    col_range = counts_df.max() - col_min
    scaled = (counts_df - col_min) / col_range

    # Expose the row labels as a regular "kod" column.
    return scaled.fillna(0).reset_index().rename(columns={'index': 'kod'})

In [None]:
df_normalized_co_occurrence = get_df_normalized_co_occurrence()

# Codes that may be suggested, and the minimum normalised co-occurrence score a
# candidate needs with a matched code in order to be kept.
candidate_codes = [42022, 9543]
min_score = 0.6  # some threshold

# The candidate row filter does not depend on the matched code, so apply it once.
candidates = df_normalized_co_occurrence[
    df_normalized_co_occurrence["kod"].isin(candidate_codes)
]

res = []
missing_codes = []
for code in df_docs["code"].tolist():
    # A matched code may have no column in the co-occurrence table; selecting
    # it would raise KeyError, so record it and move on.
    if code not in df_normalized_co_occurrence.columns:
        missing_codes.append(code)
        continue
    scores = candidates[["kod", code]]
    scores = scores[scores[code] >= min_score].sort_values(by=code, ascending=False)
    res.extend(scores["kod"].tolist())

# Report skipped codes so that a mismatch (e.g. differing code types between the
# two sources) does not go unnoticed.
if missing_codes:
    print(f"Codes without co-occurrence data (skipped): {missing_codes}")

# Unique candidate codes that passed the threshold for at least one matched code.
list(set(res))

In [None]:
# Rank all codes by their normalised co-occurrence with one selected code,
# strongest first, and show the "kod" column next to that score.
selected_col = 9223
ranked = df_normalized_co_occurrence[["kod", selected_col]].sort_values(
    by=selected_col,
    ascending=False,
)
display(ranked)

In [None]:
# Show the whole normalised co-occurrence table (a "kod" column plus one column per code).
df_normalized_co_occurrence

In [None]:
import plotly.express as px

# The table carries the code identifiers in a regular "kod" column. Move them
# into the index before plotting: otherwise imshow treats "kod" as one more data
# column (the raw code numbers swamp the 0-1 colour scale) and labels the rows
# by position instead of by code. set_index returns a new frame, so the cached
# table itself is left unchanged.
co_occurrence_matrix_df = df_normalized_co_occurrence.set_index("kod")

# Columns go to the x axis and index values to the y axis. The colour label says
# "normalized" because the plotted values are min-max scaled, not raw counts.
fig = px.imshow(co_occurrence_matrix_df,
                labels=dict(x="Code 1", y="Code 2", color="Normalized co-occurrence"),
                title="Code Co-occurrence Matrix",
                color_continuous_scale="YlGnBu")

# Show the plot
fig.show()


In [None]:
# Count, for every pair of codes, how many joined row pairs share the same
# cdokl, and return the result as a pandas DataFrame with columns
# kod1, kod2 (both cast to text) and occurences_in_cdokl_cnt.
#
# How the query works:
# - `vykony` in the FROM clause refers to the DataFrame variable of that name
#   (resolved by DuckDB from the Python namespace).
# - The self-join on cdokl pairs each row with the rows of the same cdokl.
# - `kod_1 > kod_2` keeps a single ordering of each pair and drops self-pairs;
#   rows with a NULL on either side of the full join fail this comparison too.
# - NOTE(review): `least(kod_1)` / `greatest(kod_2)` receive a single argument,
#   so they simply pass the value through; together with the filter above,
#   kod1 is always the larger code of the pair. Confirm this is intended.
# - NOTE(review): count(*) counts joined rows, so a code listed several times
#   under one cdokl contributes several times — confirm that is the wanted metric.
df_codes_relationships = duckdb.sql(
    """
    with occurrences as (
        select 
            v1.cdokl,
            v1.kod as kod_1,
            v2.kod as kod_2,
            least(kod_1) as kod1,
            greatest(kod_2) as kod2,
        from vykony v1
            full join vykony v2 using (cdokl)
        where 
            true
            -- and cdokl in (98806, 271592)
            and kod_1 > kod_2
        order by cdokl, kod_1
    )
    select 
        kod1::text as kod1,
        kod2::text as kod2,
        count(*) as occurences_in_cdokl_cnt
    from occurrences
    group by 
        kod1,
        kod2
    order by kod1
    """
).df()
# Display the pair counts.
df_codes_relationships

In [None]:
# Pivot the pair counts into a matrix: rows = kod1, columns = kod2.
# Pairs that never occur are NaN.
heatmap_data = df_codes_relationships.pivot(index='kod1', columns='kod2', values='occurences_in_cdokl_cnt')

# Take log of values (base 10)
log_data = np.log10(heatmap_data)

# Cell annotations: the raw counts as integers, blank where the pair never occurs
# (the NaN cells would otherwise be rendered as the text "nan").
count_labels = (
    heatmap_data.fillna(0).astype(int).astype(str).where(heatmap_data.notna(), "")
)

# Plot heatmap
# - imshow puts the columns (kod2) on the x axis and the index (kod1) on the
#   y axis, so the axis labels follow that orientation.
# - text_auto is not used: it makes the cells print z (the log values), which
#   would hide the raw counts supplied through `text` below.
fig = px.imshow(log_data,
                color_continuous_scale='Viridis',
                labels={'x': 'Kod 2', 'y': 'Kod 1', 'color': 'log₁₀(Počet výskytů ve výkonech)'},
                title='Vztah kódů ve výkonech')
# Colour encodes the log-scaled count, while the cell text shows the raw count.
fig.update_traces(text=count_labels.to_numpy(), texttemplate="%{text}")
fig.update_layout(width=1200, height=900)  # or any size you want
fig.show()

In [None]:
import plotly.graph_objects as go


# Pair counts as a matrix: rows = kod2, columns = kod1 (NaN where a pair is absent).
matrix = df_codes_relationships.pivot(index='kod2', columns='kod1', values='occurences_in_cdokl_cnt')
log_matrix = np.log10(matrix)

# Hover labels carry the original (non-log) value of every cell. Walk the
# underlying array row by row instead of looking each cell up by label.
hover_text = []
for k2, row_values in zip(matrix.index, matrix.to_numpy()):
    hover_text.append([
        f"Kód 1: {k1}<br>Kód 2: {k2}<br>Počet výskytů ve výkonech: {count}"
        for k1, count in zip(matrix.columns, row_values)
    ])

# Colour shows the log-scaled count; the tooltip shows only the custom text.
heatmap_trace = go.Heatmap(
    z=log_matrix.values,
    x=matrix.columns,
    y=matrix.index,
    text=hover_text,
    hoverinfo='text',
    colorscale='Viridis',
    colorbar=dict(title='log₁₀(Počet výskytů ve výkonech)'),
)
fig = go.Figure(data=heatmap_trace)

layout_options = dict(
    title='Vztah kódů ve výkonech',
    xaxis_title='Kód 1',
    yaxis_title='Kód 2',
    width=800,
    height=600,
)
fig.update_layout(**layout_options)

fig.show()
