In [16]:

import json
import pickle
import os
import pandas as pd
from unidecode import unidecode

from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, SystemMessage

from langchain_huggingface import HuggingFaceEmbeddings

from line_profiler import profile

from langchain_core.vectorstores import InMemoryVectorStore


from ipywidgets import IntProgress
from IPython.display import display
import time



from langchain_openai import OpenAIEmbeddings


In [17]:
zp_typ_codes = [
    "01",
    "02", # ZP pro inkontinentní pacienty"
    "02", # ZP pro inkontinentní pacienty
    "03", # ZP pro pacienty se stomií
    "04", # ZP ortopedicko-protetické a ortopedická obuv
    "05", # ZP pro pacienty s diabetem a s jinými poruchami metabolismu
    "07",  # ZP pro pacienty s poruchou mobility
    "08", #Příslušenství ke sluchadlům na zakázku
    "08", # ZP pro pacienty s poruchou sluchu,
    "09", # ZP pro korekci zraku a ZP kompenzační pro zrakově postižené na zakázku
    "09", # ZP pro pacienty s poruchou zraku
    "10", # ZP respirační, inhalační a pro aplikaci enterální výživy
    "11" # ZP nekategorizované

]

zp_typ_codes = list(map(lambda x: float(x), zp_typ_codes))

In [18]:
# EMBEDDINGS = "huggingface-retromae-small-cs" # or "openai-embeddings"
EMBEDDINGS = "huggingface"
HUGGINGFACE_MODEL = "sentence-transformers/all-mpnet-base-v2"


In [19]:
if EMBEDDINGS == "huggingface":
    embeddings = HuggingFaceEmbeddings(model_name=HUGGINGFACE_MODEL)
elif EMBEDDINGS == "openai-":
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [20]:
vectorstore_path = os.path.join("data", EMBEDDINGS)
if not os.path.exists(vectorstore_path):
    os.makedirs(vectorstore_path)

In [21]:
from json import JSONDecodeError

try:
    vector_store = InMemoryVectorStore.load(f"data/{EMBEDDINGS}/vectorstore", embedding=embeddings)
except BaseException as e:
    vector_store = InMemoryVectorStore(embedding=embeddings)
    

In [22]:
items = vector_store.store
ids = list(items.keys())


In [23]:
# calculate running average
def running_average(new_value, current_average, count):
    if count == 0:
        return new_value
    return (current_average * count + new_value) / (count + 1)

In [24]:
# INIT vectorstorre


materialy_df = pd.read_json("data/ciselniky/materialy.jsonl", orient="recods", lines=True, encoding="utf-8")
# feed vectorstore with materials

# select only relevant records = 
# 
relevant_records = materialy_df[(materialy_df["file_code"] == "ZP") & (materialy_df["type"].isin(zp_typ_codes)) & (~materialy_df["code"].astype(str).isin(ids))]
relevant_records.reset_index(drop=True, inplace=True)



count = 0

avergage_duration = 0.0

relevant_records = relevant_records
max_count = len(relevant_records)


def index_row(row):
    global count
    if str(row["code"]) in ids:
        return
        
    # TODO: ADD TOKENIZER TO REMOVE SPECIAL CHARACTES řěšýň...
    # Combine 'name' and 'description' attributes for page_content
    PAGE_CONTENT = f"{row.get('name', '')} {row.get('description', '')}".strip()
    
    row["record"] = "material"
    
    # Store all attributes in metadata
    doc = Document(page_content=PAGE_CONTENT, metadata=row.to_dict())
    try:
        start = time.time()
        vector_store.add_documents([doc], ids=[row["code"]])
        end = time.time()

        duration = end - start
        global avergage_duration
        avergage_duration = running_average(duration, avergage_duration, count)

        print("left time: {:.2f} seconds".format(avergage_duration * (max_count - count)))

        # print(f"Indexed code {row['code']} in {duration:.2f} seconds")



        count += 1
        # while count <= max_count:
            # f.value += 1 # signal to increment the progress bar
            # time.sleep(.1)
    except BaseException as e:
        
        print(e)
        return




print("indexing {} records".format(max_count))
relevant_records.apply(index_row, axis=1)
vector_store.dump(os.path.join(vectorstore_path, "vectorstore"))







indexing 2566 records
left time: 761.17 seconds
left time: 405.79 seconds
left time: 280.00 seconds
left time: 235.93 seconds
left time: 195.35 seconds
left time: 177.60 seconds
left time: 159.52 seconds
left time: 149.06 seconds
left time: 142.82 seconds
left time: 137.55 seconds
left time: 128.96 seconds
left time: 124.86 seconds
left time: 119.88 seconds
left time: 118.48 seconds
left time: 117.25 seconds
left time: 114.49 seconds
left time: 111.42 seconds
left time: 110.17 seconds
left time: 106.35 seconds
left time: 103.19 seconds
left time: 102.17 seconds
left time: 100.90 seconds
left time: 99.86 seconds
left time: 97.43 seconds
left time: 96.99 seconds
left time: 96.46 seconds
left time: 94.82 seconds
left time: 93.11 seconds
left time: 91.95 seconds
left time: 90.66 seconds
left time: 89.29 seconds
left time: 88.35 seconds
left time: 88.63 seconds
left time: 86.86 seconds
left time: 85.28 seconds
left time: 85.24 seconds
left time: 84.04 seconds
left time: 83.95 seconds
left t

In [30]:
from agent import utils


vector_store.similarity_search("Omnifix", k=5, filter=utils.vector_material_filter_funct)

[Document(id='5004850', metadata={'code': 5004850, 'name': 'OMNITEST 5', 'description': 'BÍLÝ', 'type': 5.0, 'limit': 'DIA, PRL, INT', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='OMNITEST 5 BÍLÝ'),
 Document(id='5009933', metadata={'code': 5009933, 'name': 'FIXACE HYPOALERGENNÍ OMNIFIX ELASTIC', 'description': '10CMX2M,1KS', 'type': 1.0, 'limit': 'lékař, SDP, SLO', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='FIXACE HYPOALERGENNÍ OMNIFIX ELASTIC 10CMX2M,1KS'),
 Document(id='5009956', metadata={'code': 5009956, 'name': 'FIXACE HYPOALERGENNÍ OMNIFIX ELASTIC', 'description': '10CMX10M,1KS', 'type': 1.0, 'limit': 'lékař, SDP, SLO', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='FIXACE HYPOALERGENNÍ OMNIFIX ELASTIC 10CMX10M,1KS'),
 Document(id='5009934', metadata={'code': 5009934, 'name': 'FIXACE HYPOALERGENNÍ OMNIFIX ELASTIC', 'description': '5CMX10M,1KS'

In [None]:
for _, row in relevant_records.iterrows():
    
    # print(item)
    # for every 100th record, print a message
    if int(_) % 100 == 0:
        print(f"processing record {_} / {max_count}") 
    # implement logic to feed the vectorstore
    # feed only ZP for now 
    
    # for every 1000th record, print a message
    if int(_) % 100 == 0:
        vector_store.dump(vectorstore_path)

    if str(row["code"]) in ids:
        f.value += 1 # signal to increment the progress bar
        
        time.sleep(.1)
        count += 1
        print(f"skipping code {row["code"]}")
        continue
        
    # TODO: ADD TOKENIZER TO REMOVE SPECIAL CHARACTES řěšýň...
    # Combine 'name' and 'description' attributes for page_content
    PAGE_CONTENT = f"{row.get('name', '')} {row.get('description', '')}".strip()
    
    row["record"] = "material"
    
    # Store all attributes in metadata
    doc = Document(page_content=PAGE_CONTENT, metadata=row.to_dict())
    try:
        vector_store.add_documents([doc], ids=[row["code"]])
        while count <= max_count:
            f.value += 1 # signal to increment the progress bar
            time.sleep(.1)
            count += 1
    except BaseException as e:
        
        print(e)
        continue
    

NameError: name 'relevant_records' is not defined

In [None]:
vector_store.store

In [113]:

vykony_df = pd.read_json("data/ciselniky/vykon.jsonl", orient="recods", lines=True, encoding="utf-8")

    
for _, row in vykony_df.iterrows():
            
    
    # Combine 'NAZ' and 'TXT' attributes for page_content
    PAGE_CONTENT = f"{row.get('NAZ', '')} {row.get('TEXT', '')}".strip()
    
    row["record"] = "vykon"
    # Store all attributes in metadata
    doc = Document(page_content=PAGE_CONTENT, metadata=row.to_dict())
    
    vector_store.add_documents([doc], ids=[row["KOD"]])

In [8]:
def filter_funct(doc): 
    return doc.metadata.get("record") == "material"

In [156]:
from langchain_openai import OpenAI


detection_model = OpenAI()

In [None]:
detection_model.invoke(SystemMessage(content="ve ambulantni zpravy najdi materialy, leky, vykony a zdravodnicky pomucky. use explicit text"),
        HumanMessage(content=state["text"]))

In [15]:

vector_store.similarity_search_with_score("MoliCare Premium)", k=20, 
    filter=filter_funct
)

[(Document(id='5003680', metadata={'code': 5003680, 'name': 'ABRI FLEX PREMIUM M2', 'description': 'KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-110CM, 1900ML, 14KS', 'type': 2.0, 'limit': 'GER, GYN, CHI, NEF, NEU, PED, PRL, SDP, SLO, URN', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='ABRI FLEX PREMIUM M2 KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-110CM, 1900ML, 14KS'),
  0.8146972069951728),
 (Document(id='5003691', metadata={'code': 5003691, 'name': 'ABRI FLEX PREMIUM M3', 'description': 'KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-110CM, 2200ML, 14KS', 'type': 2.0, 'limit': 'GER, GYN, CHI, NEF, NEU, PED, PRL, SDP, SLO, URN', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='ABRI FLEX PREMIUM M3 KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-110CM, 2200ML, 14KS'),
  0.8145818518038517),
 (Document(id='5003671', metadata={'code': 5003671, 'name': 'ABRI FLEX PREMIUM M1', 'description': 'KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-11