In [1]:

import json
import pickle
import os
import pandas as pd
from unidecode import unidecode

from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, SystemMessage

from langchain_huggingface import HuggingFaceEmbeddings

from line_profiler import profile

from langchain_core.vectorstores import InMemoryVectorStore


from ipywidgets import IntProgress
from IPython.display import display
import time



from langchain_openai import OpenAIEmbeddings


In [2]:
zp_typ_codes = [
    "01",
    "02", # ZP pro inkontinentní pacienty"
    "02", # ZP pro inkontinentní pacienty
    "03", # ZP pro pacienty se stomií
    "04", # ZP ortopedicko-protetické a ortopedická obuv
    "05", # ZP pro pacienty s diabetem a s jinými poruchami metabolismu
    "07",  # ZP pro pacienty s poruchou mobility
    "08", #Příslušenství ke sluchadlům na zakázku
    "08", # ZP pro pacienty s poruchou sluchu,
    "09", # ZP pro korekci zraku a ZP kompenzační pro zrakově postižené na zakázku
    "09", # ZP pro pacienty s poruchou zraku
    "10", # ZP respirační, inhalační a pro aplikaci enterální výživy
    "11" # ZP nekategorizované

]

zp_typ_codes = list(map(lambda x: float(x), zp_typ_codes))

In [3]:
EMBEDDINGS = "openai-embeddings"
# EMBEDDINGS = "huggingface"
HUGGINGFACE_MODEL = "sentence-transformers/all-mpnet-base-v2"


In [4]:
if EMBEDDINGS == "huggingface":
    embeddings = HuggingFaceEmbeddings(model_name=HUGGINGFACE_MODEL)
elif EMBEDDINGS == "openai-embeddings":
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [5]:
vectorstore_path = os.path.join("data", EMBEDDINGS)
if not os.path.exists(vectorstore_path):
    os.makedirs(vectorstore_path)

In [6]:
from json import JSONDecodeError

try:
    vector_store = InMemoryVectorStore.load(f"data/{EMBEDDINGS}/vectorstore", embedding=embeddings)
except BaseException as e:
    vector_store = InMemoryVectorStore(embedding=embeddings)
    

In [7]:
items = vector_store.store
ids = list(items.keys())


In [8]:
# calculate running average
def running_average(new_value, current_average, count):
    if count == 0:
        return new_value
    return (current_average * count + new_value) / (count + 1)

In [9]:
# INIT vectorstorre


materialy_df = pd.read_json("data/ciselniky/materialy.jsonl", orient="recods", lines=True, encoding="utf-8")
# feed vectorstore with materials

# select only relevant records = 
# 
relevant_records = materialy_df[(materialy_df["file_code"] == "ZP") & (materialy_df["type"].isin(zp_typ_codes)) & (~materialy_df["code"].astype(str).isin(ids))]
relevant_records.reset_index(drop=True, inplace=True)



count = 0

avergage_duration = 0.0

relevant_records = relevant_records
max_count = len(relevant_records)


def index_row(row):
    global count
    if str(row["code"]) in ids:
        return
        
    # TODO: ADD TOKENIZER TO REMOVE SPECIAL CHARACTES řěšýň...
    # Combine 'name' and 'description' attributes for page_content
    PAGE_CONTENT = f"{row.get('name', '')} {row.get('description', '')}".strip()
    
    row["record"] = "material"
    
    # Store all attributes in metadata
    doc = Document(page_content=PAGE_CONTENT, metadata=row.to_dict())
    try:
        start = time.time()
        vector_store.add_documents([doc], ids=[row["code"]])
        end = time.time()

        duration = end - start
        global avergage_duration
        avergage_duration = running_average(duration, avergage_duration, count)

        print("left time: {:.2f} seconds".format(avergage_duration * (max_count - count)))

        # print(f"Indexed code {row['code']} in {duration:.2f} seconds")



        count += 1
        # while count <= max_count:
            # f.value += 1 # signal to increment the progress bar
            # time.sleep(.1)
    except BaseException as e:
        
        print(e)
        return




print("indexing {} records".format(max_count))
relevant_records.apply(index_row, axis=1)
vector_store.dump(os.path.join(vectorstore_path, "vectorstore"))







indexing 16866 records
left time: 20460.50 seconds
left time: 12174.02 seconds
left time: 9321.99 seconds
left time: 9253.71 seconds
left time: 9371.17 seconds
left time: 8465.61 seconds
left time: 7742.27 seconds
left time: 7235.74 seconds
left time: 7254.67 seconds
left time: 7774.79 seconds
left time: 7521.72 seconds
left time: 7339.76 seconds
left time: 7437.47 seconds
left time: 7248.94 seconds
left time: 7039.60 seconds
left time: 6880.43 seconds
left time: 6684.52 seconds
left time: 6788.23 seconds
left time: 6677.58 seconds
left time: 6556.89 seconds
left time: 6449.18 seconds
left time: 6348.12 seconds
left time: 6310.22 seconds
left time: 6243.14 seconds
left time: 6232.78 seconds
left time: 6190.06 seconds
left time: 6100.75 seconds
left time: 6046.17 seconds
left time: 5966.15 seconds
left time: 6024.47 seconds
left time: 5943.69 seconds
left time: 6037.16 seconds
left time: 5965.65 seconds
left time: 6043.80 seconds
left time: 5979.22 seconds
left time: 5910.14 seconds
lef

In [17]:
from agent import utils


vector_store.similarity_search(" inkontinenční vložky (Tena Lady) ", k=5, filter=utils.vector_material_filter_funct)

[Document(id='5014472', metadata={'code': 5014472, 'name': 'TENA LADY SLIM EXTRA', 'description': 'VLOŽKY ABSORPČNÍ,522ML,20KS', 'type': 2.0, 'limit': 'GER, GYN, CHI, NEF, NEU, PED, PRL, SDP, SLO, URN', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='TENA LADY SLIM EXTRA VLOŽKY ABSORPČNÍ,522ML,20KS'),
 Document(id='5014471', metadata={'code': 5014471, 'name': 'TENA LADY SLIM EXTRA PLUS', 'description': 'VLOŽKY ABSORPČNÍ,580ML,16KS', 'type': 2.0, 'limit': 'GER, GYN, CHI, NEF, NEU, PED, PRL, SDP, SLO, URN', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='TENA LADY SLIM EXTRA PLUS VLOŽKY ABSORPČNÍ,580ML,16KS'),
 Document(id='5007606', metadata={'code': 5007606, 'name': 'TENA LADY EXTRA', 'description': 'VLOŽKY ABSORPČNÍ,522ML,20KS', 'type': 2.0, 'limit': 'GER, GYN, CHI, NEF, NEU, PED, PRL, SDP, SLO, URN', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='TENA LADY

In [None]:
for _, row in relevant_records.iterrows():
    
    # print(item)
    # for every 100th record, print a message
    if int(_) % 100 == 0:
        print(f"processing record {_} / {max_count}") 
    # implement logic to feed the vectorstore
    # feed only ZP for now 
    
    # for every 1000th record, print a message
    if int(_) % 100 == 0:
        vector_store.dump(vectorstore_path)

    if str(row["code"]) in ids:
        f.value += 1 # signal to increment the progress bar
        
        time.sleep(.1)
        count += 1
        print(f"skipping code {row["code"]}")
        continue
        
    # TODO: ADD TOKENIZER TO REMOVE SPECIAL CHARACTES řěšýň...
    # Combine 'name' and 'description' attributes for page_content
    PAGE_CONTENT = f"{row.get('name', '')} {row.get('description', '')}".strip()
    
    row["record"] = "material"
    
    # Store all attributes in metadata
    doc = Document(page_content=PAGE_CONTENT, metadata=row.to_dict())
    try:
        vector_store.add_documents([doc], ids=[row["code"]])
        while count <= max_count:
            f.value += 1 # signal to increment the progress bar
            time.sleep(.1)
            count += 1
    except BaseException as e:
        
        print(e)
        continue
    

NameError: name 'relevant_records' is not defined

In [None]:
vector_store.store

In [113]:

vykony_df = pd.read_json("data/ciselniky/vykon.jsonl", orient="recods", lines=True, encoding="utf-8")

    
for _, row in vykony_df.iterrows():
            
    
    # Combine 'NAZ' and 'TXT' attributes for page_content
    PAGE_CONTENT = f"{row.get('NAZ', '')} {row.get('TEXT', '')}".strip()
    
    row["record"] = "vykon"
    # Store all attributes in metadata
    doc = Document(page_content=PAGE_CONTENT, metadata=row.to_dict())
    
    vector_store.add_documents([doc], ids=[row["KOD"]])

In [8]:
def filter_funct(doc): 
    return doc.metadata.get("record") == "material"

In [156]:
from langchain_openai import OpenAI


detection_model = OpenAI()

In [None]:
detection_model.invoke(SystemMessage(content="ve ambulantni zpravy najdi materialy, leky, vykony a zdravodnicky pomucky. use explicit text"),
        HumanMessage(content=state["text"]))

In [15]:

vector_store.similarity_search_with_score("MoliCare Premium)", k=20, 
    filter=filter_funct
)

[(Document(id='5003680', metadata={'code': 5003680, 'name': 'ABRI FLEX PREMIUM M2', 'description': 'KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-110CM, 1900ML, 14KS', 'type': 2.0, 'limit': 'GER, GYN, CHI, NEF, NEU, PED, PRL, SDP, SLO, URN', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='ABRI FLEX PREMIUM M2 KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-110CM, 1900ML, 14KS'),
  0.8146972069951728),
 (Document(id='5003691', metadata={'code': 5003691, 'name': 'ABRI FLEX PREMIUM M3', 'description': 'KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-110CM, 2200ML, 14KS', 'type': 2.0, 'limit': 'GER, GYN, CHI, NEF, NEU, PED, PRL, SDP, SLO, URN', 'file': 'Zdravotnické prostředky', 'file_code': 'ZP', 'record': 'material'}, page_content='ABRI FLEX PREMIUM M3 KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-110CM, 2200ML, 14KS'),
  0.8145818518038517),
 (Document(id='5003671', metadata={'code': 5003671, 'name': 'ABRI FLEX PREMIUM M1', 'description': 'KALHOTKY ABSORPČNÍ NAVLÉKACÍ, BOKY 80-11