In [1]:
import getpass
import pprint
import os

from dotenv import load_dotenv

load_dotenv(override=True)

if not os.environ.get("OPENAI_API_KEY"): 
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

from openai import OpenAI

In [2]:
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, Docx2txtLoader, DirectoryLoader, UnstructuredWordDocumentLoader, UnstructuredExcelLoader, CSVLoader
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain_iris import IRISVector

In [3]:
import chromadb

# Initialize Chroma client
chromadb_client = chromadb.PersistentClient(path="./chroma_db")  # Saves data persistently

In [4]:
client = OpenAI()
embeddings = OpenAIEmbeddings()
embedding_model = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()


In [5]:
username = 'demo'
password = 'demo' 
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = '1972' 
namespace = 'USER'
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
# Under the hood, this becomes a SQL table. CANNOT have '.' in the name

In [6]:
loader = DirectoryLoader('data', glob='*.docx', loader_cls=Docx2txtLoader)
docs = loader.load()
len(docs)

11

In [7]:
text_splitter = SpacyTextSplitter(chunk_size=400, chunk_overlap=20)
docs = text_splitter.split_documents(docs)



In [None]:
# COLLECTION_NAME = "cancer_db"
# # This creates a persistent vector store (a SQL table). You should run this ONCE only
# db = IRISVector.from_documents(
#     embedding=embeddings,
#     documents=docs,
#     collection_name=COLLECTION_NAME,
#     connection_string=CONNECTION_STRING,
# )

In [9]:
COLLECTION_NAME = "cancer_db"
# Create (or get) a collection
collection = chromadb_client.get_or_create_collection(name=COLLECTION_NAME)
embeddings = embedding_model.embed_documents(docs)
# Add documents to ChromaDB
collection.add(
    ids=[str(i) for i in range(len(docs))],  # Unique IDs for each document
    documents=docs,  # Raw text data
    embeddings=embeddings  # Precomputed embeddings
)

print("Documents successfully added to ChromaDB!")


TypeError: expected string or buffer

In [None]:
COLLECTION_NAME = "cancer_db"
# Subsequent calls to reconnect to the database and make searches should use this.  
db = IRISVector(
    embedding_function=embeddings,
    dimension=1536,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [None]:
print(f"Number of docs in vector store: {len(db.get()['ids'])}")

In [None]:
f = open("data/s_test.txt", "r", encoding='ISO-8859-1')
# query = "new technology"
scenario = f.read()

f = open("data/knowledge.docx", "r", encoding='ISO-8859-1')
# query = "new technology"
knowledge = f.read()

In [None]:
docs_with_score = db.similarity_search_with_score(scenario, 2)
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)

In [None]:
embedding_vector = embeddings.embed_query(scenario)
res = db.similarity_search_by_vector(embedding_vector)
res

In [None]:
full_res = ''
for each_res in res:
    full_res = full_res + '\n\n' +each_res.page_content

In [None]:
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system", 
            "content": 
                f"""
                A medical doctor with domain knowledge in breast cancer after having trained with a wealth of knowledge in these topics: {knowledge}.
                Augment your data with results from {full_res}
                """
        },
        {
            "role": "user",
            "content": 
                f"""
                Given patient's consultation with the doctor in this {scenario}, 
                recommend 
                1. the best course of treatment
                2. provide justifications for the course
                3. provide chain of thought to reach those justifications
                4. highlight risks to patient
                """
        }
    ]
)


In [None]:
pprint.pp(completion.choices[0].message.content)

In [None]:
loader = CSVLoader('data/treatment_selection.csv')#, csv_args={'fieldnames':['']})
docs = loader.load()
len(docs)

In [None]:
COLLECTION_NAME = "pictures_db"
# This creates a persistent vector store (a SQL table). You should run this ONCE only
db1 = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [None]:
docs_with_score = db1.similarity_search_with_score(completion.choices[0].message.content, 1)
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)

In [None]:
image_chosen = doc.page_content.split('\n')[-1].split(': ')[-1] + ".jpeg"

In [None]:
image_chosen

# Misc

In [None]:
# completion = client.chat.completions.create(
#     model="gpt-4o-mini",
#     messages=[
#         {
#             "role": "system", 
#             "content": """
#             Factors For Lumpectomy:
# Breast Conservation – Preserves the natural breast, which may be important for some women, though Mdm Ang has indicated she may be able to accept a flat chest.
# Less Invasive Surgery – Typically, a shorter recovery time compared to a mastectomy.
# Factors Against Lumpectomy:
# Need for Frequent Follow-Ups – Requires post-surgical radiotherapy, which means multiple hospital visits, a significant concern for Mdm Ang since she finds it difficult to travel to the hospital frequently.
# Risk of Second Surgery – If the lumpectomy does not achieve clear margins, a second surgery may be required, which Mdm Ang wants to avoid.
# Overall Treatment Burden – The combination of surgery and radiotherapy means a longer treatment course, which may not be ideal given her preference for a one-time treatment.
# Since Mdm Ang prioritizes minimizing hospital visits and avoiding the possibility of a second surgery, mastectomy without reconstruction aligns better with her needs.
# """},
#         {
#             "role": "user",
#             "content": f"""
# 1) cannot accept mastectomy
# 2) wants reconstruction but cannot accept implant - can accept tram or LD flap 
# 3) only wants lumpectomy ok for second surgery
# 4) tumor too big but really only want breast conserving and considering oncoplastic surgery - accepting of a slightly longer scar to maintain symmetry of best 
# 5) big tumor but cannot accept mastectomy, discuss neoadjuvant chemotherapy- risks and benefits and agreeable for trial of nact before mastectomy
# 6) cost concerns. 
# Deciding between Breast conserving but with radiotherapy versus mastectomy without recon 
# Recon too expensive
# 7) concern about drain management and no caregiver - prefers fast recovery - lumpectomy 
# 8 ) doesn’t want radiotherapy strongly  - mastectomy the.
#             ."""
#         }
#     ]
# )

In [None]:
# from langchain.text_splitter import CharacterTextSplitter

# text = "Your long document text here..."

# splitter = CharacterTextSplitter(
#     separator="\n\n",
#     chunk_size=1000,
#     chunk_overlap=200
# )

# chunks = splitter.split_text(text) #you can also split documents using split_documents

# from langchain.text_splitter import RecursiveCharacterTextSplitter

# text = "Your long document text here..."

# splitter = RecursiveCharacterTextSplitter(
#     separators=["\n\n", "\n", " ", ""],
#     chunk_size=1000,
#     chunk_overlap=200,
#     length_function=len
# )

# chunks = splitter.split_text(text)