In [1]:
import getpass
import pprint
import os

from dotenv import load_dotenv

# Pull settings from a local .env file; override=True is passed so that values
# from the file win over ones already present in the environment.
load_dotenv(override=True)

# Ask for the key interactively when it is missing or empty, so that it never
# has to be typed into the notebook itself.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

from openai import OpenAI

In [2]:
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, Docx2txtLoader, DirectoryLoader, UnstructuredWordDocumentLoader, UnstructuredExcelLoader, CSVLoader
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain_iris import IRISVector

In [3]:
import os
import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docx import Document  # For reading .docx files

def load_word_document(file_path):
    """Return the text of a Word (.docx) file as one newline-joined string.

    Paragraphs that are empty or contain only whitespace are skipped. Only
    ``doc.paragraphs`` is read, so content the document does not expose through
    that attribute is not part of the result.

    Args:
        file_path: Location of the .docx file; passed unchanged to
            ``docx.Document``.

    Returns:
        str: The remaining paragraph texts joined with a newline, or an empty
        string when no paragraph has visible text.
    """
    # Imported locally under an alias: this notebook also imports a class named
    # ``Document`` from langchain, and whichever import cell ran last owns the
    # bare name. The alias keeps this helper working regardless of cell order.
    from docx import Document as DocxDocument

    doc = DocxDocument(file_path)
    return "\n".join(para.text for para in doc.paragraphs if para.text.strip())

In [4]:
import pandas as pd

In [5]:
# OpenAI API client; used further down for the chat completion request.
client = OpenAI()
# Embedding wrapper; used further down to embed both the document chunks and the queries.
embeddings = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()


In [None]:
# Connection settings for the IRIS instance. Every value can be overridden with
# an environment variable so that real credentials never have to be written
# into the notebook; the fallbacks are the values that used to be hard-coded.
username = os.getenv('IRIS_USERNAME', 'demo')
password = os.getenv('IRIS_PASSWORD', 'demo')
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = os.getenv('IRIS_PORT', '1972')
namespace = os.getenv('IRIS_NAMESPACE', 'USER')
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
# Under the hood, a collection becomes a SQL table, so a collection name CANNOT have '.' in it

In [None]:
# Load the files matching '*.docx' in the 'data' directory, using Docx2txtLoader for each one.
loader = DirectoryLoader('data', glob='*.docx', loader_cls=Docx2txtLoader)
docs = loader.load()
# Show how many documents were loaded.
len(docs)

In [None]:
# Split the loaded documents into chunks (chunk_size=400, chunk_overlap=20).
text_splitter = SpacyTextSplitter(chunk_size=400, chunk_overlap=20)
# NOTE(review): `docs` is overwritten with its own chunks, so running this cell
# a second time splits the chunks again rather than the originally loaded documents.
docs = text_splitter.split_documents(docs)

In [None]:
COLLECTION_NAME = "cancer_db"
# This creates a persistent vector store (a SQL table). You should run this ONCE only
# The chunks in `docs` are embedded with `embeddings` and stored under COLLECTION_NAME.
db = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [None]:
COLLECTION_NAME = "cancer_db"
# Subsequent calls to reconnect to the database and make searches should use this.
# No documents are passed in here; only the collection name and connection are given.
db = IRISVector(
    embedding_function=embeddings,
    # NOTE(review): 1536 presumably matches the vector length produced by `embeddings` - confirm.
    dimension=1536,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [6]:
COLLECTION_NAME = "cancer_db"
# Directory containing multiple Word documents
folder_path = "data"  # Change this to your actual folder path

# List the .docx files in the folder. Files with 'knowledge' in their name are
# left out (knowledge.docx is read on its own further down), and so are the
# '~$...' lock files Word leaves next to an open document, which end in .docx
# but are not readable documents.
word_files = [
    f for f in os.listdir(folder_path)
    if f.endswith(".docx") and 'knowledge' not in f and not f.startswith("~$")
]

# Initialize ChromaDB client (data is persisted on disk under ./chroma_db)
chromadb_client = chromadb.PersistentClient(path="./chroma_db")
db = chromadb_client.get_or_create_collection(name=COLLECTION_NAME)

# Text splitter for chunking documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=250)

# Process each Word document
doc_id = 0
indexed_files = 0
for file_name in word_files:
    file_path = os.path.join(folder_path, file_name)
    print(f"Processing: {file_name}")

    # Load and split text
    document_text = load_word_document(file_path)
    doc_chunks = text_splitter.split_text(document_text)

    if doc_chunks:
        # Generate embeddings
        actual_embeddings = embeddings.embed_documents(doc_chunks)

        # upsert (not add): the store is persistent and the ids are the same on
        # every run, so re-running this cell must overwrite the existing
        # entries instead of colliding with them.
        db.upsert(
            ids=[f"{doc_id}_{i}" for i in range(len(doc_chunks))],  # Unique IDs
            documents=doc_chunks,  # Text chunks
            embeddings=actual_embeddings  # Corresponding embeddings
        )
        indexed_files += 1
    else:
        # A document without any text yields no chunks; there is nothing to
        # embed or store, so skip it rather than send empty lists to the store.
        print(f"Skipping {file_name}: no text found")

    # Advance even for skipped files so each file keeps the id of its position.
    doc_id += 1

print(f"Successfully added {indexed_files} documents to ChromaDB!")


Processing: s1.docx
Processing: s6.docx
Processing: s7.docx
Processing: s4.docx
Processing: s8.docx
Processing: s10.docx
Processing: s9.docx
Processing: s5.docx
Processing: s2.docx
Processing: s3.docx
Successfully added 10 documents to ChromaDB!


In [7]:
# count() asks the collection for its size directly, instead of fetching every
# stored entry with get() only to measure the length of the id list.
print(f"Number of docs in vector store: {db.count()}")

Number of docs in vector store: 19


In [8]:
# Patient scenario: used below as the retrieval query and inside the chat prompt.
# The with-block closes the file handle as soon as the text has been read.
with open("data/s_test.txt", "r", encoding='ISO-8859-1') as f:
    scenario = f.read()

# knowledge.docx is a Word file, i.e. a zip container. Reading it as ISO-8859-1
# text never fails but returns the raw archive bytes as characters, which would
# then be pasted into the system prompt. Extract the paragraph text with the
# same helper that is used for the other .docx files instead.
knowledge = load_word_document("data/knowledge.docx")

In [None]:
# Query the vector store with the scenario text, asking for 2 scored matches.
docs_with_score = db.similarity_search_with_score(scenario, 2)

# Print every match framed by an 80-character rule.
divider = "-" * 80
for doc, score in docs_with_score:
    print(divider, f"Score:  {score}", doc.page_content, divider, sep="\n")

In [None]:
# Embed the scenario text once, then search the vector store with that vector.
embedding_vector = embeddings.embed_query(scenario)
res = db.similarity_search_by_vector(embedding_vector)
# Display the matches.
res

In [9]:
# Embed the scenario text so it can be compared with the stored chunks.
query_embedding = embeddings.embed_query(scenario)

# Ask the collection for the 5 nearest stored chunks.
results = db.query(query_embeddings=[query_embedding], n_results=5)

# Print each returned chunk together with the distance reported for it.
top_documents = results["documents"][0]
top_distances = results["distances"][0]
for i, doc in enumerate(top_documents):
    print(f"Result {i+1}: {doc}\nScore: {top_distances[i]}\n")


Result 1: Premise: Pt has a big tumor but cannot accept mastectomy, discuss neoadjuvant chemotherapy- risks and benefits and agreeable for trial of nact before mastectomy
Patient Scenario
Dr. Tan: Good morning, Ms. Chua. Thank you for coming in today. I understand that you’ve been thinking a lot about your treatment options. How are you feeling?
Ms. Chua: Good morning, Dr. Tan. I’m very worried. I know my tumor is big, but I just cannot accept the idea of losing my breast. I don’t want a mastectomy if there’s any way to avoid it.
Dr. Tan: I understand how difficult this decision can be. Since your tumor is large, a mastectomy is typically the recommended approach. However, in some cases, we can try neoadjuvant chemotherapy (NACT) first. This means giving chemotherapy before surgery to shrink the tumor, potentially allowing for breast-conserving surgery instead of a mastectomy.
Ms. Chua: Really? So there’s a chance I might not need a mastectomy if the tumor shrinks enough?
Dr. Tan: Yes,

In [None]:
# Concatenate the page content of every entry in `res`, each prefixed with two newlines.
full_res = ''.join('\n\n' + each_res.page_content for each_res in res)

In [10]:
# Concatenate the retrieved chunk texts, each prefixed with two newlines.
full_res = ''.join('\n\n' + each_res for each_res in results['documents'][0])

In [11]:
# Send one chat completion request: the system message carries the background
# knowledge and the retrieved text, the user message carries the scenario and
# the four things the answer should contain.
# Both prompts are indented triple-quoted f-strings, so the leading spaces and
# newlines of each line are part of the text that is sent.
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        # System message: interpolates `knowledge` and `full_res`.
        {
            "role": "system", 
            "content": 
                f"""
                A medical doctor with domain knowledge in breast cancer after having trained with a wealth of knowledge in these topics: {knowledge}.
                Augment your data with results from {full_res}
                """
        },
        # User message: interpolates `scenario`.
        {
            "role": "user",
            "content": 
                f"""
                Given patient's consultation with the doctor in this {scenario}, 
                recommend 
                1. the best course of treatment
                2. provide justifications for the course
                3. provide chain of thought to reach those justifications
                4. highlight risks to patient
                """
        }
    ]
)


In [12]:
# Pretty-print the message text of the first returned choice.
pprint.pp(completion.choices[0].message.content)

('1. **Recommended Course of Treatment:**\n'
 '   - The best course of treatment for Ms. Chua is to pursue neoadjuvant '
 'chemotherapy (NACT) followed by a reassessment for breast-conserving surgery '
 '(lumpectomy) if the tumor shrinks adequately.\n'
 '\n'
 '2. **Justifications for the Course:**\n'
 '   - **Opportunity for Tumor Reduction:** The primary justification for NACT '
 'is to shrink the tumor size before surgery. Given that Ms. Chua is concerned '
 'about losing her breast, this approach provides her with a chance to '
 'maintain breast integrity if the tumor responds well to chemotherapy.\n'
 '   - **Monitoring Response:** By using NACT, the oncologist can monitor how '
 'well the tumor responds to treatment. A favorable response can provide '
 "valuable information on the tumor's biology, which can guide future "
 'treatment decisions.\n'
 '   - **Psychological Consideration:** Understanding her concerns about '
 'mastectomy, this approach allows Ms. Chua to feel empowere

In [13]:
# Load the treatment selection CSV through CSVLoader.
loader = CSVLoader('data/treatment_selection.csv')#, csv_args={'fieldnames':['']})
# NOTE(review): this rebinds `docs`, which earlier cells used for the Word documents.
docs = loader.load()
# Show how many documents were loaded.
len(docs)

12

In [None]:
COLLECTION_NAME = "pictures_db"
# This creates a persistent vector store (a SQL table). You should run this ONCE only
# The documents in `docs` are embedded with `embeddings` and stored under COLLECTION_NAME.
db1 = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [14]:
treatment_selection = pd.read_csv('data/treatment_selection.csv')

# Build one space-separated text per row from these columns, left to right.
content_columns = ['surgery_type', 'benefit', 'consideration', 'tag']
combined = treatment_selection[content_columns[0]]
for column in content_columns[1:]:
    combined = combined + ' ' + treatment_selection[column]
treatment_selection['content'] = combined

# Rows whose combined text is NaN are dropped before converting to a plain list.
texts = treatment_selection['content'].dropna().tolist()

In [15]:
COLLECTION_NAME = "pictures_db"

db2 = chromadb_client.get_or_create_collection(name=COLLECTION_NAME)

# Text splitter for chunking documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Process text into chunks and embeddings
doc_id = 0
indexed_rows = 0
for text in texts:
    chunks = text_splitter.split_text(text)  # Split text into smaller chunks

    if chunks:
        actual_embeddings = embeddings.embed_documents(chunks)  # Generate embeddings

        # upsert (not add): the store is persistent and the ids are the same on
        # every run, so re-running this cell must overwrite the existing
        # entries instead of colliding with them.
        db2.upsert(
            ids=[f"{doc_id}_{i}" for i in range(len(chunks))],  # Unique IDs
            documents=chunks,  # Text chunks
            embeddings=actual_embeddings  # Corresponding embeddings
        )
        indexed_rows += 1
    # A text that yields no chunks is skipped: there is nothing to embed or store.

    # Advance even for skipped rows so each row keeps the id of its position.
    doc_id += 1

print(f"Successfully added {indexed_rows} rows (split into chunks) to ChromaDB!")

Successfully added 12 rows (split into chunks) to ChromaDB!


In [None]:
# Query the pictures store with the model's answer, asking for 1 scored match.
docs_with_score = db1.similarity_search_with_score(completion.choices[0].message.content, 1)

# Print every match framed by an 80-character rule.
divider = "-" * 80
for doc, score in docs_with_score:
    print(divider, f"Score:  {score}", doc.page_content, divider, sep="\n")

In [16]:
# Embed the model's answer so it can be compared with the stored treatment texts.
query_embedding = embeddings.embed_query(completion.choices[0].message.content)

# Ask the collection for the single nearest stored text.
results = db2.query(query_embeddings=[query_embedding], n_results=1)

# Print each returned text together with the distance reported for it.
top_documents = results["documents"][0]
top_distances = results["distances"][0]
for i, doc in enumerate(top_documents):
    print(f"Result {i+1}: {doc}\nScore: {top_distances[i]}\n")


Result 1: lumpectomy small cut; shorter recovery; conserve shape; suitable for early-stage additional radiotherapy cost; possibility of 2nd surgery; longer treatment process img1
Score: 0.36170491630158985



In [None]:
# Take the document from the first (document, score) pair of the scored search
# explicitly. The loop variable `doc` cannot be relied on here: the Chroma
# query cell above rebinds it to a plain string, which has no `page_content`.
best_match = docs_with_score[0][0]
# The image name is whatever follows the last ': ' on the last line of the content.
image_chosen = best_match.page_content.split('\n')[-1].split(': ')[-1] + ".jpeg"

In [None]:
# Display the chosen image file name.
image_chosen

'lumpectomy small cut; shorter recovery; conserve shape; suitable for early-stage additional radiotherapy cost; possibility of 2nd surgery; longer treatment process img1'

In [17]:
# Read the best match straight from the query results instead of the leftover
# loop variable `doc`, which silently keeps an older value when the query
# returns nothing.
matched_texts = results["documents"][0]
if not matched_texts:
    raise ValueError("No treatment option was retrieved, so no image can be chosen.")
# The image tag (e.g. 'img1') is the last space-separated token of the stored text.
# NOTE(review): this assumes the row was stored as a single chunk; if a row were
# split, only its final chunk would end with the tag - confirm.
image_chosen = matched_texts[0].split(' ')[-1] + ".jpeg"

In [18]:
# Display the chosen image file name.
image_chosen

'img1.jpeg'

# Misc

In [None]:
# completion = client.chat.completions.create(
#     model="gpt-4o-mini",
#     messages=[
#         {
#             "role": "system", 
#             "content": """
#             Factors For Lumpectomy:
# Breast Conservation – Preserves the natural breast, which may be important for some women, though Mdm Ang has indicated she may be able to accept a flat chest.
# Less Invasive Surgery – Typically, a shorter recovery time compared to a mastectomy.
# Factors Against Lumpectomy:
# Need for Frequent Follow-Ups – Requires post-surgical radiotherapy, which means multiple hospital visits, a significant concern for Mdm Ang since she finds it difficult to travel to the hospital frequently.
# Risk of Second Surgery – If the lumpectomy does not achieve clear margins, a second surgery may be required, which Mdm Ang wants to avoid.
# Overall Treatment Burden – The combination of surgery and radiotherapy means a longer treatment course, which may not be ideal given her preference for a one-time treatment.
# Since Mdm Ang prioritizes minimizing hospital visits and avoiding the possibility of a second surgery, mastectomy without reconstruction aligns better with her needs.
# """},
#         {
#             "role": "user",
#             "content": f"""
# 1) cannot accept mastectomy
# 2) wants reconstruction but cannot accept implant - can accept tram or LD flap 
# 3) only wants lumpectomy ok for second surgery
# 4) tumor too big but really only wants breast conserving and is considering oncoplastic surgery - accepting of a slightly longer scar to maintain symmetry of breast 
# 5) big tumor but cannot accept mastectomy, discuss neoadjuvant chemotherapy- risks and benefits and agreeable for trial of nact before mastectomy
# 6) cost concerns. 
# Deciding between Breast conserving but with radiotherapy versus mastectomy without recon 
# Recon too expensive
# 7) concern about drain management and no caregiver - prefers fast recovery - lumpectomy 
# 8 ) doesn’t want radiotherapy strongly  - mastectomy the.
#             ."""
#         }
#     ]
# )

In [None]:
# from langchain.text_splitter import CharacterTextSplitter

# text = "Your long document text here..."

# splitter = CharacterTextSplitter(
#     separator="\n\n",
#     chunk_size=1000,
#     chunk_overlap=200
# )

# chunks = splitter.split_text(text) #you can also split documents using split_documents

# from langchain.text_splitter import RecursiveCharacterTextSplitter

# text = "Your long document text here..."

# splitter = RecursiveCharacterTextSplitter(
#     separators=["\n\n", "\n", " ", ""],
#     chunk_size=1000,
#     chunk_overlap=200,
#     length_function=len
# )

# chunks = splitter.split_text(text)