In [1]:
# pip install qdrant-client groq sentence-transformers dspy-ai fastembed gradio --upgrade

## Dataset

In [2]:
import pandas as pd
df = pd.read_csv("healthcare_dataset.csv")

In [3]:
df[:5]

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [4]:
# Function to format each row into a single string
def format_row(row):
    return (
        f"Name: {row['Name']}, Age: {row['Age']}, Gender: {row['Gender']}, "
        f"Blood Type: {row['Blood Type']}, Medical Condition: {row['Medical Condition']}, "
        f"Date of Admission: {row['Date of Admission']}, Doctor: {row['Doctor']}, "
        f"Hospital: {row['Hospital']}, Insurance Provider: {row['Insurance Provider']}, "
        f"Billing Amount: {row['Billing Amount']}, Room Number: {row['Room Number']}, "
        f"Admission Type: {row['Admission Type']}, Discharge Date: {row['Discharge Date']}, "
        f"Medication: {row['Medication']}, Test Results: {row['Test Results']}"
        "\n\n".lower()
    )

# Apply the function to each row and create a new column with the formatted text
df['formatted_text'] = df.apply(format_row, axis=1)

# Convert the formatted text into a list (or any other format you need)
text_data = df['formatted_text'].tolist()

In [5]:
from random import shuffle
sampled_dataset = text_data[:128]
shuffle(sampled_dataset)

In [6]:
sampled_dataset[:5]

['name: erica myers, age: 19, gender: female, blood type: a+, medical condition: arthritis, date of admission: 2020-01-30, doctor: ian benitez, hospital: jackson and lane, dillon, insurance provider: medicare, billing amount: 20227.86372674477, room number: 241, admission type: elective, discharge date: 2020-02-12, medication: penicillin, test results: normal\n\n',
 'name: rebecca parsons, age: 37, gender: female, blood type: o+, medical condition: asthma, date of admission: 2019-12-02, doctor: steven adams, hospital: group white, insurance provider: aetna, billing amount: 8521.214252671094, room number: 476, admission type: elective, discharge date: 2019-12-13, medication: penicillin, test results: abnormal\n\n',
 'name: lindsey lambert, age: 82, gender: female, blood type: a+, medical condition: hypertension, date of admission: 2021-11-19, doctor: christopher guerra, hospital: and brown oneal, shah, insurance provider: medicare, billing amount: 23067.672165245425, room number: 307, a

## Encode Texts

In [7]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("BAAI/bge-large-en-v1.5", device='cuda')

In [8]:
vectors = model.encode(sampled_dataset)

In [9]:
vectors[0].shape

(1024,)

## Qdrant Cloud

In [10]:
import os
os.environ['QDRANT__SERVICE__API_KEY']="qdrant_api_key"
os.environ['QDRANT__SERVICE__JWT_RBAC']='true'

In [11]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# Initialize the client

client = QdrantClient(
    url="https://b3cc6dd2-4f57-4e59-8799-e66f63cccca3.us-east4-0.gcp.cloud.qdrant.io:6333",
    # url='http://localhost:6333',
    api_key=os.environ['QDRANT__SERVICE__API_KEY'],
)

In [12]:
client.recreate_collection(
    collection_name="phi_data",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)

client.upload_collection(
    collection_name="phi_data",
    ids=[i for i in range(len(sampled_dataset))],
    vectors=vectors,
    parallel=4,
    max_retries=3,
)

  client.recreate_collection(


In [None]:
def get_context(text):
    query_vector = model.encode(text)

    hits = client.search(
        collection_name="phi_data",
        query_vector=query_vector,
        limit=3  # Return 5 closest points
    )
    s=''
    for x in [sampled_dataset[i.id] for i in hits]:
        s = s + x
    return s

## Dspy pipeline

In [15]:
from dspy.retrieve.qdrant_rm import QdrantRM

In [16]:
qdrant_retriever_model = QdrantRM("phi_data", client, k=3)

In [17]:
import dspy
llama3 = dspy.GROQ(model='llama3-8b-8192', api_key ="<groq_api_key>" )

In [18]:
dspy.settings.configure(rm=qdrant_retriever_model, lm=llama3)

class GenerateAnswer(dspy.Signature):
    """Answer questions with logical factoid answers."""

    context = dspy.InputField(desc="will contain PHI medical data of patients matched with the query")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="an answer between 10 to 20 words")

In [19]:
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        context = get_context(question)
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

## Test Ineference 

In [20]:
uncompiled_rag = RAG()

In [21]:
def respond(query):
    response = uncompiled_rag(query)
    return response.answer

In [22]:
respond("steven james")

'Steven James is the doctor mentioned in the patient data.'

In [23]:
llama3.inspect_history(n=1)




Answer questions with logical factoid answers.

---

Follow the following format.

Context: will contain PHI medical data of patients matched with the query

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: an answer between 10 to 20 words

---

Context:
name: mark ford, age: 18, gender: male, blood type: b+, medical condition: asthma, date of admission: 2022-10-18, doctor: steven james, hospital: luna inc, insurance provider: unitedhealthcare, billing amount: 28837.6770525072, room number: 227, admission type: elective, discharge date: 2022-11-11, medication: aspirin, test results: abnormal

name: catherine gardner, age: 79, gender: female, blood type: a-, medical condition: hypertension, date of admission: 2019-08-19, doctor: david ruiz, hospital: james ltd, insurance provider: medicare, billing amount: 25503.673806852043, room number: 144, admission type: elective, discharge date: 2019-08-26, medication: lipitor, test r

"\n\n\nAnswer questions with logical factoid answers.\n\n---\n\nFollow the following format.\n\nContext: will contain PHI medical data of patients matched with the query\n\nQuestion: ${question}\n\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\n\nAnswer: an answer between 10 to 20 words\n\n---\n\nContext:\nname: mark ford, age: 18, gender: male, blood type: b+, medical condition: asthma, date of admission: 2022-10-18, doctor: steven james, hospital: luna inc, insurance provider: unitedhealthcare, billing amount: 28837.6770525072, room number: 227, admission type: elective, discharge date: 2022-11-11, medication: aspirin, test results: abnormal\n\nname: catherine gardner, age: 79, gender: female, blood type: a-, medical condition: hypertension, date of admission: 2019-08-19, doctor: david ruiz, hospital: james ltd, insurance provider: medicare, billing amount: 25503.673806852043, room number: 144, admission type: elective, discharge date: 2019-08-26, medi

## Gradio UI

In [24]:
import gradio as gr


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])
    
    def respond(query, chat_history):
        response = uncompiled_rag(query)
        chat_history.append((query, response.answer))
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot]) 

In [25]:
demo.launch()
# demo.launch(share=True) if using colab

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [28]:
sampled_dataset[67]

'name: kayla padilla, age: 63, gender: male, blood type: b-, medical condition: asthma, date of admission: 2020-09-24, doctor: jermaine peters, hospital: and morales jennings cook,, insurance provider: aetna, billing amount: 34774.351127057584, room number: 152, admission type: emergency, discharge date: 2020-10-08, medication: penicillin, test results: inconclusive\n\n'