In [1]:
from dotenv import load_dotenv
from datasets import load_dataset
import chromadb
import openai
import os 

## 1 Load up Vector DB 

In [6]:
def get_hugging_face_ds(name, split_name):
  """Load a single split of a Hugging Face dataset.

  Args:
    name: Dataset identifier passed straight to ``load_dataset``.
    split_name: Name of the split to load, forwarded as ``split=``.

  Returns:
    Whatever ``load_dataset`` returns for that dataset and split.
  """
  return load_dataset(name, split=split_name)

# Process-wide cache for the Chroma client; filled on the first call below.
CHROMA_CLIENT = None
def get_chroma_client():
  """Return the shared Chroma client, creating it on first use.

  The client is stored in the module-level ``CHROMA_CLIENT`` so that repeated
  calls hand back the same object instead of constructing a new one.
  """
  global CHROMA_CLIENT
  if CHROMA_CLIENT is not None:
    return CHROMA_CLIENT
  CHROMA_CLIENT = chromadb.Client()
  return CHROMA_CLIENT

def load_dataset_into_chroma(collection_name, dataset, column_name, sample=None):
  """Create a Chroma collection and fill it with one text column of a dataset.

  Args:
    collection_name: Name of the collection to create.
    dataset: Object indexable by column name (``dataset[column_name]``) whose
      result is a sliceable sequence of strings.
    column_name: Column whose values become the collection's documents.
    sample: Optional cap on the number of rows to load, taken from the start
      of the column. ``None`` (the default) loads every row.

  Returns:
    The newly created collection.

  Raises:
    ValueError: If ``sample`` is negative.
  """
  if sample is not None and sample < 0:
    raise ValueError(f"sample must be non-negative, got {sample}")

  chroma_client = get_chroma_client()
  collection = chroma_client.create_collection(collection_name)

  column = dataset[column_name]
  documents = list(column if sample is None else column[:sample])

  # Ids are derived from the documents actually selected, so the two lists
  # always have the same length -- even when ``sample`` exceeds the number of
  # rows or is 0 (previously ids and documents could disagree in length).
  if documents:
    collection.add(
      ids=[str(i) for i in range(len(documents))],  # IDs are just strings
      documents=documents
    )

  return collection

In [7]:
# Load the 'test' split of the FedML/PubMedQA_instruction dataset from Hugging Face.
dataset = get_hugging_face_ds('FedML/PubMedQA_instruction',split_name='test')

Found cached dataset parquet (/Users/sharansankar/.cache/huggingface/datasets/FedML___parquet/FedML--PubMedQA_instruction-acdd9fbab1be4b17/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


In [8]:
dataset

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 1000
})

In [9]:
dataset['instruction'][:5]

['Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?',
 'Landolt C and snellen e acuity: differences in strabismus amblyopia?',
 'Syncope during bathing in infants, a pediatric form of water-induced urticaria?',
 'Are the long-term results of the transanal pull-through equal to those of the transabdominal pull-through?',
 'Can tailored interventions increase mammography use among HMO women?']

In [10]:
# Create (or reuse) the shared Chroma client cached in CHROMA_CLIENT.
chroma_client = get_chroma_client()

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.


In [11]:
# Index the first 100 values of the 'context' column into a new collection.
# NOTE(review): this calls create_collection, so re-running the cell against the
# same client presumably fails once the collection exists -- confirm.
vector_db = load_dataset_into_chroma(
    collection_name='pubmed_qa_context',
    dataset=dataset,
    column_name='context',
    sample=100
)

## 2 Load up LLM Client

In [10]:
# Report only whether the key is configured; displaying the value itself would
# store the secret in the notebook's saved cell output.
os.getenv("OPENAI_API_KEY") is not None

In [16]:
# Load environment variables from the .env file one directory above the
# working directory. init_open_ai() performs the same load, so this cell is
# only a manual check that the file is found.
load_dotenv(dotenv_path="../.env")

True

In [2]:
def init_open_ai():
  """Configure the ``openai`` module from environment variables.

  Loads ``../.env`` and then copies ``OPENAI_API_KEY`` and ``OPENAI_ORG_KEY``
  from the environment onto ``openai.api_key`` and ``openai.organization``.
  A variable that is not set leaves the corresponding attribute as ``None``.
  """
  load_dotenv(dotenv_path="../.env")
  openai.api_key = os.getenv("OPENAI_API_KEY")
  openai.organization = os.getenv("OPENAI_ORG_KEY")


In [3]:
def get_llm_client():
  """Return an ``OpenAILLMClient``, initialising OpenAI credentials if needed.

  ``init_open_ai`` is invoked whenever either ``openai.organization`` or
  ``openai.api_key`` is still ``None``.
  """
  credentials_ready = openai.organization is not None and openai.api_key is not None
  if not credentials_ready:
    init_open_ai()
  return OpenAILLMClient()

class OpenAILLMClient:
  """Minimal wrapper around ``openai.ChatCompletion`` for a single chat model."""

  def __init__(self, model_name="gpt-3.5-turbo"):
    """Remember which chat model requests should be sent to.

    Args:
      model_name: Model identifier passed as ``model=`` on every request.
    """
    self.model_name = model_name

  def get_response_from_prompt(self, prompt, prev_messages=None, role='user'):
    """Send ``prompt`` (optionally preceded by earlier messages) to the model.

    Args:
      prompt: Text of the new message.
      prev_messages: Optional list of earlier ``{'role', 'content'}`` message
        dicts to send ahead of the new one. The list is copied, never
        modified.
      role: Role attached to the new message.

    Returns:
      The object returned by ``openai.ChatCompletion.create``.
    """
    # Work on a copy so the caller's history is left untouched; previously it
    # was appended to only when it happened to be non-empty.
    messages = list(prev_messages) if prev_messages else []
    messages.append({
        'role': role,
        'content': prompt
    })
    response = openai.ChatCompletion.create(
      # Use the configured model rather than a hard-coded name, so the
      # ``model_name`` constructor argument actually takes effect.
      model=self.model_name,
      messages=messages
    )
    return response

In [4]:
# Build the LLM client; get_llm_client() runs init_open_ai() first when the
# OpenAI organization or API key has not been set yet.
open_ai_client = get_llm_client()

In [5]:
# Smoke test: send a single user message and display the raw API response.
open_ai_client.get_response_from_prompt("hello what is your name?") 

<OpenAIObject chat.completion id=chatcmpl-89dlpxlCibCsGFp64SGZAo0LMhsWy at 0x7fce52baebd0> JSON: {
  "id": "chatcmpl-89dlpxlCibCsGFp64SGZAo0LMhsWy",
  "object": "chat.completion",
  "created": 1697308521,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Hello! I am an AI language model developed by OpenAI called GPT-3. I don't have a personal name since I am an AI program. How can I assist you today?"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 13,
    "completion_tokens": 39,
    "total_tokens": 52
  }
}

## 3 Creating Lookup Functionality 

In [12]:
def lookup_vector_db_for_context(lookup_text, vector_db, num_samples=3):
  """Query ``vector_db`` for the entries closest to ``lookup_text``.

  Args:
    lookup_text: Text forwarded to the collection as ``query_texts``.
    vector_db: Object exposing ``query(query_texts=..., n_results=...)``.
    num_samples: Number of results to request (``n_results``).

  Returns:
    The unmodified return value of ``vector_db.query``.
  """
  return vector_db.query(query_texts=lookup_text, n_results=num_samples)

In [14]:
# Retrieve the 3 stored contexts closest to an example question.
query_response = lookup_vector_db_for_context(
  "what is the common cold?", 
  vector_db=vector_db, 
  num_samples=3
)

In [16]:
query_response.keys()

dict_keys(['embeddings', 'documents', 'ids', 'metadatas', 'distances'])

In [17]:
query_response['documents']

  'To examine patterns of knowledge and attitudes among adults aged>65 years unvaccinated for influenza. Surveyed Medicare beneficiaries in 5 areas; clustered unvaccinated seniors by their immunization related knowledge and attitudes. Identified 4 clusters: Potentials (45%) would receive influenza vaccine to prevent disease; Fearful Uninformeds (9%) were unsure if influenza vaccine causes illness; Doubters (27%) were unsure if vaccine is efficacious; Misinformeds (19%) believed influenza vaccine causes illness. More Potentials (75%) and Misinformeds (70%) ever received influenza vaccine than did Fearful Uninformeds (18%) and Doubters (29%).',
  'A short course of systemic corticosteroids is an important therapy in the treatment of pediatric asthma exacerbations. Although a 5-day course of oral prednisone or prednisolone has become the most commonly used regimen, dexamethasone has also been used for a shorter duration (1-2 days) with potential for improvement in compliance and palatabil

In [46]:
type(query_response)

dict

In [36]:
dir(query_response)

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [40]:
query_response.keys()

dict_keys(['embeddings', 'documents', 'ids', 'metadatas', 'distances'])

In [43]:
# Same example question as before, this time asking for only 2 results.
# Rebinds query_response, so earlier inspection cells now see this result.
query_response = lookup_vector_db_for_context(
  "what is the common cold?", 
  vector_db=vector_db, 
  num_samples=2
)

In [45]:
query_response['documents'][0]

 'To examine patterns of knowledge and attitudes among adults aged>65 years unvaccinated for influenza. Surveyed Medicare beneficiaries in 5 areas; clustered unvaccinated seniors by their immunization related knowledge and attitudes. Identified 4 clusters: Potentials (45%) would receive influenza vaccine to prevent disease; Fearful Uninformeds (9%) were unsure if influenza vaccine causes illness; Doubters (27%) were unsure if vaccine is efficacious; Misinformeds (19%) believed influenza vaccine causes illness. More Potentials (75%) and Misinformeds (70%) ever received influenza vaccine than did Fearful Uninformeds (18%) and Doubters (29%).']

In [31]:
def get_prompt_template(prompt_question, context, prompt_template=None):
  """Fill a prompt template with a question and its retrieved context.

  Args:
    prompt_question: Text substituted for ``{prompt}`` in the template.
    context: Iterable of context strings, joined with newlines and substituted
      for ``{context}``. A single string is treated as one context entry.
    prompt_template: Format string containing ``{prompt}`` and ``{context}``
      placeholders. Defaults to the module-level ``PROMPT_RESPONSE_TEMPLATE``.

  Returns:
    The formatted prompt string.
  """
  # Resolve the default at call time: binding PROMPT_RESPONSE_TEMPLATE in the
  # signature requires it to exist when this function is defined, which fails
  # with NameError if this cell runs before the one defining the template.
  if prompt_template is None:
    prompt_template = PROMPT_RESPONSE_TEMPLATE
  # '\n'.join on a bare string would insert a newline between every character.
  if isinstance(context, str):
    context = [context]
  return prompt_template.format(prompt=prompt_question, context='\n'.join(context))

## 4 Prompt Template Generator 

In [18]:
# Format string for the LLM prompt. It has two placeholders: {prompt} for the
# user's question and {context} for the retrieved passages.
PROMPT_RESPONSE_TEMPLATE = """
You are a medical expert. Answer the following prompt, given the following context to use as your aid

prompt: {prompt}

context:
{context}
"""

In [19]:
print(PROMPT_RESPONSE_TEMPLATE.format(prompt=' what is medicine? ', context=' medicine is fun!'))


You are a medical expert. Answer the following prompt, given the following context to use as your aid

prompt:  what is medicine? 

context:
 medicine is fun!



In [21]:
# Example user question used to exercise retrieval and prompting by hand.
test_prompt = "what is medicine?"

In [22]:
# Fetch the single closest stored context for the example question.
query_response = lookup_vector_db_for_context(test_prompt, vector_db=vector_db, num_samples=1)

In [23]:
# Take the first entry of 'documents'.
# NOTE(review): presumably Chroma returns one list of documents per query text,
# so index 0 holds the matches for our single query -- confirm against the docs.
query_response_texts = query_response['documents'][0]

In [24]:
query_response_texts



In [25]:
# Build the full LLM prompt from the template and the retrieved contexts.
# NOTE(review): this rebinds test_prompt from the bare question to the complete
# prompt, so re-running the earlier lookup cell afterwards would query with the
# whole prompt text instead of the question.
test_prompt = PROMPT_RESPONSE_TEMPLATE.format(prompt=' what is medicine? ', context='\n'.join(query_response_texts))

In [26]:
print(test_prompt)


You are a medical expert. Answer the following prompt, given the following context to use as your aid

prompt:  what is medicine? 

context:



In [27]:
# Send the context-augmented prompt to the model and display the raw response.
open_ai_client.get_response_from_prompt(test_prompt)

<OpenAIObject chat.completion id=chatcmpl-89e59MKdCIHWl81M8V4oNEqIhphV1 at 0x7fce1c495900> JSON: {
  "id": "chatcmpl-89e59MKdCIHWl81M8V4oNEqIhphV1",
  "object": "chat.completion",
  "created": 1697309719,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 184,
    "completion_tokens": 404,
    "total_tokens": 588
  }
}

## 5 RAG-Based Prompt and Response System

In [32]:
class RAGBasedPromptSystem:
  """Answer prompts by retrieving context from a vector DB and asking an LLM.

  Attributes are set from the constructor arguments:
    vector_db: collection handed to ``lookup_vector_db_for_context``.
    llm_client: object exposing ``get_response_from_prompt(prompt_text)``.
    lookup_samples: number of context documents requested per prompt.
  """

  def __init__(self, vector_db, llm_client, lookup_samples=3):
    self.lookup_samples = lookup_samples
    self.llm_client = llm_client
    self.vector_db = vector_db

  def _parse_context_texts(self, query_response):
    """Return the first entry of the query result's ``'documents'`` list."""
    documents = query_response['documents']
    return documents[0]

  def _parse_llm_response(self, llm_response):
    """Extract the message content of the first choice in an LLM response."""
    first_choice = llm_response['choices'][0]
    return first_choice['message']['content']

  def get_response(self, prompt, test=True):
    """Run retrieval, prompt construction and the LLM call for ``prompt``.

    Args:
      prompt: The user's question.
      test: When true, print the generated LLM prompt before sending it.

    Returns:
      The text content of the LLM's first choice.
    """
    # Step 1: retrieve context documents for the prompt.
    retrieved = lookup_vector_db_for_context(
      prompt,
      vector_db=self.vector_db,
      num_samples=self.lookup_samples,
    )
    context_docs = self._parse_context_texts(retrieved)

    # Step 2: build the prompt that is sent to the LLM.
    llm_prompt = get_prompt_template(prompt_question=prompt, context=context_docs)
    if test:
      print(llm_prompt)

    # Step 3: query the LLM and unwrap its answer.
    raw_answer = self.llm_client.get_response_from_prompt(llm_prompt)
    return self._parse_llm_response(raw_answer)

In [33]:
# Wire the Chroma collection and the OpenAI client into one RAG pipeline
# (lookup_samples is left at its default of 3).
rag_prompt_system = RAGBasedPromptSystem(vector_db=vector_db, llm_client=open_ai_client)

In [34]:
# End-to-end demo; test=True also prints the generated prompt before the answer.
rag_prompt_system.get_response('what is medicine?',test=True)


You are a medical expert. Answer the following prompt, given the following context to use as your aid

prompt: what is medicine?

context:
It is commonly accepted that pathological gambling results from the interaction of multiple risk factors. Among these, dopamine replacement therapy (DRT) prescribed for Parkinson disease can be cited. Another dopamine agonist, aripiprazole, could be a new risk factor. We decided to explore this potential adverse drug reaction (ADR). Based on a cohort of 166 pathological gamblers starting treatment in our department, data of each of the 8 patients treated by aripiprazole at inclusion were analyzed. The patients involved were schizophrenic or bipolar, mostly young men with a history of addictive disorders and regular gambling prior to the prescription of aripiprazole. For each one of them, the causality of aripiprazole was considered, using an algorithm. The probability that pathological gambling is actually due to aripiprazole is "possible" in 7 cas

"Medicine is a branch of healthcare that focuses on the diagnosis, treatment, and prevention of diseases and injuries. It encompasses a wide range of practices, including the use of medications, surgeries, therapies, and preventive measures. \n\nIn the given context, the first study examines the use and delivery of cough and cold medicines in children under 6 years old in an inner-city pediatric emergency department. The study found that a majority of caregivers (82%) stated that they would treat their children with cough or cold medicines, but a significant portion (72%) incorrectly dosed the medication. This highlights the importance of proper administration and dosing of medications in pediatric patients.\n\nIn the second context, the study explores the potential adverse drug reaction (ADR) of pathological gambling in patients prescribed with dopamine replacement therapy (DRT) for Parkinson's disease. Aripiprazole, a dopamine agonist, is identified as a new potential risk factor. Th

In [47]:
from enum import Enum

In [48]:
# Scratch enum built with Enum's functional API: member name -> member value.
test = Enum(
    'test',
    {
        'ASD': "asd",
        'SSDF': "ASDF",
    },
    module=__name__,
)

In [50]:
test.ASD.value

'asd'