# Installation

In [None]:
!pip install -r requirements.txt

In [None]:
pip install -i https://pypi.org/simple/ bitsandbytes

In [2]:
import importlib.metadata
import pickle
import sys

print('Python version:', sys.version)

Python version: 3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]


In [3]:
def print_package_version(package_name):
    """Print the installed version of ``package_name``.

    Prints ``"<package_name> : <version>"`` when the distribution's metadata
    can be found, and ``"<package_name> is not installed."`` otherwise.
    Nothing is returned.
    """
    try:
        message = f"{package_name} : {importlib.metadata.version(package_name)}"
    except importlib.metadata.PackageNotFoundError:
        message = f"{package_name} is not installed."
    print(message)

# Names whose installed versions are reported below.
packages = [
    "llama-index",
    "langchain",
    "llama-index-embeddings-huggingface",
    "llama_index.embeddings.langchain"
]

# One "<name> : <version>" (or "... is not installed.") line is printed per entry.
for package in packages:
    print_package_version(package)

llama-index : 0.10.30
langchain : 0.1.16
llama-index-embeddings-huggingface : 0.2.0
llama_index.embeddings.langchain : 0.1.2


# Loading Documents

Domain knowledge:

In [2]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a source you trust.
with open('domain_knowledge.pkl', 'rb') as file:
    domain_know = pickle.load(file)

# Print each key
for key in domain_know:
    print(key)

1. Irregular menstruation
2. Sore throat
3. Gastrointestinal disorders
4. Bowel sounds
5. Helicobacter infection
6. Drowsiness
7. Anemia
8. Numbness of limbs
9. Stomach pain
10. Cramps
11. Perianal pain
12. Loose stools
13. Swollen lymph nodes
14. Bacterial infection
15. Dysbiosis
16. Shortness of breath
17. Diarrhea, abdominal pain and bloating
18. Frequency and urgency of urination
19. Bitter mouth
20. Allergies
21. Indigestion
22. Hemorrhoids
23. Intestinal obstruction
24. Melena
25. Edema
26. Dehydration
27. Blood in the stool
28. Heart palpitations
29. Chest pain
30. Chest pain
31. Jaundice
32. Reflux


Disease List:

In [3]:
# Load the pickled disease list (pickle files must come from a trusted source).
with open('disease_list.pkl', 'rb') as file:
    disease_list = pickle.load(file)

print(disease_list)

['Hunger', 'Abdominal bloating', 'Anemia', 'Bacterial infection', 'Weight loss', 'Loss of appetite', 'Mucus in stool', 'Menstrual irregularities', 'Diarrhea', 'Spasm', 'Throat burning sensation', 'Chest pain', 'Phlegm', 'Difficulty breathing', 'Jaundice', 'Hiccup', 'Sneeze', 'Hemorrhoids', 'Loose stools', 'Frequent urination', 'Headache', 'Urgent urination', 'Gastrointestinal discomfort', 'Shortness of breath', 'Chills', 'Heartburn', 'Sore throat', 'Nasal congestion', 'Indigestion', 'Back pain', 'Stomach ache', 'Perianal pain', 'Fatigue', 'Allergy', 'Bowel sounds', 'Cough', 'Dark stools', 'Vomiting', 'Muscle soreness', 'Dehydration', 'Intestinal obstruction', 'Nausea', 'Numbness in limbs', 'Fever', 'Restlessness', 'Vomiting blood', 'Dysbiosis', 'Drowsiness', 'Dizziness', 'Abdominal pain', 'Rectal bleeding', 'Palpitations', 'Mental fatigue', 'Helicobacter pylori infection', 'Reflux', 'Edema', 'Gastrointestinal dysfunction', 'Enlarged lymph nodes', 'Difficulty swallowing', 'Bitter taste 

Creating the Diagnostic Questions:

In [4]:
def diagnostic_question_gen(disease_list):
  """Build one diagnostic question per entry of ``disease_list``.

  Each entry is interpolated into a fixed question template. The returned
  list has the same length and order as ``disease_list``.
  """
  diagnostic_qs = []
  for disease in disease_list:
    question = f"Does the person described in the case have {disease} symptoms? Do you think it is serious?"
    diagnostic_qs.append(question)
  return diagnostic_qs

# NOTE: the misspelled name `diagnostic_queestions` is referenced by later cells, so it is kept as-is.
diagnostic_queestions = diagnostic_question_gen(disease_list)
# Display the first generated question as the cell output.
diagnostic_queestions[0]

'Does the person described in the case have Hunger symptoms? Do you think it is serious?'

# Retrieval

2- Retrieving the most related domain knowledge:

In [8]:
from sentence_transformers import SentenceTransformer

# Diagnostic question
diag_q = diagnostic_queestions[0]

# Number of similar knowledge paragraphs
k = 2

# Embedding model (read as a global named `model` by the retrieval functions below)
model = SentenceTransformer("all-mpnet-base-v2")

## Cosine Similarity based Retrieval

In [86]:
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def retrieval_know_cosine(diag_q, domain_know, k):
    """Return the ``k`` knowledge paragraphs most similar to ``diag_q``.

    Similarity is the cosine similarity between embeddings produced by the
    notebook-level ``model``.

    Args:
        diag_q: Diagnostic question text.
        domain_know: Mapping of symptom name to knowledge paragraph (values are
            assumed to be plain text).
        k: Maximum number of paragraphs to return.

    Returns:
        A list of ``(symptom_name, paragraph, similarity)`` tuples ordered from
        most to least similar. The list is empty when ``k <= 0`` or when
        ``domain_know`` is empty, and holds fewer than ``k`` items when the
        knowledge base is smaller than ``k``.
    """
    symptom_names = list(domain_know.keys())

    # Guard first: with k == 0 the slice ``[-k:]`` would select *every* entry,
    # and an empty knowledge base cannot be passed to cosine_similarity.
    if k <= 0 or not symptom_names:
        return []

    # Encode the diagnostic question
    diag_q_emb = model.encode(diag_q)

    # Encode all paragraphs in one batched call instead of one call per paragraph
    domain_know_embs = model.encode([domain_know[symptom] for symptom in symptom_names])

    # Cosine similarity between the question and each paragraph embedding
    similarities = cosine_similarity([diag_q_emb], domain_know_embs)[0]

    # Indices of the k highest similarities, best first
    top_indices = similarities.argsort()[::-1][:k]

    return [
        (symptom_names[idx], domain_know[symptom_names[idx]], similarities[idx])
        for idx in top_indices
    ]


In [87]:
results1 = retrieval_know_cosine(diag_q, domain_know, k)

for symptom, text, similarity in results1:
    print(f"Symptom: {symptom}, Similarity: {similarity}\nText: {text}\n")

Symptom: 3. Gastrointestinal disorders, Similarity: 0.45307421684265137
Text: It is a type of gastrointestinal disease. Some patients often do not pay enough attention to it and do not go to the hospital for diagnosis and treatment in time, causing great harm to themselves. So, what are gastrointestinal disorders? What are the symptoms of gastrointestinal disorders? A doctor from Chongqing Dongda Anorectal Hospital said that gastrointestinal dysfunction, also known as gastrointestinal neurosis, is a functional disease. It is generally believed that mental factors are the main causes of this disease, such as emotional stress, anxiety, difficulties in life and work, worries, accidents, etc., which can lead to abnormal gastrointestinal function and further cause gastrointestinal dysfunction. Most gastrointestinal disorders have a slow onset, often last for years, and may be persistent or recurring. People suffering from gastrointestinal disorders often present with acid reflux, belching, 

## Hopfield Retrieval Model

In [5]:
from hflayers import Hopfield, HopfieldPooling, HopfieldLayer
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
import torch.nn as nn
import torch

class HopfieldRetrievalModel(nn.Module):
    """Scores stored patterns against a query by passing both through a Hopfield layer.

    The layer is configured with every pattern marked static, every
    normalization switched off, and the output projection disabled.

    Args:
        beta: Passed to the Hopfield layer as ``scaling``.
        update_steps_max: Passed to the Hopfield layer as ``update_steps_max``.
    """

    def __init__(self, beta=0.125, update_steps_max=3):
        super().__init__()
        hopfield_options = dict(
            scaling=beta,
            update_steps_max=update_steps_max,
            update_steps_eps=1e-5,
            # Patterns are used as given (no learned projections).
            state_pattern_as_static=True,
            stored_pattern_as_static=True,
            pattern_projection_as_static=True,
            # No normalization inside the layer.
            normalize_stored_pattern=False,
            normalize_stored_pattern_affine=False,
            normalize_state_pattern=False,
            normalize_state_pattern_affine=False,
            normalize_pattern_projection=False,
            normalize_pattern_projection_affine=False,
            disable_out_projection=True,
        )
        self.hopfield = Hopfield(**hopfield_options)

    def forward(self, memory, trg):
        """Return similarities between the Hopfield output for ``trg`` and each row of ``memory``.

        Both inputs get a leading dimension of size 1 before the layer is called
        (presumably a batch dimension; confirm against hflayers). ``memory`` is
        supplied twice in the tuple passed to the layer. The result is the
        matrix product of the row-normalized layer output with the
        row-normalized ``memory``, i.e. one row of scores per row of ``trg``.
        """
        stored = memory.unsqueeze(0)
        query = trg.unsqueeze(0)
        retrieved = self.hopfield((stored, query, stored)).squeeze(0)
        stored_rows = stored.squeeze(0)
        return F.normalize(retrieved) @ F.normalize(stored_rows).t()

In [6]:
def retrieval_know_hopfield(diag_q, domain_know, k):
    """Return the ``k`` knowledge paragraphs ranked highest by the Hopfield retrieval model.

    Embeddings come from the notebook-level ``model``; scoring is done by a
    freshly constructed ``HopfieldRetrievalModel`` on the GPU when available.

    Args:
        diag_q: Diagnostic question text.
        domain_know: Mapping of symptom name to knowledge paragraph (values are
            assumed to be plain text).
        k: Maximum number of paragraphs to return.

    Returns:
        A list of ``(symptom_name, paragraph, score)`` tuples, best first.
        Empty when ``k <= 0`` or ``domain_know`` is empty; shorter than ``k``
        when the knowledge base has fewer than ``k`` entries.
    """
    symptom_names = list(domain_know.keys())

    # torch.topk raises when asked for more items than exist, so clamp k.
    top_k = min(k, len(symptom_names))
    if top_k <= 0:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    retrieval_model = HopfieldRetrievalModel().to(device)

    # Encode the diagnostic question and move to device
    diag_q_emb = torch.tensor(model.encode([diag_q])).to(device)

    # Encode every paragraph in one batched call and move to device
    paragraphs = [domain_know[symptom] for symptom in symptom_names]
    domain_know_embs = torch.tensor(model.encode(paragraphs)).to(device)

    # Inference only, so no autograd bookkeeping is needed.
    # NOTE(review): both embeddings are scaled by 100 before the layer,
    # presumably to sharpen the association; confirm the intent.
    with torch.no_grad():
        result = retrieval_model(domain_know_embs * 100, diag_q_emb * 100)

    top_indices = torch.topk(result, top_k, dim=1).indices[0].tolist()

    # Retrieve the top-k paragraphs together with their scores
    return [
        (symptom_names[idx], domain_know[symptom_names[idx]], result[0, idx].item())
        for idx in top_indices
    ]

In [9]:
results = retrieval_know_hopfield(diag_q, domain_know, k)

for symptom, text, similarity in results:
    print(f"Symptom: {symptom}, Similarity: {similarity}\nText: {text}\n")

Symptom: 3. Gastrointestinal disorders, Similarity: 0.9999999403953552
Text: It is a type of gastrointestinal disease. Some patients often do not pay enough attention to it and do not go to the hospital for diagnosis and treatment in time, causing great harm to themselves. So, what are gastrointestinal disorders? What are the symptoms of gastrointestinal disorders? A doctor from Chongqing Dongda Anorectal Hospital said that gastrointestinal dysfunction, also known as gastrointestinal neurosis, is a functional disease. It is generally believed that mental factors are the main causes of this disease, such as emotional stress, anxiety, difficulties in life and work, worries, accidents, etc., which can lead to abnormal gastrointestinal function and further cause gastrointestinal dysfunction. Most gastrointestinal disorders have a slow onset, often last for years, and may be persistent or recurring. People suffering from gastrointestinal disorders often present with acid reflux, belching, a

# Prompt generation

In [83]:
def generate_prompt(patient_id, domain_know, diag_q, k):
    """Build the LLM prompt for one patient, reading the report from disk.

    The report is read from
    ``./dataset_folder/health_report_{<id>}/health_report_{<id>}.txt`` (the
    braces are literal characters in the folder and file names).

    Args:
        patient_id: Identifier inserted into the report path.
        domain_know: Knowledge base handed to ``retrieval_know_hopfield``.
        diag_q: Diagnostic question placed in the prompt and used for retrieval.
        k: Number of knowledge paragraphs to retrieve.

    Returns:
        The assembled prompt, or a string starting with ``"Error:"`` when the
        report cannot be read.
    """
    patient_id_str = str(patient_id)
    sample_report_path = f"./dataset_folder/health_report_{{{patient_id_str}}}/health_report_{{{patient_id_str}}}.txt"
    print("Prompt generating for:", sample_report_path)  # Debugging line

    # Load the report; failures are reported through the return value.
    try:
        with open(sample_report_path, 'r', encoding='utf-8') as file:
            medical_report = file.read()
    except FileNotFoundError:
        return "Error: The medical report file was not found."
    except Exception as e:
        return f"Error: {str(e)}"

    # One "<name>: <paragraph>" line per retrieved document.
    knowledge_lines = []
    for doc in retrieval_know_hopfield(diag_q, domain_know, k):
        knowledge_lines.append(f"{doc[0]}: {doc[1]}")
    related_knowledge = "\n".join(knowledge_lines)

    rule = "----------------------------------------------------------------------\n"
    sections = [
        "Here is some additional professional health knowledge that can help you better analyze the report:\n",
        rule,
        f"{related_knowledge}\n",
        rule,
        "This is a patient’s medical record. Context information:\n",
        rule,
        f"{medical_report}\n",
        rule,
        "Given the context and health knowledge, answer the below question by only one answer in JSON format with only one floating point number between 0 and 1 that is “score”. :\n",
        f"{diag_q}\n",
        "The rule of the JSON answer: 0-0.2 is mild or none, 0.3-0.6 is moderate, and above 0.7 is severe.\n",
    ]
    return "".join(sections)


Sample usage:

In [None]:
# Example usage
patient_id = 0
diag_q = diagnostic_queestions[0]
k = 2

prompt = generate_prompt(patient_id, domain_know, diag_q, k)
print(prompt)

Over the whole dataset:

In [10]:
def generate_prompt(medical_report, domain_know, diag_q, k, retriever=None):
    """Build the LLM prompt for one medical report and one diagnostic question.

    Args:
        medical_report: Full text of the patient's report.
        domain_know: Knowledge base handed to the retriever.
        diag_q: Diagnostic question placed in the prompt and used for retrieval.
        k: Number of knowledge paragraphs to retrieve.
        retriever: Optional callable ``(diag_q, domain_know, k)`` returning an
            iterable of tuples whose first two items are the symptom name and
            its paragraph. Defaults to ``retrieval_know_hopfield``; pass
            ``retrieval_know_cosine`` to use the cosine-similarity retriever.

    Returns:
        The assembled prompt string.
    """
    # Resolved at call time so the default still follows the notebook's
    # current definition of retrieval_know_hopfield.
    if retriever is None:
        retriever = retrieval_know_hopfield

    related_documents = retriever(diag_q, domain_know, k)
    related_knowledge = "\n".join([f"{doc[0]}: {doc[1]}" for doc in related_documents])

    # Assemble the prompt
    prompt = (
        "Here is some additional professional health knowledge that can help you better analyze the report:\n"
        "----------------------------------------------------------------------\n"
        f"{related_knowledge}\n"
        "----------------------------------------------------------------------\n"
        "This is a patient’s medical record. Context information:\n"
        "----------------------------------------------------------------------\n"
        f"{medical_report}\n"
        "----------------------------------------------------------------------\n"
        "Given the context and health knowledge, answer the below question by only one answer in JSON format with only one floating point number between 0 and 1 that is “score”. :\n"
        f"{diag_q}\n"
        "The rule of the JSON answer: 0-0.2 is mild or none, 0.3-0.6 is moderate, and above 0.7 is severe.\n"
    )

    return prompt


In [13]:
%%time 
import os
import glob

base_dir = "dataset_folder" 
pattern = os.path.join(base_dir, "health_report_*")
diagnostic_questions = diagnostic_question_gen(disease_list)

prompts = {}

# Iterate over each directory matching the pattern
for directory in glob.glob(pattern):
    file_name = os.path.basename(directory)
    report_path = os.path.join(directory, file_name + ".txt")
    
    # Read the medical report
    with open(report_path, 'r', encoding='utf-8') as file:
        medical_report = file.read()

    print("Prompt generation for:", file_name)  # Debugging line
    
    prompts[file_name] = []
    for diag_q in diagnostic_questions:
        prompt = generate_prompt(medical_report, domain_know, diag_q, k)
        
        # Store or process the prompt as needed
        prompts[file_name].append(prompt)


Prompt generation for: health_report_{0}
Prompt generation for: health_report_{10}
Prompt generation for: health_report_{11}
Prompt generation for: health_report_{12}
Prompt generation for: health_report_{13}
Prompt generation for: health_report_{14}
Prompt generation for: health_report_{15}
Prompt generation for: health_report_{16}
Prompt generation for: health_report_{17}
Prompt generation for: health_report_{18}
Prompt generation for: health_report_{19}
Prompt generation for: health_report_{1}
Prompt generation for: health_report_{20}
Prompt generation for: health_report_{21}
Prompt generation for: health_report_{22}
Prompt generation for: health_report_{23}
Prompt generation for: health_report_{24}
Prompt generation for: health_report_{25}
Prompt generation for: health_report_{26}
Prompt generation for: health_report_{28}
Prompt generation for: health_report_{29}
Prompt generation for: health_report_{2}
Prompt generation for: health_report_{30}
Prompt generation for: health_report_

In [None]:
for key in prompts.keys():
    print(key)

dataset_folder\health_report_{0}\health_report_{0}.txt
dataset_folder\health_report_{10}\health_report_{10}.txt
dataset_folder\health_report_{11}\health_report_{11}.txt
dataset_folder\health_report_{12}\health_report_{12}.txt
dataset_folder\health_report_{13}\health_report_{13}.txt
dataset_folder\health_report_{14}\health_report_{14}.txt
dataset_folder\health_report_{15}\health_report_{15}.txt
dataset_folder\health_report_{16}\health_report_{16}.txt
dataset_folder\health_report_{17}\health_report_{17}.txt
dataset_folder\health_report_{18}\health_report_{18}.txt
dataset_folder\health_report_{19}\health_report_{19}.txt
dataset_folder\health_report_{1}\health_report_{1}.txt
dataset_folder\health_report_{20}\health_report_{20}.txt
dataset_folder\health_report_{21}\health_report_{21}.txt
dataset_folder\health_report_{22}\health_report_{22}.txt
dataset_folder\health_report_{23}\health_report_{23}.txt
dataset_folder\health_report_{24}\health_report_{24}.txt
dataset_folder\health_report_{25}\h

In [16]:
with open( 'generated_prompts.pkl', 'wb') as file:
    pickle.dump(prompts, file)

# Response Generation 

In [19]:
import pickle

with open('generated_prompts.pkl', 'rb') as file: prompts = pickle.load(file)

sample_prompt = prompts['health_report_{0}'][0]
sample_prompt

"Here is some additional professional health knowledge that can help you better analyze the report:\n----------------------------------------------------------------------\n3. Gastrointestinal disorders: It is a type of gastrointestinal disease. Some patients often do not pay enough attention to it and do not go to the hospital for diagnosis and treatment in time, causing great harm to themselves. So, what are gastrointestinal disorders? What are the symptoms of gastrointestinal disorders? A doctor from Chongqing Dongda Anorectal Hospital said that gastrointestinal dysfunction, also known as gastrointestinal neurosis, is a functional disease. It is generally believed that mental factors are the main causes of this disease, such as emotional stress, anxiety, difficulties in life and work, worries, accidents, etc., which can lead to abnormal gastrointestinal function and further cause gastrointestinal dysfunction. Most gastrointestinal disorders have a slow onset, often last for years, a

**If your torch can't detect your GPU, execute this in the terminal:**

- pip uninstall torch torchvision torchaudio
- pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
pip install gputil

In [10]:
import GPUtil

for gpu in GPUtil.getGPUs():
    print(f"GPU Model: {gpu.name}")
    print(f"Total GPU Memory: {gpu.memoryTotal / 1024:.2f} GB")


GPU Model: NVIDIA GeForce RTX 3060
Total GPU Memory: 12.00 GB


In [11]:
import torch

print("torch + cuda:", torch.__version__) # Check PyTorch version
print("Is cuda avialable:", torch.cuda.is_available())

torch + cuda: 2.2.2+cu121
Is cuda avialable: True


## HuggingFace LLMs

**For some LLMs on HuggingFace like Mistral, you first need to get access approval. So please follow these steps:**

- First go to https://huggingface.co/docs/transformers/en/model_doc/mistral to get access.
- Then go to the HuggingFace account and generate a new access token.

In [12]:
# The Hugging Face access token is read from the HF_TOKEN environment variable
# (None when the variable is not set), so no secret is stored in the notebook.
import os
hf_token = os.getenv('HF_TOKEN')

In [14]:
# Registry of the LLMs used in this notebook, keyed by a short model name;
# load_model() looks its settings up here. Per entry:
#   remote_model_name : model identifier passed to from_pretrained when downloading
#   model_path        : local directory for the model (absolute, machine-specific path)
#   tokenizer_path    : local directory for the tokenizer (absolute, machine-specific path)
#   hf_token          : optional access token (only present for Mistral-7B)
#   additional_config : load options; load_model() reads only "torch_dtype" from it
llms_info = {
    "Mistral-7B": {# Size: 40 GB, system RAM: 19 GB (Windows)
        "remote_model_name": "mistralai/Mistral-7B-v0.1",
        "model_path": "C:/Users/Admin/Desktop/LLMs/Mistral-7B-v0-1",
        "tokenizer_path": "C:/Users/Admin/Desktop/Tokenizers/Mistral-7B-v0-1",
        "hf_token": hf_token,
        "additional_config": {
            "torch_dtype": "auto",
            "device": "auto"
        }
    },
    "PHI-2": {# Size: 15 GB, system RAM: 3 GB (Windows)
        "remote_model_name": "microsoft/phi-2",
        "model_path": "C:/Users/Admin/Desktop/LLMs/PHI-2",
        "tokenizer_path": "C:/Users/Admin/Desktop/Tokenizers/PHI-2",
        "additional_config": {
            "torch_dtype": "auto",
            "trust_remote_code": True,
            "device": "auto"
        }
    }
}

In [15]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_key):
    """Load the causal language model and tokenizer registered under ``model_key``.

    When both local directories already contain a saved model/tokenizer they
    are loaded from disk. Otherwise the model and tokenizer are downloaded,
    saved to the configured directories, and the downloaded objects are
    returned directly.

    Args:
        model_key: Key into the notebook-level ``llms_info`` registry.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        KeyError: If ``model_key`` is not present in ``llms_info``.
    """
    if model_key not in llms_info:
        raise KeyError(f"Unknown model key {model_key!r}; available keys: {sorted(llms_info)}")

    model_info = llms_info[model_key]
    config = model_info["additional_config"]
    model_path = model_info["model_path"]
    tokenizer_path = model_info["tokenizer_path"]

    # "auto" maps to None (the library's default dtype); any other value names
    # a torch dtype attribute such as "float16". The same dtype is applied to
    # both the download and the local load.
    dtype_name = config.get("torch_dtype", "auto")
    torch_dtype = getattr(torch, dtype_name) if dtype_name != "auto" else None

    # A directory alone does not prove a finished download (it is created as
    # soon as the download cache is written), so check for the files that
    # save_pretrained produces.
    model_saved = os.path.isfile(os.path.join(model_path, "config.json"))
    tokenizer_saved = os.path.isfile(os.path.join(tokenizer_path, "tokenizer_config.json"))

    if model_saved and tokenizer_saved:
        print(f"{model_key} model and tokenizer are already present.")
        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        return model, tokenizer

    print(f"Downloading and saving model and tokenizer for {model_key}.")
    # Include the token in the download process if applicable.
    # `token` replaces the deprecated `use_auth_token` argument.
    hf_token = model_info.get("hf_token", None)
    model = AutoModelForCausalLM.from_pretrained(
        model_info["remote_model_name"],
        cache_dir=model_path,
        torch_dtype=torch_dtype,
        token=hf_token,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_info["remote_model_name"],
        cache_dir=tokenizer_path,
        token=hf_token,
    )

    # Save them locally so the next call can skip the download
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(tokenizer_path, exist_ok=True)
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(tokenizer_path)

    # Return the objects already in memory; loading a second copy from disk
    # would briefly hold two full models in RAM.
    return model, tokenizer


In [None]:
model, tokenizer = load_model("PHI-2")

In [17]:
import warnings

warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [18]:
def compute_diagnosis_score(prompt, model, tokenizer, device,
                            max_new_tokens=30, do_sample=False,
                            temperature=0.7, top_k=30, top_p=0.95):
    """Generate the model's textual answer to ``prompt``.

    Despite the name, the raw generated text is returned (stripped of
    surrounding whitespace); extracting the numeric score is left to the caller.

    Args:
        prompt: Prompt text.
        model: Causal language model exposing ``to`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the model and inputs are moved to.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: When False (default) decoding is greedy, which is what the
            previous version effectively did because the sampling settings
            were passed without enabling sampling. Set True to apply
            ``temperature``, ``top_k`` and ``top_p``.
        temperature: Sampling temperature (used only when ``do_sample``).
        top_k: Number of top tokens considered (used only when ``do_sample``).
        top_p: Nucleus sampling threshold (used only when ``do_sample``).

    Returns:
        The newly generated text, without the prompt.
    """
    # Ensure the model is on the right device
    model = model.to(device)

    # Tokenize the input and ensure tensors are on the correct device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]

    # Passing pad_token_id explicitly avoids the "Setting `pad_token_id` to
    # `eos_token_id`" notice for tokenizers that define no padding token.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,   # Limits the number of tokens generated
        "num_return_sequences": 1,          # Ensure only one sequence is generated
        "do_sample": do_sample,
    }
    if pad_token_id is not None:
        gen_kwargs["pad_token_id"] = pad_token_id
    if do_sample:
        gen_kwargs.update(temperature=temperature, top_k=top_k, top_p=top_p)

    outputs = model.generate(**inputs, **gen_kwargs)

    # Drop the prompt by token count: the decoded prompt is not guaranteed to
    # have the same number of characters as the original prompt string, so
    # slicing the decoded text by len(prompt) can cut in the wrong place.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


In [19]:
%%time

# Generating the response
response = compute_diagnosis_score(sample_prompt, model, tokenizer, device)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


CPU times: total: 11.2 s
Wall time: 38.4 s


In [20]:
print(response)

The answer:
{
"score": 0.5
}
----------------------------------------------------------------------
The following is a patient’s medical record. Context


*Answers:*

- PHI-2: 
    (In 38 seconds, around 3 GB RAM usage for loading)

    The answer:
    {
    "score": 0.5
    }
    
- Mistral-7b:
    (In 4.3 minutes, around 19 GB RAM usage for loading)
    
    The answer is:
    {
    "score": 0.3
    }
    


Based on the researchers' results (result2000.txt), the answer should probably be 0.2.

## OpenAI LLMs

In [29]:
# OPENAI_API_KEY="your-api-key"

In [44]:
import openai

def generate_answer(model, prompt):
    """Send ``prompt`` to an OpenAI chat model and return the reply text.

    The API key is taken from a notebook-level ``OPENAI_API_KEY`` variable when
    one is defined, otherwise from the ``OPENAI_API_KEY`` environment variable,
    so the key does not have to be written into the notebook.

    Args:
        model: Model identifier, e.g. "gpt-3.5-turbo".
        prompt: Text sent as the user message.

    Returns:
        The content of the first returned choice, or ``None`` when any error
        occurs (the error is printed).
    """
    import os  # local import so this cell does not depend on earlier import cells

    try:
        # Prefer an explicitly defined notebook variable, fall back to the environment.
        api_key = globals().get("OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")
        client = openai.OpenAI(api_key=api_key)

        # Generate a response from the model
        chat_completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
        )

        # Retrieve and return the response text
        return chat_completion.choices[0].message.content

    except Exception as e:
        # Best-effort by design: report the problem and let the notebook continue.
        print(f"An error occurred: {e}")
        return None


In [51]:
# using GPT-3
answer_gpt3 = generate_answer("gpt-3.5-turbo", sample_prompt)
print("GPT-3 Answer:", answer_gpt3)

GPT-3 Answer: {
  "score": 0.4
}


In [49]:
# using GPT-4
answer_gpt4 = generate_answer("gpt-4-turbo", sample_prompt)
print("GPT-4 Answer:", answer_gpt4)

GPT-4 Answer: ```json
{
  "score": 0.1
}
```
