In [3]:
!pip install transformers datasets faiss-gpu faiss-cpu

Collecting datasets
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting faiss-gpu
  Using cached faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Using cached datasets-3.1.0-py3-none-any.whl (480 kB)
Using cached faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.

In [4]:
import faiss
import numpy as np
from transformers import AutoTokenizer, TFAutoModel, AutoModelForCausalLM
from datasets import load_dataset
import tensorflow as tf
import json
import torch
from tqdm import tqdm

# Dataset and model configuration
DATASET_NAME = "QuyenAnhDE/Diseases_Symptoms"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
GENERATIVE_MODEL_NAME = "microsoft/Phi-3.5-mini-instruct"
INDEX_FILE = "diseases_symptoms_index.faiss"
EMBEDDINGS_FILE = "diseases_symptoms_embeddings.npy"
DOCS_FILE = "diseases_symptoms_docs.json"

# Set up device
device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
print(f"Using device: {device}")

# Load the Diseases_Symptoms dataset
print("Loading dataset...")
dataset = load_dataset(DATASET_NAME, split="train")

# Preprocess dataset into a format suitable for retrieval
def preprocess_dataset(dataset):
    """Extract and preprocess diseases and symptoms data."""
    documents = []
    for entry in dataset:
        disease = entry["Name"]
        symptoms = entry["Symptoms"]
        documents.append({
            "disease": disease,
            "symptoms": symptoms,
            "combined": f"Disease: {disease}. Symptoms: {symptoms}"
        })
    return documents

print("Processing dataset...")
documents = preprocess_dataset(dataset)

# Save the documents for later use
with open(DOCS_FILE, "w", encoding="utf-8") as f:
    json.dump(documents, f)
print(f"Processed and saved {len(documents)} records.")

# Load embedding model
print("Loading embedding model...")
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = TFAutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

# Generate embeddings for each document
def generate_embeddings(documents, batch_size=16):
    """Generate embeddings for documents."""
    embeddings = []
    for i in tqdm(range(0, len(documents), batch_size), desc="Generating embeddings"):
        batch_texts = [doc["combined"] for doc in documents[i:i + batch_size]]
        inputs = tokenizer(batch_texts, return_tensors="tf", padding=True, truncation=True, max_length=512)
        with tf.device(device):
            outputs = model(inputs)
        batch_embeddings = tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

print("Generating embeddings...")
embeddings = generate_embeddings(documents)

# Save embeddings for reuse
np.save(EMBEDDINGS_FILE, embeddings)
print(f"Saved embeddings for {len(embeddings)} records.")

# Build and save FAISS index
print("Building FAISS index...")
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings.astype("float32"))
faiss.write_index(index, INDEX_FILE)
print(f"Saved FAISS index to {INDEX_FILE}.")

# Load retrieval components
def load_retrieval_components():
    """Load documents, embeddings, and FAISS index."""
    with open(DOCS_FILE, "r", encoding="utf-8") as f:
        docs = json.load(f)
    embeddings = np.load(EMBEDDINGS_FILE)
    index = faiss.read_index(INDEX_FILE)
    return docs, embeddings, index

# Retrieve top-k relevant documents
def retrieve_documents(query, docs, index, top_k=5):
    """Retrieve relevant documents based on a query."""
    inputs = tokenizer([query], return_tensors="tf", padding=True, truncation=True, max_length=512)
    with tf.device(device):
        outputs = model(inputs)
    query_embedding = tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy().astype("float32")
    distances, indices = index.search(query_embedding, top_k)
    return [docs[idx] for idx in indices[0]]

# Load generative model for response generation
print("Loading generative model...")
gen_tokenizer = AutoTokenizer.from_pretrained(GENERATIVE_MODEL_NAME, trust_remote_code=True)
gen_model = AutoModelForCausalLM.from_pretrained(GENERATIVE_MODEL_NAME, trust_remote_code=True).to("cuda")

# Generate diagnostic response
def generate_response(query, retrieved_docs, max_new_tokens=100):
    """Generate a response based on query and retrieved documents."""
    context = " ".join([f"Disease: {doc['disease']}. Symptoms: {doc['symptoms']}" for doc in retrieved_docs])
    input_text = f"User Query: {query}\n\nRelevant Information:\n{context}\n\nAnswer:"
    inputs = gen_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs = gen_model.generate(inputs["input_ids"], max_new_tokens=max_new_tokens)
    return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the decision support system
print("Testing the decision support system...")
docs, embeddings, index = load_retrieval_components()

test_query = "What could be the possible diagnosis for fatigue, weight loss, and frequent urination?"
retrieved_docs = retrieve_documents(test_query, docs, index)

print(f"Query: {test_query}")
print("\nRetrieved Contexts:")
for i, doc in enumerate(retrieved_docs, 1):
    print(f"{i}. Disease: {doc['disease']}")
    print(f"   Symptoms: {doc['symptoms']}")

response = generate_response(test_query, retrieved_docs)
print("\nGenerated Response:")
print(response)

print("Decision support system ready.")


Using device: /GPU:0
Loading dataset...


README.md:   0%|          | 0.00/381 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Diseases_Symptoms.csv:   0%|          | 0.00/107k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

Processing dataset...
Processed and saved 400 records.
Loading embedding model...


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Generating embeddings...


Generating embeddings: 100%|██████████| 25/25 [00:04<00:00,  6.10it/s]


Saved embeddings for 400 records.
Building FAISS index...
Saved FAISS index to diseases_symptoms_index.faiss.
Loading generative model...


tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

Testing the decision support system...
Query: What could be the possible diagnosis for fatigue, weight loss, and frequent urination?

Retrieved Contexts:
1. Disease: Chronic Kidney Disease
   Symptoms: Fatigue, swelling of the legs or ankles, decreased appetite, difficulty concentrating, increased urination or urine changes, blood in urine, high blood pressure
2. Disease: Urinary Stones (Kidney Stones)
   Symptoms: Severe abdominal or back pain, blood in urine, frequent urination, pain during urination
3. Disease: Urinary Tract Infection (UTI)
   Symptoms: Frequent urination, burning sensation during urination, cloudy or bloody urine, pelvic pain
4. Disease: Urinary Tract Infection (UTI)
   Symptoms: Painful urination, frequent urination, pelvic pain, cloudy urine
5. Disease: Urethral Valves
   Symptoms: Difficulty or poor urinary stream, urinary tract infections, urinary frequency or urgency, abdominal distension, poor weight gain


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48



Generated Response:
User Query: What could be the possible diagnosis for fatigue, weight loss, and frequent urination?

Relevant Information:
Disease: Chronic Kidney Disease. Symptoms: Fatigue, swelling of the legs or ankles, decreased appetite, difficulty concentrating, increased urination or urine changes, blood in urine, high blood pressure Disease: Urinary Stones (Kidney Stones). Symptoms: Severe abdominal or back pain, blood in urine, frequent urination, pain during urination Disease: Urinary Tract Infection (UTI). Symptoms: Frequent urination, burning sensation during urination, cloudy or bloody urine, pelvic pain Disease: Urinary Tract Infection (UTI). Symptoms: Painful urination, frequent urination, pelvic pain, cloudy urine Disease: Urethral Valves. Symptoms: Difficulty or poor urinary stream, urinary tract infections, urinary frequency or urgency, abdominal distension, poor weight gain

Answer: The possible diagnosis for fatigue, weight loss, and frequent urination could be 