In [1]:

!pip install datasets pandas pymongo sentence_transformers
!pip install -U transformers
# Install below if using GPU
!pip install accelerate

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pymongo
  Downloading pymongo-4.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.6.1-py3-none-any.whl.metadata (5.8 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [22]:
# Load Dataset
from datasets import load_dataset
import pandas as pd

# https://huggingface.co/datasets/QuyenAnhDE/Diseases_Symptoms
dataset = load_dataset("QuyenAnhDE/Diseases_Symptoms")

# Convert the "train" split of the dataset to a pandas dataframe
dataset_df = pd.DataFrame(dataset["train"])

# Preview the first five rows
dataset_df.head(5)

Downloading readme:   0%|          | 0.00/381 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/107k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

Unnamed: 0,Code,Name,Symptoms,Treatments
0,1,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o...","Antidepressant medications, Cognitive Behavior..."
1,2,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue","Voice Rest, Speech Therapy, Surgical Removal"
2,3,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck...","Growth hormone therapy, Estrogen replacement t..."
3,4,Cryptorchidism,"Absence or undescended testicle(s), empty scro...",Observation and monitoring (in cases of mild o...
4,5,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala...","Supportive Measures, Gastric Decontamination, ..."


In [23]:
from sentence_transformers import SentenceTransformer

# https://huggingface.co/thenlper/gte-large
# Shared embedding model: get_embedding() below uses it for both the stored
# symptom texts and the user queries.
embedding_model = SentenceTransformer("thenlper/gte-large")

def get_embedding(text: str) -> list[float]:
    """Return the embedding of ``text`` as a plain list of floats.

    Args:
        text: The text to embed.

    Returns:
        The vector produced by ``embedding_model.encode`` converted with
        ``.tolist()``. An empty list is returned (and a message printed) when
        ``text`` is empty, whitespace-only, or not a string at all.
    """
    # Missing values in a DataFrame column arrive as None/NaN rather than str;
    # treat them like empty text instead of failing on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    return embedding.tolist()


# Embed the Symptoms text of every row (get_embedding is called once per row)
# and store the resulting vector in a new "Embedding" column.
dataset_df["Embedding"] = dataset_df["Symptoms"].apply(get_embedding)

dataset_df.head()

Unnamed: 0,Code,Name,Symptoms,Treatments,Embedding
0,1,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o...","Antidepressant medications, Cognitive Behavior...","[-9.219052299158648e-05, -0.000639018253423273..."
1,2,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue","Voice Rest, Speech Therapy, Surgical Removal","[-0.008162464946508408, -0.023044483736157417,..."
2,3,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck...","Growth hormone therapy, Estrogen replacement t...","[-0.01654217764735222, -0.024435976520180702, ..."
3,4,Cryptorchidism,"Absence or undescended testicle(s), empty scro...",Observation and monitoring (in cases of mild o...,"[-0.01114206574857235, 0.004298272076994181, -..."
4,5,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala...","Supportive Measures, Gastric Decontamination, ...","[-0.0027906012255698442, -0.009377170354127884..."


In [58]:
import pymongo
from google.colab import userdata

def get_mongo_client(mongo_uri):
  """Establish and verify a connection to MongoDB.

  Args:
    mongo_uri (str): MongoDB connection string.

  Returns:
    pymongo.MongoClient | None: A client whose server answered a ``ping``,
    or ``None`` when the connection attempt raised ``ConnectionFailure``.
  """
  client = None
  try:
    client = pymongo.MongoClient(mongo_uri, appname="devrel.content.python")
    # MongoClient() returns before any connection is made, so force a round
    # trip here; otherwise an unreachable server is only noticed on first use.
    client.admin.command("ping")
    print("Connection to MongoDB successful")
    return client
  except pymongo.errors.ConnectionFailure as e:
    print(f"Connection failed: {e}")
    # Release the background resources of a client that never connected.
    if client is not None:
      client.close()
    return None

# Read the connection string from Colab's secret store instead of embedding it
# in the notebook. The URI that was previously hard-coded in this cell exposed
# a database username and password; that password should be rotated.
mongo_uri = userdata.get("MONGO_URI")
if not mongo_uri:
  # Stop here: connecting without a URI would silently target a default host.
  raise ValueError("MONGO_URI not set in environment variables")

mongo_client = get_mongo_client(mongo_uri)

Connection to MongoDB successful


In [59]:
# Select the database and collection that the ingestion and vector-search
# cells below operate on
db = mongo_client['medical_db']
collection = db['medical_collection']

In [60]:
# Number of documents currently stored; reuse the `collection` handle defined
# above rather than repeating the database and collection names.
collection.count_documents({})

400

In [None]:
# Delete any existing records in the collection (the empty filter matches
# every document), so a following ingestion starts from an empty collection
collection.delete_many({})

In [28]:
# One dict per DataFrame row, keyed by column name (this includes the
# "Embedding" list computed earlier)
documents = dataset_df.to_dict("records")
collection.insert_many(documents)

print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [64]:
def vector_search(user_query, collection, limit=4, num_candidates=150):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    limit (int): Maximum number of matches to return. Defaults to 4.
    num_candidates (int): Number of candidate matches to consider before the
        top ``limit`` are returned. Defaults to 150.

    Returns:
    list: A list of matching documents. Each document carries the ``Name``,
    ``Symptoms`` and ``Treatments`` fields it has, plus the search ``score``.
    The list is empty when no embedding could be generated for the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding reports failure with an empty list, so test for
    # falsiness, and keep the documented return type (a list) on this path.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    vector_search_stage = {
        "$vectorSearch": {
            "index": "vector_index",
            "queryVector": query_embedding,
            "path": "Embedding",
            "numCandidates": num_candidates,  # Number of candidate matches to consider
            "limit": limit  # Return the top matches
        }
    }

    # An inclusion $project emits only the listed fields, so the large
    # 'Embedding' vector is dropped without a separate $unset stage.
    project_stage = {
        "$project": {
            "_id": 0,
            "Name": 1,  # Include the Name field
            "Symptoms": 1,  # Include the Symptoms field
            "Treatments": 1,  # Include the Treatments field
            "score": {
                "$meta": "vectorSearchScore"  # Include the search score
            }
        }
    }

    pipeline = [vector_search_stage, project_stage]

    # Execute the search
    results = collection.aggregate(pipeline)
    return list(results)

In [65]:
def get_search_result(query, collection):
    """Render the vector-search hits for ``query`` as newline-terminated text.

    Every hit returned by ``vector_search`` is echoed to stdout and turned
    into one ``Name: ..., Symptoms: ..., Treatments: ...`` line; a field the
    hit does not have is shown as ``N/A``. The lines are concatenated into a
    single string, which is empty when there are no hits.
    """
    lines = []
    for hit in vector_search(query, collection):
        print('---result', hit)
        name = hit.get('Name', 'N/A')
        symptoms = hit.get('Symptoms', 'N/A')
        treatments = hit.get('Treatments', 'N/A')
        lines.append(f"Name: {name}, Symptoms: {symptoms}, Treatments: {treatments}\n")

    return "".join(lines)

In [66]:
%%time
# Run the query and retrieve the matching source records
query = "I have some symptoms such as severe abdominal pain, increased heart rate and bloating. How can I treat that?"
source_information = get_search_result(query, collection)
source_information

---result {'Name': 'Peritonitis', 'Symptoms': 'Severe abdominal pain, tenderness, bloating, fever, nausea, vomiting, loss of appetite, increased heart rate', 'Treatments': 'Intravenous antibiotics to treat the infection, intravenous fluids to restore hydration, drainage or removal of any underlying fluid or abscess, surgery to repair or remove the source of infection', 'score': 0.9516547322273254}
---result {'Name': 'Diverticulosis', 'Symptoms': 'Abdominal pain, bloating, changes in bowel movements', 'Treatments': 'High-fiber diet, medications (pain relievers, antibiotics)', 'score': 0.9473992586135864}
---result {'Name': 'Indigestion', 'Symptoms': 'Abdominal pain, bloating, heartburn, nausea, acidic taste', 'Treatments': 'Lifestyle changes (e.g., dietary modifications, stress reduction), over-the-counter antacids, medication, therapy', 'score': 0.9398102164268494}
---result {'Name': 'Volvulus', 'Symptoms': 'Abdominal pain, nausea, vomiting, bloating, constipation, bloody stool', 'Trea

'Name: Peritonitis, Symptoms: Severe abdominal pain, tenderness, bloating, fever, nausea, vomiting, loss of appetite, increased heart rate, Treatments: Intravenous antibiotics to treat the infection, intravenous fluids to restore hydration, drainage or removal of any underlying fluid or abscess, surgery to repair or remove the source of infection\nName: Diverticulosis, Symptoms: Abdominal pain, bloating, changes in bowel movements, Treatments: High-fiber diet, medications (pain relievers, antibiotics)\nName: Indigestion, Symptoms: Abdominal pain, bloating, heartburn, nausea, acidic taste, Treatments: Lifestyle changes (e.g., dietary modifications, stress reduction), over-the-counter antacids, medication, therapy\nName: Volvulus, Symptoms: Abdominal pain, nausea, vomiting, bloating, constipation, bloody stool, Treatments: Emergency surgery to untwist the affected organ and restore blood flow\n'

In [67]:
# Build the prompt for the language model: the user's question followed by the
# retrieved disease records
combined_information = f"Query: {query} \n {source_information}."

print(combined_information)

Query: I have some symptoms such as severe abdominal pain, increased heart rate and bloating. How can I treat that? 
 Name: Peritonitis, Symptoms: Severe abdominal pain, tenderness, bloating, fever, nausea, vomiting, loss of appetite, increased heart rate, Treatments: Intravenous antibiotics to treat the infection, intravenous fluids to restore hydration, drainage or removal of any underlying fluid or abscess, surgery to repair or remove the source of infection
Name: Diverticulosis, Symptoms: Abdominal pain, bloating, changes in bowel movements, Treatments: High-fiber diet, medications (pain relievers, antibiotics)
Name: Indigestion, Symptoms: Abdominal pain, bloating, heartburn, nausea, acidic taste, Treatments: Lifestyle changes (e.g., dietary modifications, stress reduction), over-the-counter antacids, medication, therapy
Name: Volvulus, Symptoms: Abdominal pain, nausea, vomiting, bloating, constipation, bloody stool, Treatments: Emergency surgery to untwist the affected organ and r

In [19]:
# Authenticate with the Hugging Face Hub (shows an interactive login widget).
# NOTE(review): presumably needed to download the google/gemma-2-2b-it weights
# loaded in the next cell — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and model are loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
# CPU Enabled uncomment below 👇🏽
# model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it")
# GPU Enabled use below 👇🏽
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it", device_map="auto")

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [68]:
# Tokenize the prompt and move the tensors to whichever device the model was
# loaded on, so this cell works with both the GPU and the CPU loading option
# instead of failing on machines without CUDA.
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
# Generate at most 500 new tokens; the decoded text includes the prompt.
response = model.generate(**input_ids, max_new_tokens=500)
print(tokenizer.decode(response[0]))

<bos>Query: I have some symptoms such as severe abdominal pain, increased heart rate and bloating. How can I treat that? 
 Name: Peritonitis, Symptoms: Severe abdominal pain, tenderness, bloating, fever, nausea, vomiting, loss of appetite, increased heart rate, Treatments: Intravenous antibiotics to treat the infection, intravenous fluids to restore hydration, drainage or removal of any underlying fluid or abscess, surgery to repair or remove the source of infection
Name: Diverticulosis, Symptoms: Abdominal pain, bloating, changes in bowel movements, Treatments: High-fiber diet, medications (pain relievers, antibiotics)
Name: Indigestion, Symptoms: Abdominal pain, bloating, heartburn, nausea, acidic taste, Treatments: Lifestyle changes (e.g., dietary modifications, stress reduction), over-the-counter antacids, medication, therapy
Name: Volvulus, Symptoms: Abdominal pain, nausea, vomiting, bloating, constipation, bloody stool, Treatments: Emergency surgery to untwist the affected organ 