In [1]:
! pip install llama-index==0.12.9 llama-index-llms-ollama==0.5.0 llama-index-embeddings-huggingface==0.4.0 llama-index-llms-groq==0.3.1 tf-keras



In [2]:
import os

In [3]:
import nest_asyncio

# Patch asyncio so event loops can be nested.
# NOTE(review): presumably required because the summary query engine is later
# created with use_async=True inside an already-running notebook loop — confirm.
nest_asyncio.apply()

In [4]:
from llama_index.core import SimpleDirectoryReader

In [5]:
# Load the source PDF; load_data() returns a list of Document objects.
documents = SimpleDirectoryReader(
    input_files=["The Emperor of All Maladies_ A Biography of Cancer final.pdf"],
).load_data()

incorrect startxref pointer(1)
parsing for Object Streams


In [6]:
# Sanity check: show the container type returned by the loader.
print(type(documents))

<class 'list'>


In [7]:
# Number of Document objects produced by the loader.
len(documents)

449

In [8]:
# Inspect the first Document (id, metadata, text resource, templates).
documents[0]

Document(id_='ac8f3cef-c7cf-416c-a407-efac486afa86', embedding=None, metadata={'page_label': '1', 'file_name': 'The Emperor of All Maladies_ A Biography of Cancer final.pdf', 'file_path': 'The Emperor of All Maladies_ A Biography of Cancer final.pdf', 'file_type': 'application/pdf', 'file_size': 6201653, 'creation_date': '2025-03-31', 'last_modified_date': '2025-03-31'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}')

In [9]:
from llama_index.core.node_parser import SentenceSplitter

In [10]:
# Splitter that turns Documents into chunks of at most `chunk_size` with
# `chunk_overlap` shared between neighbouring chunks.
# NOTE(review): 2000 is large compared with the input limits of small
# sentence-embedding models — confirm the embedding models used later do not
# silently truncate these chunks.
splitter = SentenceSplitter(chunk_size=2000, chunk_overlap=20)

In [11]:
# Split every loaded Document into nodes (chunks) using the splitter configured above.
nodes = splitter.get_nodes_from_documents(documents)

In [12]:
from sentence_transformers import SentenceTransformer

# Sentence Transformer used for the standalone embeddings that are pickled later.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Raw chunk texts; node_texts[i] corresponds to node_embeddings[i].
node_texts = [node.text for node in nodes]

# Encode all chunks with ONE batched encode() call instead of one call per
# chunk: same model, far fewer forward passes. For a list input encode() returns
# a single stacked tensor, so it is split back into one 1-D tensor per chunk to
# keep node_embeddings a list as before. clone() gives every row its own storage
# so that pickling one element does not serialize the whole matrix.
node_embeddings = [
    row.clone()
    for row in embedding_model.encode(node_texts, convert_to_tensor=True)
]

  from .autonotebook import tqdm as notebook_tqdm





In [13]:
# Sanity check: show the container type returned by the splitter.
print(type(nodes))

<class 'list'>


In [14]:
from llama_index.core import Settings
#from llama_index.llms.ollama import Ollama
from llama_index.llms.groq import Groq
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [15]:
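# Optional: use a local Ollama model instead of Groq
# (also requires un-commenting the Ollama import in the imports cell).
# Settings.llm = Ollama(model="gemma:2b")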

In [16]:
# Never hardcode the key in the notebook: reuse GROQ_API_KEY when the
# environment already provides it (the previous unconditional assignment
# overwrote a real key with a placeholder), and otherwise prompt for it
# without echoing the input.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

In [17]:
# Register the Groq-hosted "llama-3.1-8b-instant" model as the default LLM in
# llama-index's global Settings.
Settings.llm = Groq(model="llama-3.1-8b-instant")

In [18]:
# Register a HuggingFace embedding model as the global default embedder.
# NOTE(review): no model name is passed, so the library's default model is
# used; it may differ from the "all-MiniLM-L6-v2" model used for
# node_embeddings earlier — confirm whether the two are meant to match.
Settings.embed_model = HuggingFaceEmbedding()

In [19]:
from llama_index.core import SummaryIndex, VectorStoreIndex

In [20]:
# Build two indexes over the same nodes: one intended for summarization and one
# for similarity-based retrieval.
summary_index = SummaryIndex(nodes)
vector_index = VectorStoreIndex(nodes)

In [21]:
# Engine over the summary index: answers are synthesized with the
# "tree_summarize" response mode, with async calls enabled.
summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True)

# Engine over the vector index, using the library defaults.
vector_query_engine = vector_index.as_query_engine()

In [22]:
from llama_index.core.tools import QueryEngineTool

In [23]:
# Wrap each query engine as a tool. The description text is the only
# information given here about when each tool should be chosen.
summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description="Useful for summarization related to the  given context",
)
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description="Useful for retrieving specific context from the given context based on the given question",
)

In [24]:
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

In [25]:
# Router that asks an LLM-based single selector to pick exactly one of the two
# tools per query; verbose=True prints the selection and its reasoning.
query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=[summary_tool, vector_tool],
    verbose=True,
)

In [26]:
# Ask a definition-style question through the router and print the answer.
response = query_engine.query("What is hypertrophy?")
print(response)

[1;3;38;5;200mSelecting query engine 1: The question 'What is hypertrophy?' is asking for a specific definition or explanation of a term, which is more relevant to retrieving specific context from the given context based on the given question..
[0mHypertrophy refers to the growth and increase in size of cells, tissues, or organs, often resulting from an increase in the size of the cells themselves, rather than an increase in the number of cells. This can occur in various parts of the body, such as muscle tissue, where it is a common adaptation to exercise and resistance training.


In [27]:
# Same check for a second term; `response` is overwritten with the new answer.
response = query_engine.query("What is metastasis?")
print(response)

[1;3;38;5;200mSelecting query engine 1: The question 'What is metastasis?' is asking for a specific definition or explanation of metastasis, which requires retrieving specific context from the given context based on the question..
[0mMetastasis refers to the process by which cancer cells spread from the original site of a tumor to other parts of the body, often forming new tumors. This can occur through various mechanisms, including the bloodstream or lymphatic system.


In [28]:
# Broad question about the book's main subject, routed like the others.
response = query_engine.query("What is cancer?")
print(response)

[1;3;38;5;200mSelecting query engine 1: The question 'What is cancer?' is asking for a specific definition or explanation of cancer, which is a retrieval task..
[0mCancer is a complex and multifaceted disease that affects millions of people worldwide. It is characterized by the uncontrolled growth and spread of abnormal cells in the body, which can invade and damage surrounding tissues and organs. This process can lead to a wide range of symptoms, from mild to severe, and can be life-threatening if left untreated.


In [29]:
# Probe with another short definition question.
response = query_engine.query("What is ATP?")
print(response)

[1;3;38;5;200mSelecting query engine 1: The question 'What is ATP?' is asking for a specific piece of information, which is a definition. This requires retrieving specific context from the given context based on the question..
[0mATP, or Adenosine Triphosphate, is a molecule that serves as the primary energy currency of the cell. It is a high-energy compound that stores energy in the form of phosphate bonds, which can be broken down to release energy for various cellular processes.


In [30]:
# Question about a specific cancer type.
response = query_engine.query("What is liver cancer?")
print(response)

[1;3;38;5;200mSelecting query engine 1: The question 'What is liver cancer?' is asking for specific information about liver cancer, which requires retrieving a specific context from the given context based on the question..
[0mLiver cancer is a type of cancer that originates in the liver, which is a vital organ responsible for various bodily functions such as detoxification, metabolism, and production of essential proteins. It can be caused by a combination of genetic and environmental factors, including viral infections, exposure to toxins, and certain lifestyle choices.


In [31]:
import torch
import pickle

# Persist the chunk embeddings and texts so they can be reused outside this
# notebook. Tensors are moved to the CPU first: a pickled CUDA tensor records
# its device, and loading it on a machine without a GPU fails. (.cpu() returns
# the tensor unchanged when it already lives on the CPU.)
# NOTE: only unpickle these files from a trusted source — pickle.load can
# execute arbitrary code.
with open("node_embeddings.pkl", "wb") as f:
    pickle.dump([embedding.cpu() for embedding in node_embeddings], f)

with open("node_texts.pkl", "wb") as f:
    pickle.dump(node_texts, f)

In [32]:
from IPython.display import clear_output
import json
from datetime import datetime

def interactive_query(engine=None, json_file="query_history.json"):
    """Run an interactive question/answer loop and persist the history as JSON.

    Commands (case-insensitive, surrounding whitespace ignored):
      * ``exit``    - leave the loop.
      * ``history`` - print every stored query/response pair.
    Any other non-empty input is sent to the query engine; blank input is
    ignored.

    The history file is rewritten when the loop ends for ANY reason (normal
    exit, an exception raised by the engine, or an interrupt), so answers
    collected during the session are not lost.

    Args:
        engine: Object exposing ``query(str)``. Defaults to the notebook-level
            ``query_engine``, looked up only when a question is actually asked.
        json_file: Path of the JSON file that stores the history list.
    """
    # Load existing history; start fresh if the file is missing or not valid JSON.
    try:
        with open(json_file, "r", encoding="utf-8") as f:
            history = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        history = []

    try:
        while True:
            try:
                query = input("Enter your question about cancer (or type 'exit' to quit, 'history' to see past queries): ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or an interrupt at the prompt is treated as 'exit'.
                print("Exiting interactive mode.")
                break

            if not query:
                # Nothing typed: do not waste an LLM call on an empty question.
                continue

            command = query.lower()

            if command == 'exit':
                print("Exiting interactive mode.")
                break

            if command == 'history':
                if not history:
                    print("No query history yet.")
                else:
                    print("\n--- Query History ---")
                    for entry in history:
                        print(f"[{entry['timestamp']}] {entry['query']} → {entry['response']}\n")
                continue

            print("Searching for answer...")
            # Resolve the default engine lazily so the notebook-level global is
            # only required when a question is actually asked.
            active_engine = query_engine if engine is None else engine
            response = active_engine.query(query)

            # Record the query/response pair with a timestamp.
            history.append({
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "query": query,
                "response": str(response)
            })

            clear_output(wait=True)
            print(f"Question: {query}")
            print(f"Answer: {response}")
            print("\n---\n")
    finally:
        # Always write the history back, even when the loop ended with an error.
        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(history, f, indent=4)

        print(f"✅ Query history updated and saved to '{json_file}'")

# Start the interactive session; the call returns once the user types 'exit'.
interactive_query()

Exiting interactive mode.
✅ Query history updated and saved to 'query_history.json'


In [33]:
! pip install rouge-score sentence-transformers

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py): started
  Building wheel for rouge-score (setup.py): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=25026 sha256=a4f91908207221410b359a1ece262717134e50c5c2c3e7627a151952629cb836
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [34]:
import numpy as np
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

# Load SentenceTransformer model for embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")

# Sample test data: User queries and expected responses
test_data = [
    {"query": "What is hypertrophy?", "expected_response": "Hypertrophy is the increase in the size of cells, leading to the growth of an organ or tissue."},
    {"query": "What is metastasis?", "expected_response": "Metastasis is the spread of cancer cells to distant parts of the body."},
    {"query": "What is cancer?", "expected_response": "Cancer is a disease caused by uncontrolled cell division."},
    {"query": "What is ATP?", "expected_response": "ATP (Adenosine Triphosphate) is the primary energy carrier in cells."},
    {"query": "What is liver cancer?", "expected_response": "Liver cancer is a type of cancer that begins in the liver cells."}
]

# Initialize ROUGE scorer (unigram overlap and longest common subsequence)
rouge_scorer_obj = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

# Phrases that mark a non-answer; compared against the lower-cased response so
# "i'm sorry" / "I'M SORRY" are detected as well.
fallback_markers = ("i don't know", "i'm sorry")

cosine_similarities = []
rouge_scores = []      # ROUGE-1 F1 per test item
rouge_l_scores = []    # ROUGE-L F1 per test item (previously computed but discarded)
fallback_count = 0

# Query chatbot and evaluate responses
for item in test_data:
    chatbot_response = query_engine.query(item["query"])
    # Convert the response object to text once and reuse it for every metric.
    response_text = str(chatbot_response)

    # Cosine similarity between the answer and the reference in embedding space
    chatbot_embedding = model.encode(response_text, convert_to_tensor=True)
    expected_embedding = model.encode(item["expected_response"], convert_to_tensor=True)
    cosine_similarity = util.pytorch_cos_sim(chatbot_embedding, expected_embedding).item()
    cosine_similarities.append(cosine_similarity)

    # ROUGE: score(target, prediction) -> the reference goes first
    rouge = rouge_scorer_obj.score(item["expected_response"], response_text)
    rouge_scores.append(rouge["rouge1"].fmeasure)
    rouge_l_scores.append(rouge["rougeL"].fmeasure)

    # Check if chatbot failed to answer (fallback)
    lowered_response = response_text.lower()
    if any(marker in lowered_response for marker in fallback_markers):
        fallback_count += 1

# Print Evaluation Results
print(f"\n✅ Average Cosine Similarity: {np.mean(cosine_similarities):.4f}")
print(f"✅ Average ROUGE-1 F1: {np.mean(rouge_scores):.4f}")
print(f"✅ Average ROUGE-L F1: {np.mean(rouge_l_scores):.4f}")
print(f"⚠️ Fallback Rate: {fallback_count / len(test_data) * 100:.2f}% ({fallback_count}/{len(test_data)})")

[1;3;38;5;200mSelecting query engine 1: The question 'What is hypertrophy?' is asking for a specific definition, which is a retrieval task..
[0m[1;3;38;5;200mSelecting query engine 1: The question 'What is metastasis?' is a specific query that requires a precise answer, which aligns with the second choice for retrieving specific context from the given context based on the given question..
[0m[1;3;38;5;200mSelecting query engine 1: The question 'What is cancer?' is a specific query that requires a direct answer, which is a characteristic of retrieving specific context from the given context based on the given question..
[0m[1;3;38;5;200mSelecting query engine 1: The question 'What is ATP?' is asking for a specific piece of information, which is a definition. This requires retrieving specific context from the given context based on the question..
[0m[1;3;38;5;200mSelecting query engine 1: The question 'What is liver cancer?' is asking for specific information about liver cancer,