In [1]:
from bs4 import BeautifulSoup, Tag
import os

class StructuredXMLLoader:
    def __init__(self, file_path):
        with open(file_path, 'r') as file:
            content = file.read()
        self.soup = BeautifulSoup(content, 'lxml')

    def load(self):
        return self.parse_element(self.soup)

    def parse_element(self, element):
        if isinstance(element, Tag):
            if element.contents:
                nested = [self.parse_element(child) for child in element.contents if not isinstance(child, str)]
                if nested:
                    return {element.name: nested if len(nested) > 1 else nested[0]}
                return {element.name: element.text.strip()}
            else:
                return {element.name: element.text.strip()}
        return {}

directory = "./MedQuAD-master/1_CancerGov_QA"
items = os.listdir(directory)
files = [directory+'/'+item for item in items if os.path.isfile(os.path.join(directory, item))]
all_data = []
for file in files:
    loader = StructuredXMLLoader(file)
    data = loader.load()
    all_data.append(data)
#data
#data['[document]']['html']['body']['document'][2]['qapairs']



In [2]:
from langchain.docstore.document import Document
def extract_qapairs(document):
    qapairs = []
    # Navigate through the nested structure to the 'qapairs' list
    for item in document['[document]']['html']['body']['document']:
        if 'focus' in item:
            src = item['focus']
            
        if 'qapairs' in item:
            for qapair in item['qapairs']:
                try:
                    question = qapair['qapair'][0]['question']
                    answer = qapair['qapair'][1]['answer']
                    qapairs.append(Document(page_content=question+answer, metadata={"source": src}))
                except:
                    pass
    return qapairs

qapairs = []
for data in all_data:
    qapairs.extend(extract_qapairs(data))

In [3]:
qapairs[0]

Document(page_content='What is (are) Adult Acute Lymphoblastic Leukemia ?Key Points\n                    - Adult acute lymphoblastic leukemia (ALL) is a type of cancer in which the bone marrow makes too many lymphocytes (a type of white blood cell).    - Leukemia may affect red blood cells, white blood cells, and platelets.    - Previous chemotherapy and exposure to radiation may increase the risk of developing ALL.    - Signs and symptoms of adult ALL include fever, feeling tired, and easy bruising or bleeding.     - Tests that examine the blood and bone marrow are used to detect (find) and diagnose adult ALL.    - Certain factors affect prognosis (chance of recovery) and treatment options.\n                \n                \n                    Adult acute lymphoblastic leukemia (ALL) is a type of cancer in which the bone marrow makes too many lymphocytes (a type of white blood cell).\n                    Adult acute lymphoblastic leukemia (ALL; also called acute lymphocytic leukemi

In [4]:
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.memory import ConversationSummaryBufferMemory
from langchain.chains import ConversationChain

In [5]:
def load_db(qapairs, chain_type, k):
    documents = qapairs
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)
    
    embeddings = OpenAIEmbeddings()
    
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
    
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    
    llm_name = "gpt-3.5-turbo"
    qa = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=llm_name, temperature=0), 
        chain_type=chain_type, 
        retriever=retriever, 
        return_source_documents=True,
        return_generated_question=True,
    )
    return qa

In [6]:
qa = load_db(qapairs, "refine", 4)



In [7]:
chat_history = []
query = "What can be the stages of Gastrointestinal Stromal Tumors in summary?"
result = qa({"question": query, "chat_history": chat_history})

  warn_deprecated(


In [8]:
print(result["answer"])

The stages of Gastrointestinal Stromal Tumors (GIST) can be summarized as follows:

1. Stage I: The tumor is small and localized, with no spread to nearby lymph nodes or distant sites.
2. Stage II: The tumor is larger and may have grown into nearby tissues, but has not spread to lymph nodes or distant sites.
3. Stage III: The tumor has invaded nearby tissues and may have spread to nearby lymph nodes, but has not metastasized to distant sites.
4. Stage IV: The tumor has metastasized to distant sites in the body, such as the liver, lungs, or bones.

These stages help determine the extent of the cancer and guide treatment decisions, taking into account factors such as genetic predisposition, symptoms like blood in the stool or vomit, and the size and location of the tumor within the gastrointestinal tract. Treatment options for GIST include surgery, targeted therapy, watchful waiting, and supportive care. Patients may also consider participating in clinical trials to access new types of t

In [9]:
result['source_documents']

[Document(page_content='What are the stages of Gastrointestinal Stromal Tumors ?Key Points\n                    - After a gastrointestinal stromal tumor has been diagnosed, tests are done to find out if cancer cells have spread within the gastrointestinal tract or to other parts of the body.    - There are three ways that cancer spreads in the body.    - Cancer may spread from where it began to other parts of the body.    - The results of diagnostic and staging tests are used to plan treatment.\n                \n                \n                    After a gastrointestinal stromal tumor has been diagnosed, tests are done to find out if cancer cells have spread within the gastrointestinal tract or to other parts of the body.', metadata={'source': 'Gastrointestinal Stromal Tumors'}),
 Document(page_content='What is (are) Gastrointestinal Stromal Tumors ?Key Points\n                    - Gastrointestinal stromal tumor is a disease in which abnormal cells form in the tissues of the gastr

In [10]:
chat_history = []
while True:
    query = "What can be the stages of Gastrointestinal Stromal Tumors in summary?"
    result = qa({"question": query, "chat_history": chat_history})
    chat_history.append(query + '\n' + result["answer"])
    print(chat_history)
    break

['What can be the stages of Gastrointestinal Stromal Tumors in summary?\nThe stages of Gastrointestinal Stromal Tumors can be summarized as follows:\n\n1. Stage I: The tumor is small and localized, with no spread to nearby lymph nodes or distant sites.\n2. Stage II: The tumor is larger and may have grown into nearby tissues, but has not spread to lymph nodes or distant sites.\n3. Stage III: The tumor has invaded nearby tissues and may have spread to nearby lymph nodes, but has not metastasized to distant sites.\n4. Stage IV: The tumor has metastasized to distant sites in the body, such as the liver, lungs, or bones.\n\nThese stages help determine the extent of the cancer and guide treatment decisions, taking into account factors such as tumor size, invasion of nearby tissues, lymph node involvement, and distant metastasis. Treatment options for Gastrointestinal Stromal Tumors include surgery, targeted therapy, watchful waiting, and supportive care. Additionally, new types of treatment 

In [18]:
#

In [83]:
class QA:
    def __init__(self):
        self.llm_name = "gpt-3.5-turbo"
        self.llm = ChatOpenAI(model_name=self.llm_name, temperature=0)
        self.memory = ConversationSummaryBufferMemory(llm=self.llm, max_token_limit=100, memory_key='chat_history', output_key='answer')
        
    def load_db(self, qapairs, chain_type, k):
        documents = qapairs
    
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        docs = text_splitter.split_documents(documents)
        
        embeddings = OpenAIEmbeddings()
        
        db = DocArrayInMemorySearch.from_documents(docs, embeddings)
        
        retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
        
        self.qa = ConversationalRetrievalChain.from_llm(
            llm=self.llm, 
            chain_type=chain_type, 
            retriever=retriever, 
            return_source_documents=True,
            return_generated_question=True, 
            memory = self.memory, 
            get_chat_history = lambda h : h
        )

    def run(self, query):
        result = self.qa({"question": query})
        #print(result)
        #self.memory.save_context({"input": query}, 
        #                         {"output": result["answer"]})
        return result

In [84]:
qa = QA()
qa.load_db(qapairs, "stuff", 4)

In [85]:
query = "What can be the stages of Gastrointestinal Stromal Tumors in summary?"
qa.run(query)

{'question': 'What can be the stages of Gastrointestinal Stromal Tumors in summary?',
 'chat_history': '',
 'answer': 'The stages of Gastrointestinal Stromal Tumors are determined after diagnosis through tests to find out if cancer cells have spread within the gastrointestinal tract or to other parts of the body. The stages help in planning the treatment.',
 'source_documents': [Document(page_content='What are the stages of Gastrointestinal Stromal Tumors ?Key Points\n                    - After a gastrointestinal stromal tumor has been diagnosed, tests are done to find out if cancer cells have spread within the gastrointestinal tract or to other parts of the body.    - There are three ways that cancer spreads in the body.    - Cancer may spread from where it began to other parts of the body.    - The results of diagnostic and staging tests are used to plan treatment.\n                \n                \n                    After a gastrointestinal stromal tumor has been diagnosed, tes

In [86]:
qa.run('What can you tell me about lung cancer?')

{'question': 'What can you tell me about lung cancer?',
 'chat_history': 'Human: What can be the stages of Gastrointestinal Stromal Tumors in summary?\nAI: The stages of Gastrointestinal Stromal Tumors are determined after diagnosis through tests to find out if cancer cells have spread within the gastrointestinal tract or to other parts of the body. The stages help in planning the treatment.',
 'answer': 'Lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung. It is the leading cause of cancer death in the United States for both men and women. There are two main types of lung cancer: non-small cell lung cancer and small cell lung cancer. Various factors can increase or decrease the risk of developing lung cancer.',
 'source_documents': [Document(page_content='What is (are) Lung Cancer ?Key Points\n                    - Lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung.    - Lung cancer is the leading cause of 

In [87]:
qa.run('What did I ask you about before?')

{'question': 'What did I ask you about before?',
 'chat_history': 'System: The human asks about the stages of Gastrointestinal Stromal Tumors. The AI explains that the stages are determined after diagnosis through tests to see if cancer cells have spread within the gastrointestinal tract or to other parts of the body, and that the stages help in planning the treatment.\nHuman: What can you tell me about lung cancer?\nAI: Lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung. It is the leading cause of cancer death in the United States for both men and women. There are two main types of lung cancer: non-small cell lung cancer and small cell lung cancer. Various factors can increase or decrease the risk of developing lung cancer.',
 'answer': 'You asked about where in the body the tumor recurred, how much time passed between the end of cancer treatment and when the cancer recurred, and whether the tumor was treated with radiation therapy.',
 'source_d

In [88]:
qa.run('tell me about brain tumor?')

{'question': 'tell me about brain tumor?',
 'chat_history': 'System: The human asks about the stages of Gastrointestinal Stromal Tumors. The AI explains that the stages are determined after diagnosis through tests to see if cancer cells have spread within the gastrointestinal tract or to other parts of the body, and that the stages help in planning the treatment. The human then asks about lung cancer, and the AI explains that it is a leading cause of cancer death in the United States with two main types: non-small cell lung cancer and small cell lung cancer. Various factors can increase or decrease the risk of developing lung cancer.\nHuman: What did I ask you about before?\nAI: You asked about where in the body the tumor recurred, how much time passed between the end of cancer treatment and when the cancer recurred, and whether the tumor was treated with radiation therapy.',
 'answer': "There are different types of brain tumors, including astrocytic tumors. The treatment options for a

In [89]:
# implement ground truth and a template for the model to generate based on the documents and do not generate from yourself