In [None]:
%%capture
!pip install langchain langchain-openai langchainhub
!pip install ctransformers sentence-transformers langchain-chroma
!pip install pandas nltk spacy PyPDF
%pip install --upgrade --quiet  sentence-transformers langchain-chroma langchain langchain-openai > /dev/null

In [None]:
# Mount Google Drive so the notebook can read the files stored there.
from google.colab import drive
drive.mount("/content/drive")
# Directory that the PDF loader cell below scans for "*.pdf" files.
path="/content/drive/MyDrive/Colab Notebooks/nlp/data/book"

Mounted at /content/drive


In [None]:
# The loaders live in langchain_community; `langchain.document_loaders` is only
# a deprecated re-export of the same classes.
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# Load every PDF found directly under `path` into a flat list of documents.
loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [None]:
def partition_doc():
    """Partition the merged PDF into elements and summarise its tables.

    The PDF at ``DATA_PATH + "/docmerged.pdf"`` is chunked by title, the
    resulting elements are bucketed into tables and composite text chunks
    according to their type name, and every table is summarised with
    ``gpt-3.5-turbo``. Text chunks are returned unchanged (not summarised).

    NOTE(review): ``partition_pdf`` and ``DATA_PATH`` are not defined in any
    visible cell, and ``ChatPromptTemplate``, ``ChatOpenAI`` and
    ``StrOutputParser`` are only imported in later cells -- confirm all of
    them are in scope before calling this function.

    Returns:
        tuple: ``(table_summaries, texts)`` -- the model-written table
        summaries and the text chunks as plain strings.
    """
    elements = partition_pdf(
        filename= DATA_PATH + "/docmerged.pdf",
        extract_images_in_pdf=False,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=6000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
    )

    # Bucket each element by the class path that shows up in its type repr;
    # anything that is neither a table nor a composite element is ignored.
    tables, texts = [], []
    for element in elements:
        type_repr = str(type(element))
        if "unstructured.documents.elements.Table" in type_repr:
            tables.append(str(element))
        elif "unstructured.documents.elements.CompositeElement" in type_repr:
            texts.append(str(element))

    print(len(tables), " ", len(texts))

    # Prompt -> chat model -> plain string, fed one raw chunk at a time.
    summary_prompt = ChatPromptTemplate.from_template(
        """You are an assistant tasked with summarizing tables and text. \
    Give a concise summary of the table or text. Table or text chunk: {element} """
    )
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
    summarize_chain = (
        {"element": lambda chunk: chunk} | summary_prompt | llm | StrOutputParser()
    )

    # Only the tables are summarised; at most five requests run concurrently.
    return summarize_chain.batch(tables, {"max_concurrency": 5}), texts

In [None]:
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    Pipeline:
      1. Parse ``text`` with ``nlp`` and keep the lower-cased surface form of
         every token whose POS tag is not ``SPACE``/``X`` and whose dependency
         label is not ``det``/``punct``.
      2. Drop English stopwords.
      3. Re-parse the remaining words and return their lemmas joined by spaces.

    NOTE(review): relies on the module-level names ``nlp`` (called as a spaCy
    pipeline) and ``stopwords`` (used like ``nltk.corpus.stopwords``); neither
    is imported or created in the visible cells -- confirm they are in scope.

    Args:
        text: The raw string to clean.

    Returns:
        str: The lemmatised text; an empty string if nothing survives filtering.
    """
    # Filter on POS tag and dependency label, lower-casing what is kept.
    kept_words = [
        token.text.lower()
        for token in nlp(text)
        if token.pos_ not in ("SPACE", "X") and token.dep_ not in ("det", "punct")
    ]

    # Fetch the stopword list once per call and use a set for O(1) membership;
    # looking it up inside the comprehension re-fetched the list for every token.
    english_stopwords = set(stopwords.words('english'))
    kept_words = [word for word in kept_words if word not in english_stopwords]

    # Lemmas are taken from a second parse of the already-reduced text.
    return " ".join(token.lemma_ for token in nlp(" ".join(kept_words)))

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Clean every loaded document in place. Note that this overwrites the raw
# page text, so re-running this cell preprocesses already-preprocessed text.
for doc in documents:
  doc.page_content = preprocess_text(doc.page_content)

# Split the preprocessed documents into chunks of at most 1000 characters
# (measured with len) that overlap by 100 characters; add_start_index asks the
# splitter to record where each chunk starts in its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)
chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

Split 4762 documents into 38997 chunks.


In [None]:
# Configure API credentials, then open the persisted Chroma vector store.

import os
import getpass
# Keys are read interactively so they never appear in the notebook source.
os.environ['OPENAI_API_KEY'] = getpass.getpass('Enter your OpenAI API key:')
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass('Enter your LANGCHAIN API key:')

from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Index build step (disabled): embeds `chunks` and writes them to the persist directory.
# db = Chroma.from_documents(chunks, OpenAIEmbeddings(), persist_directory="./drive/MyDrive/Colab Notebooks/nlp/chroma_db-v(Harrison-mehta)")
# Live path: open the store in that directory without re-embedding anything.
# NOTE(review): presumably the directory was populated by an earlier run of the
# line above -- confirm. The path is relative, so it depends on the working directory.
db = Chroma(embedding_function=OpenAIEmbeddings(), persist_directory="./drive/MyDrive/Colab Notebooks/nlp/chroma_db-v(Harrison-mehta)")

In [None]:
import pandas as pd

# Labelled evaluation set read from Drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp/data/disease symptom prediction/testset.csv')
# Only the last expression of a cell is rendered, so a bare `df.head(10)` in
# the middle of the cell displayed nothing; the schema summary is the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Disease   4920 non-null   object
 1   Symptoms  4920 non-null   object
dtypes: object(2)
memory usage: 77.0+ KB


In [None]:
# Evaluate on a small subset to limit API calls. The sample is seeded so the
# same rows (and therefore the same accuracy figure) come back on every run,
# and capped at len(df) so re-running the cell or a short test set cannot raise.
df = df.sample(n=min(10, len(df)), random_state=42)
print(df.head(10))

                  Disease                                           Symptoms
4287          Hepatitis E   joint_pain, vomiting, fatigue, high_fever, ye...
4135            Arthritis   muscle_weakness, stiff_neck, swelling_joints,...
3585              Typhoid   chills, vomiting, fatigue, high_fever, headac...
68                   AIDS   muscle_wasting, high_fever, extra_marital_con...
4904         Tuberculosis   chills, vomiting, fatigue, weight_loss, cough...
1696  Peptic ulcer diseae   vomiting, indigestion, loss_of_appetite, abdo...
4102                 GERD   stomach_pain, acidity, ulcers_on_tongue, vomi...
1073         Tuberculosis   chills, fatigue, weight_loss, cough, high_fev...
4094            Arthritis   muscle_weakness, stiff_neck, swelling_joints,...
3930            Arthritis   muscle_weakness, stiff_neck, swelling_joints,...


In [None]:
# ChatPromptTemplate is defined in langchain_core (already a dependency of this
# notebook); importing it from there avoids the legacy `langchain.prompts` path.
from langchain_core.prompts import ChatPromptTemplate

# Answer prompt: the model gets the retrieved passages as {context} and the
# user's query as {question}, and is told to admit when it does not know.
PROMPT_TEMPLATE = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""
prompt=ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
# Bare name as the last expression: displays the template's repr as cell output.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="\nUse the following pieces of information to answer the user's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nContext: {context}\nQuestion: {question}\n\nOnly return the helpful answer below and nothing else.\nHelpful answer:\n"))])

In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain_community.document_transformers import (
    LongContextReorder,
)
# Interactive retrieval-augmented Q&A loop; type 'quit' to leave it.
# None of these objects carries per-query state, so build them once instead
# of once per question.
reordering = LongContextReorder()
model = ChatOpenAI()
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

while True:
    query_text = input("Enter your query (type 'quit' to exit): ")

    if query_text.lower() == 'quit':
        break

    # Search the DB: yields (document, relevance score) pairs.
    results = db.similarity_search_with_relevance_scores(query_text, k=7)

    # Gate on the best score BEFORE reordering. LongContextReorder pushes the
    # strongest hits to both ends of the list, so element 0 of the reordered
    # list is not guaranteed to be the top hit (it is not when the number of
    # hits is even).
    if not results or max(score for _doc, score in results) < 0.7:
        print(f"Unable to find matching results.")
        continue

    # Reorder documents so the most relevant ones sit at the edges of the context.
    reorder_docs = reordering.transform_documents(results)

    # Build the context from this query's hits only. Appending every query's
    # passages to one ever-growing string fed stale, unrelated passages to
    # later questions and would eventually overflow the model's context window.
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in reorder_docs)
    filled_prompt = prompt_template.format(context=context_text, question=query_text)

    # ChatOpenAI returns a message object; print its text rather than its repr.
    response_text = model.invoke(filled_prompt).content

    # Report which source files the context passages came from.
    sources = [doc.metadata.get("source", None) for doc, _score in reorder_docs]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

In [None]:
from langchain.prompts import ChatPromptTemplate
PROMPT_TEMPLATE_COMPARE = """
given you are 2 answer one is actual disease and another one is predicted disease sentence
your task is to give ans in 0 or 1
where 0 is both actual_disease and predicted_disease_sentence are different
where 1 is both actual_disease and predicted_disease_sentence are similar
actual ans : {actual_disease}
predicted ans : {predicted_disease_sentence}
Only return the answer in 0 or 1
"""
# Build the 0/1 comparison prompt from the template above; the bare name on the
# last line displays the resulting object's repr as the cell output.
prompt_compare=ChatPromptTemplate.from_template(PROMPT_TEMPLATE_COMPARE)
prompt_compare

ChatPromptTemplate(input_variables=['actual_disease', 'predicted_disease_sentence'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['actual_disease', 'predicted_disease_sentence'], template='\ngiven you are 2 answer one is actual disease and another one is predicted disease sentence \nyour task is to give ans in 0 or 1 \nwhere 0 is both actual_disease and predicted_disease_sentence are different\nwhere 1 is both actual_disease and predicted_disease_sentence are similar\nactual ans : {actual_disease}\npredicted ans : {predicted_disease_sentence}\nOnly return the answer in 0 or 1\n'))])

In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_transformers import LongContextReorder

# Single-shot retrieval + answer generation, used by the evaluation cell.
def search(query_text, k=7, score_threshold=0.7):
    """Answer ``query_text`` from the vector store, or return ``None``.

    Retrieves the ``k`` most similar chunks from ``db``, reorders them with
    ``LongContextReorder``, fills ``PROMPT_TEMPLATE`` with them and asks the
    chat model for an answer.

    Args:
        query_text: The question (here: a comma-separated symptom list).
        k: How many chunks to retrieve. Defaults to 7.
        score_threshold: Minimum relevance score the best hit must reach for
            the retrieval to count as a match. Defaults to 0.7.

    Returns:
        str | None: The model's answer as plain text, or ``None`` (after
        printing a notice) when nothing sufficiently relevant was retrieved.
    """
    # Search the DB: yields (document, relevance score) pairs.
    results = db.similarity_search_with_relevance_scores(query_text, k=k)

    # Gate on the best score BEFORE reordering. LongContextReorder pushes the
    # strongest hits to both ends of the list, so element 0 of the reordered
    # list is not guaranteed to be the top hit (it is not when the number of
    # hits is even).
    if not results or max(score for _doc, score in results) < score_threshold:
        print(f"Unable to find matching results.")
        return None

    # Reorder documents so the most relevant ones sit at the edges of the context.
    reorder_docs = LongContextReorder().transform_documents(results)

    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in reorder_docs)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # The output parser turns the chat message into a plain string.
    model = ChatOpenAI() | StrOutputParser()
    return model.invoke(prompt)

def compare(actual_disease, predicted_disease_sentence):
    """Ask the chat model whether a prediction matches the actual disease.

    Fills ``PROMPT_TEMPLATE_COMPARE`` with both values, sends it to the chat
    model, prints the raw reply and converts it into a 0/1 verdict.

    Args:
        actual_disease: The ground-truth disease label.
        predicted_disease_sentence: The free-text prediction to judge. A
            missing or empty prediction is scored 0 without calling the model.

    Returns:
        int: 1 if the model judged the two similar, otherwise 0 (including
        when the reply contains no standalone 0 or 1).
    """
    import re  # local import: only this function needs it

    # Nothing to judge (e.g. retrieval found no match): count it as a miss
    # rather than sending the literal text "None" to the model.
    if not predicted_disease_sentence:
        return 0

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_COMPARE)
    prompt_compare = prompt_template.format(actual_disease=actual_disease, predicted_disease_sentence=predicted_disease_sentence)
    model = ChatOpenAI() | StrOutputParser()
    response_text = model.invoke(prompt_compare)
    print(response_text)

    # Take the first standalone 0/1 in the reply. Checking `'0' in reply`
    # first scored replies such as "1 (0 would mean different)" or "1.0" as 0.
    verdict = re.search(r"(?<!\d)[01](?!\d)", response_text)
    return int(verdict.group()) if verdict else 0

In [None]:
# Score the retrieval-augmented predictions against the labelled diseases in `df`.
correct_predictions = 0
total_predictions = len(df)
# Iterate through each row in the test dataset
for _index, row in df.iterrows():
    symptoms = row['Symptoms']
    actual_disease = row['Disease']
    predicted_disease_sentence = search(symptoms)
    print(f"\nsymptoms : {symptoms} \n actual_disease : {actual_disease}\npredicted_disease_sentence : {predicted_disease_sentence}\n\n")
    # search() returns None when nothing relevant was retrieved; that is a
    # miss, so skip the judge model instead of asking it to compare "None".
    if predicted_disease_sentence is not None:
        correct_predictions += compare(actual_disease,predicted_disease_sentence)

# Calculate accuracy (an empty test set scores 0.0 instead of dividing by zero)
accuracy = correct_predictions / total_predictions if total_predictions else 0.0
print("Accuracy:", accuracy)



symptoms :  joint_pain, vomiting, fatigue, high_fever, yellowish_skin, dark_urine, nausea, loss_of_appetite, abdominal_pain, yellowing_of_eyes, acute_liver_failure, coma, stomach_bleeding 
 actual_disease : Hepatitis E
predicted_disease_sentence : The symptoms mentioned are suggestive of liver disease, possibly hepatitis or liver failure. It is important to consult a healthcare provider for proper evaluation and diagnosis.


1

symptoms :  muscle_weakness, stiff_neck, swelling_joints, movement_stiffness, painful_walking 
 actual_disease : Arthritis
predicted_disease_sentence : The information provided suggests that the individual may be experiencing symptoms related to inflammatory myopathies, arthritis, and stiff-person syndrome. It is recommended to consult with a healthcare professional for a proper diagnosis and treatment plan.


1

symptoms :  chills, vomiting, fatigue, high_fever, headache, nausea, constipation, abdominal_pain, diarrhoea, toxic_look_(typhos), belly_pain 
 actual