<a href="https://colab.research.google.com/github/somesh-awasthi/NLP-PROJECT/blob/main/NLP_Project_V1_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install langchain langchain-openai
!pip install ctransformers sentence-transformers langchain-chroma
!pip install pandas nltk spacy PyPDF
%pip install --upgrade --quiet  sentence-transformers langchain-chroma langchain langchain-openai > /dev/null

In [2]:
# connecting to database
from google.colab import drive
drive.mount("/content/drive")
path="/content/drive/MyDrive/Colab Notebooks/data"

Mounted at /content/drive


In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
# Load documents from PDF
loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()



In [4]:
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords

nltk.download('stopwords')

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Tokenization and POS tagging using SpaCy
    doc = nlp(text)

    # Filtering out tokens based on POS tags and dependency parsing
    filtered_tokens = [token.text.lower() for token in doc if token.pos_ not in ["SPACE", "X"] and token.dep_ not in ["det", "punct"]]

    # Stopword removal
    filtered_tokens = [token for token in filtered_tokens if token not in stopwords.words('english')]

    # Lemmatization
    lemmatized_tokens = [token.lemma_ for token in nlp(" ".join(filtered_tokens))]

    return " ".join(lemmatized_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Preprocess each document
for doc in documents:
    doc.page_content = preprocess_text(doc.page_content)

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Split the preprocessed documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)
chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

Split 3263 documents into 13402 chunks.


In [7]:
# Save preprocessed chunks to Chroma

import os
import getpass
os.environ['OPENAI_API_KEY'] = getpass.getpass('Enter your OpenAI API key:')

from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Load the document, split it into chunks, embed each chunk and load it into the vector store.
db = Chroma.from_documents(chunks, OpenAIEmbeddings(), persist_directory="./drive/MyDrive/Colab Notebooks/chroma_db")

Enter your OpenAI API key:··········


In [8]:
from langchain.prompts import ChatPromptTemplate
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [10]:
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain_community.document_transformers import (
    LongContextReorder,
)
# Start conversation loop
context_text = ""
while True:
    query_text = input("Enter your query (type 'quit' to exit): ")

    if query_text.lower() == 'quit':
        break

    # Search the DB.
    results = db.similarity_search_with_relevance_scores(query_text, k=7)

    # Reorder documents
    reordering = LongContextReorder()
    reorder_docs = reordering.transform_documents(results)

    if len(reorder_docs) == 0 or reorder_docs[0][1] < 0.7:
        print(f"Unable to find matching results.")
        continue

    new_context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in reorder_docs])
    context_text += "\n\n---\n\n" + new_context_text
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    # print(prompt)

    model = ChatOpenAI()
    response_text = model.invoke(prompt)

    # Load the model
    sources = [doc.metadata.get("source", None) for doc, _score in reorder_docs]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

Enter your query (type 'quit' to exit): I'm having fever since two weeks, also sometimes vomiting with watery eyes, what could be the disease?
Response: content='Based on the symptoms described in the context, the possible disease could be viral conjunctivitis caused by adenovirus.' response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 1485, 'total_tokens': 1510}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_c2295e73ad', 'finish_reason': 'stop', 'logprobs': None} id='run-dd7f7baf-6c87-4ff6-b4e1-3ace22710114-0'
Sources: ['/content/drive/MyDrive/Colab Notebooks/data/medical-diagnosis.pdf', '/content/drive/MyDrive/Colab Notebooks/data/medical-diagnosis.pdf', '/content/drive/MyDrive/Colab Notebooks/data/Medical_book.pdf', '/content/drive/MyDrive/Colab Notebooks/data/Medical_book.pdf', '/content/drive/MyDrive/Colab Notebooks/data/medical-diagnosis.pdf', '/content/drive/MyDrive/Colab Notebooks/data/medical-diagnosis.pdf', '/content/drive/MyDrive/Colab Noteb