In [None]:
import os

# Move to the parent of the directory the kernel started in.
# A bare relative os.chdir("../") climbs one level further on every re-run of
# this cell; remembering the starting directory on the first run and always
# resolving ".." against it keeps the cell idempotent within a kernel session.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, ".."))

In [None]:
# Show the current working directory to confirm the chdir in the previous cell
%pwd

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Load and extract text from PDF files
def load_pdf_files(data):
    """
    Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; only names matching ``*.pdf``
            are picked up.

    Returns:
        The list of Document objects produced by the loader (each carries
        ``page_content`` and ``metadata``).
    """
    # DirectoryLoader selects the files, PyPDFLoader parses each of them
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load and extract the PDF files stored in the 'data' directory
extracted_data = load_pdf_files(data="data")

In [None]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build slimmed-down copies of the given documents.

    Args:
        docs: Document objects to reduce (e.g. the loader output).

    Returns:
        A new list with one Document per input, keeping the original
        page_content and a metadata dict holding only the 'source' entry
        (None when the input metadata has no 'source').
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [None]:
# Keep only page_content and the 'source' metadata of every extracted document
minimal_docs = filter_to_minimal_docs(docs=extracted_data)

In [None]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """
    Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        minimal_docs: Document objects to split.
        chunk_size: Maximum size of a chunk, as measured by the splitter
            (defaults to 500, the value previously hard-coded).
        chunk_overlap: Amount shared between consecutive chunks so context is
            not lost at the boundary (defaults to 20, as before).

    Returns:
        The list of chunked Document objects returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [None]:
# Chunk the minimal documents and report how many chunks were produced
texts_chunk = text_split(minimal_docs=minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

In [None]:
# Preview only the first few chunks; echoing the whole list floods the cell
# output and bloats the saved notebook.
texts_chunk[:3]

In [None]:
# Display the end of one chunk and the start of the next to verify overlap

# The upper bound is clamped so that texts_chunk[i+1] always exists; with the
# fixed range(6, 8) a corpus producing fewer than 9 chunks raised IndexError.
for i in range(6, min(8, len(texts_chunk) - 1)):
    print(f"\nChunk {i+1} end:\n{texts_chunk[i].page_content[-50:]}")
    print(f"Chunk {i+2} start:\n{texts_chunk[i+1].page_content[:50]}")


In [None]:
# Download HuggingFace Embeddings Model

from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Create the HuggingFace embeddings wrapper used by this notebook.

    Returns:
        A HuggingFaceEmbeddings instance configured with the
        "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Single embeddings object reused by the vector-store cells below
embedding = download_embeddings()

In [None]:
# Display the embeddings object to confirm it was created
embedding

In [None]:
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv (presumably from a local .env file — confirm its location)
load_dotenv()

In [None]:
# Get API keys from environment variables
import warnings

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LLAMA_API_KEY = os.getenv("LLAMA_API_KEY")

# Set API keys in environment variables
# os.environ only accepts str values, so assigning a missing key (None) raised
# "TypeError: str expected, not NoneType" and aborted the cell. Keys that are
# absent are now reported with a warning and skipped instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
    ("LLAMA_API_KEY", LLAMA_API_KEY),
):
    if _key_value is None:
        warnings.warn(f"{_key_name} is not set; clients that need it will fail later.")
    else:
        os.environ[_key_name] = _key_value

In [None]:
# Initialize Pinecone client with the API key

from pinecone import Pinecone 
pinecone_api_key = PINECONE_API_KEY     # Alias of the key read from the environment earlier

pc = Pinecone(api_key=pinecone_api_key) # Client handle used by the index cells below

In [None]:
# Create Pinecone index if it doesn't exist to store the embeddings

from pinecone import ServerlessSpec 

index_name = "medical-chatbot"

# Create the index only on the first run; later runs reuse the existing one
if not pc.has_index(index_name):
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,      # Must equal the length of the vectors stored in the index
        metric="cosine",    # Similarity is computed with cosine distance
        spec=serverless_spec,
    )

# Handle to the Pinecone index
index = pc.Index(index_name)

In [None]:
# Create Pinecone Vector Store using LangChain for handling document embeddings 

from langchain_pinecone import PineconeVectorStore

# Embed and upsert the chunks only when the index is still empty.
# Every unguarded run of from_documents embeds and upserts the whole corpus
# again (and, unless explicit ids are supplied, presumably under newly
# generated ids — confirm), so a Restart & Run All would duplicate the
# stored vectors. When the index already holds vectors we just connect to it.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunk,
        embedding=embedding,
        index_name=index_name
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding
    )

In [None]:
# Connect to the existing Pinecone index so its stored embeddings can be queried
docsearch = PineconeVectorStore.from_existing_index(embedding=embedding, index_name=index_name)

# Add more data to the existing Pinecone index

In [None]:
# Create a sample Document object

dswith = Document(
    metadata={"source": "VSCode"},
    page_content="this is a sampple text chunk to test adding a document object in the pinecone index.",
)

In [None]:
# Add the sample document to the Pinecone index through the docsearch vector store.
# A fixed id turns a re-run of this cell into an overwrite of the same vector
# instead of inserting another copy of the sample document.
docsearch.add_documents(documents=[dswith], ids=["sample-vscode-doc"])

In [None]:
# Build a retriever on top of the docsearch vector store; each query returns
# the 3 most similar documents (the index was created with the cosine metric)

retriever = docsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [None]:
retrieved_docs = retriever.invoke("What is Acne?")      # Run the retriever on a test query ("What is Acne?")...
retrieved_docs                                          # ...and display the documents it returned

project steps continued

In [None]:
from langchain_openai import ChatOpenAI
# from langchain_llama import ChatLlama

chatModel = ChatOpenAI(model="gpt-4o")    # Chat model that generates the answers; presumably reads OPENAI_API_KEY from the environment — confirm
# chatModel = ChatLlama(model="llama-2-70b-chat-hf")    # Alternative backend, currently disabled

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System message: sets the assistant's role and answer style; the retrieved
# documents are substituted for the {context} placeholder at the end.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# from_messages builds a chat-style prompt: a list of role-tagged messages
# (system, human) rather than a single flat string. Only the current question
# ({input}) is included; no earlier conversation turns are injected here.
chat_messages = [("system", system_prompt), ("human", "{input}")]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [None]:
# Create the RAG chain using the retriever and the LLM model

# Step 1: chain that combines the documents it is given with the prompt and sends them to the LLM to generate an answer
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
# Step 2: chain that first runs the retriever on the question and then hands the retrieved documents to question_answer_chain
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
# The retrieved documents are passed to the documents chain as the prompt's context automatically; they do not have to be supplied by hand.

In [None]:
# Ask a question through the RAG chain and print only the generated answer
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

In [None]:
# Ask about the sample text, which matches the wording of the sample document added to the index earlier
response = rag_chain.invoke({"input": "is there are any sample text?"})
print(response["answer"])