In [None]:
import os 
# Switch from the notebook's folder to its parent directory (presumably the
# project root, since later cells read the relative path "data").
# Guarded with a sentinel so that re-running this cell in the same kernel does
# not climb one directory higher on every execution.
if "PROJECT_ROOT" not in globals():
    os.chdir("../")
    PROJECT_ROOT = os.getcwd()

In [None]:
# Show the current working directory to confirm the chdir above took effect.
%pwd


In [None]:
from langchain.document_loaders import  PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Files matching ``*.pdf`` are each
            read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for those files (the rest
        of this notebook treats it as a list of ``Document`` objects).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load every PDF from the "data" directory (relative to the current working directory).
extracted_docs = load_pdf_files("data")

In [None]:
# Preview only the first few documents; echoing the whole list floods the
# notebook output and bloats the saved file.
extracted_docs[:3]

In [None]:
# Number of documents returned by the loader.
len(extracted_docs)

In [None]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Strip each document's metadata down to its ``source`` entry.

    Args:
        docs: Documents whose ``page_content`` and ``metadata`` are read.

    Returns:
        A new list of ``Document`` objects, in input order, each carrying the
        original ``page_content`` and a metadata dict of the form
        ``{"source": ...}``. The value is ``None`` when the input metadata has
        no ``"source"`` key. The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [None]:
# Keep only page content and the "source" metadata entry for each loaded document.
minimal_docs= filter_to_minimal_docs(extracted_docs)

In [None]:
# Preview only the first few trimmed documents instead of echoing the whole list.
minimal_docs[:3]

In [None]:
#split the data into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, slightly overlapping text chunks.

    Args:
        minimal_docs: Documents to split; handed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Target maximum chunk length, measured with ``len``
            (i.e. in characters). Defaults to 500.
        chunk_overlap: Amount of text, in the same units, shared between
            consecutive chunks. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents`` for ``minimal_docs``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(minimal_docs)

In [None]:
# Chunk the metadata-trimmed documents and report how many chunks resulted.
text_chunks = text_split(minimal_docs)
print(f"number of chunks in docs: {len(text_chunks)}")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings():
    """Create and return the HuggingFace embeddings object used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    # NOTE(review): the Pinecone index in this notebook is created with
    # dimension=384; that presumably matches this model's output size — confirm
    # before swapping in a different model.
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


# Shared embeddings object reused by the cells below.
embedding = download_embeddings()

In [None]:
# Smoke-test the embedding model on a short query. Report the vector length
# (it must equal the `dimension` the Pinecone index is created with) and show
# only the first few components rather than printing the entire vector.
vector = embedding.embed_query("Hello world")
print(f"embedding dimension: {len(vector)}")
vector[:5]

In [None]:
from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment; the API keys
# read in the next cell are expected to come from there.
load_dotenv()

In [None]:
# Read the service credentials from the process environment (populated by the
# load_dotenv() call in the previous cell).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with a readable message. Without this check a missing key reaches
# `os.environ[...] = None` below, which raises an opaque
# "TypeError: str expected, not NoneType".
missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not key_value
]
if missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_keys)
        + ". Define them in your .env file or shell before running this notebook."
    )

# Re-export so libraries that look the keys up in os.environ can find them.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
from pinecone import Pinecone
# Keep the key under a lowercase alias and build the Pinecone client with it.
pinecone_api_key=PINECONE_API_KEY

pc=Pinecone(api_key=pinecone_api_key)

In [None]:
# Display the client object to confirm it was constructed.
pc

In [None]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

# Create the serverless index only when it does not exist yet, so this cell can
# be re-run without attempting to create it a second time.
index_already_exists = pc.has_index(index_name)
if not index_already_exists:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the length of the embedding vectors
        metric="cosine",
        spec=serverless_spec,
    )

# Handle to the index (existing or newly created).
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Upload the chunks only while the index is still empty. Running
# `from_documents` unconditionally embeds and inserts every chunk again on each
# execution of this cell, which leaves duplicate vectors in the index.
# To re-ingest changed PDFs, delete (or empty) the index first.
# NOTE(review): index statistics can lag briefly behind recent writes — confirm
# the count has settled before relying on this guard right after an upload.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embedding,
        index_name=index_name,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )

In [None]:
#load existing index

from langchain_pinecone import PineconeVectorStore
# Rebind `docsearch` to the already-populated index without uploading anything,
# using the same embedding object for queries.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [None]:
# Similarity-search retriever over the vector store, configured with k=1
# (a single best-matching chunk per query).
# NOTE(review): the name is misspelled ("retriver"), but later cells reference
# it under this spelling, so it is left as is.
retriver= docsearch.as_retriever(search_type="similarity", search_kwargs={"k":1})

In [None]:
# Sanity-check retrieval on its own, before any LLM is involved.
retrieved_docs = retriver.invoke("What is diabetes?")

In [None]:
# Inspect what the retriever returned for the test question.
retrieved_docs

In [None]:
from langchain_openai import ChatOpenAI
# Chat model that generates the final answers.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment set in
# an earlier cell — confirm.
chatModel= ChatOpenAI(model="gpt-3.5-turbo")

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System instructions for the answering model. Adjacent string literals are
# joined with no separator, so every fragment except the last of a sentence run
# must end with an explicit space; otherwise words fuse together
# (e.g. "tasksuse", "knowdon't"). "{context}" is a template placeholder that
# must stay intact for the retrieved documents to be inserted.
system_prompt = (
    "You are a medical assistant for question-answering tasks. "
    "Use the following pieces of context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions (which carry the
# "{context}" placeholder) followed by the user's question in "{input}".
system_message = ("system", system_prompt)
user_message = ("user", "Answer the following question: {input}")
prompt = ChatPromptTemplate.from_messages([system_message, user_message])

In [None]:
# Combine the chat model and the prompt into a document-answering chain, then
# put the retriever in front of it to form the full retrieval chain.
question_answer_chain = create_stuff_documents_chain(chatModel,prompt)
rag_chain= create_retrieval_chain(retriver,question_answer_chain)

In [None]:
# Ask one question end to end and print only the generated answer text.
question = {"input": "What is diabetes?"}
response = rag_chain.invoke(question)
print(response["answer"])