In [2]:
# Show the kernel's current working directory; the relative "data" path used
# later in this notebook is resolved against it.
# NOTE(review): the recorded output is an absolute, machine-specific path —
# consider clearing it before sharing the notebook.
%pwd


'd:\\Project\\medical_chat_bot\\medical-chat-bot\\research'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
## extract text from pdf files
def load_pdf_files(data):
    """
    Load the PDF files of a directory through LangChain's DirectoryLoader.

    Args:
        data: Directory path, handed to DirectoryLoader unchanged.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, with ``PyPDFLoader`` used as the per-file loader.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()
    

In [5]:
# Load the PDFs from the relative "data" directory (resolved against the
# kernel's working directory).
extracted_data=load_pdf_files("data")

In [6]:
# Number of items returned by load_pdf_files.
len(extracted_data)

637

In [7]:
## Keep only what we need from each document: its content and its source (filter step)

from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Strip each Document down to its text plus a single metadata field.

    Args:
        docs: Documents to reduce; they are only read, never modified.

    Returns:
        A new list, in the same order as ``docs``, where every Document
        keeps the original ``page_content`` and carries a metadata dict
        holding only ``"source"`` (``None`` when the input metadata has no
        such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [8]:
# Reduce every loaded document to its page_content plus a "source"-only
# metadata dict.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [9]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """
    Split documents into smaller, overlapping chunks.

    Args:
        minimal_docs: Documents to split; passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size given to the splitter. Defaults to
            500, the value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks given to the
            splitter. Defaults to 20, the value this notebook has always
            used.

    Returns:
        The chunks returned by ``split_documents``.
    """
    # Sizes are parameters (not literals) so chunking can be tuned without
    # editing this function; the defaults keep existing calls unchanged.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [10]:
# Chunk the filtered documents and report how many chunks were produced.
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 5859


In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Build the HuggingFace embeddings object used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Instantiate the embedding model once; later cells reuse this object.
embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [12]:
# Display the embedding object's repr to inspect its configuration.
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [13]:
# Embed a sample string as a smoke test of the model.
# NOTE(review): the length of this list should equal the dimension (384) the
# Pinecone index is created with later in this notebook — confirm with
# len(vector).
vector=embedding.embed_query("hello shubham")
# Displaying the bare name prints the whole list; slice it (e.g. vector[:5])
# if the output gets too long.
vector

[-0.04086869955062866,
 0.023624388501048088,
 0.00789125170558691,
 0.045094557106494904,
 -0.058257244527339935,
 -0.056100789457559586,
 0.11548370867967606,
 -0.006508739199489355,
 -0.010002585127949715,
 -0.09063243120908737,
 0.010869050398468971,
 -0.050015613436698914,
 0.028867637738585472,
 -0.0073052458465099335,
 0.014772206544876099,
 0.00860617682337761,
 0.023435337468981743,
 -0.04403690621256828,
 -0.11360635608434677,
 -0.01792936585843563,
 -0.06511654704809189,
 -0.02548929676413536,
 -0.0246730949729681,
 0.0182252898812294,
 -0.04478014260530472,
 -0.01735197752714157,
 0.003250259906053543,
 0.05477423593401909,
 0.006844806019216776,
 -0.040267422795295715,
 0.055449169129133224,
 0.07809881120920181,
 0.08657767623662949,
 0.017208972945809364,
 0.011137226596474648,
 0.06839694082736969,
 -0.05235503613948822,
 0.03679781034588814,
 0.021153492853045464,
 -0.03305024281144142,
 -0.03903850540518761,
 -0.03348669409751892,
 -0.0013776784762740135,
 -0.00797325

In [14]:
from dotenv import load_dotenv
import os
# Load variables from a .env file into the environment (python-dotenv); the
# API keys read in the next cell are expected to come from there.
load_dotenv()

True

In [29]:
def _require_env(name):
    """
    Return the value of the environment variable ``name``.

    Raises:
        RuntimeError: If the variable is missing or empty. Without this
            check a missing key comes back as ``None`` and the
            ``os.environ[...] = None`` assignment below fails with an opaque
            ``TypeError: str expected, not NoneType``.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set. "
            "Add it to your .env file or export it before running this notebook."
        )
    return value


PINECONE_API_KEY=_require_env("PINECONE_API_KEY")
OPENAI_API_KEY=_require_env("OPENAI_API_KEY")

# Write the validated keys back to the process environment, as the original
# cell did, so code that reads os.environ directly sees them.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [30]:
from pinecone import Pinecone
# Create the Pinecone client from the key loaded from the environment.
pinecone_api_key=PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [17]:
# Display the client object to confirm it was constructed.
pc

<pinecone.pinecone.Pinecone at 0x2d2972d3760>

In [18]:
from pinecone import ServerlessSpec

# Name of the Pinecone index used by the vector store cells below.
index_name="medical-chatbot"

# Create the index only when it does not exist yet, so this cell can be
# re-run without attempting a second create.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # Must equal the embedding size; 384 is the word_embedding_dimension
        # reported for all-MiniLM-L6-v2 in the embedding repr earlier in this
        # notebook.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Handle to the index for direct Pinecone operations.
index=pc.Index(index_name)

In [19]:
from langchain_pinecone import PineconeVectorStore

# Set to True to upload the chunks again even when the index already holds
# vectors (for example after the source PDFs changed).
FORCE_REINDEX = False

# from_documents embeds and upserts every chunk each time it is called, so
# re-running this cell would store the same chunks again under new ids.
# Upload only when the index is empty; otherwise attach to the existing index.
# NOTE(review): an index left partially filled by an interrupted upload also
# counts as "not empty" — use FORCE_REINDEX (after clearing the index) then.
index_stats = index.describe_index_stats()
if FORCE_REINDEX or index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunk,
        embedding=embedding,
        index_name=index_name
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding
    )

In [31]:
# Expose the vector store as a similarity-search retriever that returns
# k=3 results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [32]:
# Smoke-test retrieval with a sample question and display what comes back.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='1f692748-c2d1-4500-ba10-c46b17d6d9e0', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='5d7abbfc-0f4a-4e12-9a17-752033d589d6', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='696d8c36-e4f6-4a21-9c45-8479c6bc1ea0', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged 

In [33]:
from langchain_openai import ChatOpenAI

# Chat model used to generate the answers.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment set
# earlier in this notebook — confirm.
chatModel=ChatOpenAI(model="gpt-4o")

In [34]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [35]:
# System message for the answering step. Adjacent string literals are
# concatenated into one string; {context} is left as a template placeholder.
# NOTE(review): "an Medical" is a typo in the prompt text; it is left
# untouched here because the string is sent to the model verbatim.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions above, followed by the
# user's question supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [36]:
# Answering step: combines the chat model with the prompt template defined
# above (which carries the {context} placeholder).
question_answer_chain=create_stuff_documents_chain(chatModel,prompt)
# Full pipeline: the retriever plus the answering step; invoked with an
# {"input": ...} dict and read through its "answer" key in the next cell.
rag_chain=create_retrieval_chain(retriever,question_answer_chain)

In [37]:
# Ask a sample question end to end and print the generated answer.
# NOTE(review): the recorded run of this cell failed with an OpenAI 429
# "insufficient_quota" error, so no answer is shown in the saved output;
# re-run once the account has quota.
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})
print(response["answer"])

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}