In [1]:
# print("test")

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the source PDF, given relative to the notebook's working directory.
file_path = '../data/medical-book.pdf'
# Loader bound to that file; the Documents themselves come from the later loader.load() call.
loader = PyPDFLoader(file_path)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the PDF into a sequence of Document objects (4505 entries in the recorded run).
docs = loader.load()
# Peek at the first entry; in the recorded run it has metadata but an empty page_content.
docs[0]

Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2006-10-16T22:03:45+02:00', 'source': '../data/medical-book.pdf', 'total_pages': 4505, 'page': 0, 'page_label': 'i'}, page_content='')

In [4]:
import pprint
# How many Documents the loader returned.
print(len(docs))
# Pretty-print the first Document (metadata plus page_content).
pprint.pp(docs[0])

4505
Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2006-10-16T22:03:45+02:00', 'source': '../data/medical-book.pdf', 'total_pages': 4505, 'page': 0, 'page_label': 'i'}, page_content='')


In [5]:
# Show the metadata attached to the first Document; the recorded run lists
# producer, creator, creationdate, moddate, source, total_pages, page and page_label.
docs[0].metadata

{'producer': 'PDFlib+PDI 6.0.3 (SunOS)',
 'creator': 'Adobe Acrobat 6.0',
 'creationdate': '2006-10-16T20:19:33+02:00',
 'moddate': '2006-10-16T22:03:45+02:00',
 'source': '../data/medical-book.pdf',
 'total_pages': 4505,
 'page': 0,
 'page_label': 'i'}

In [6]:
# Metadata keys to keep on every page; everything else is dropped below.
ALLOWED_META = ['source','page']

# Discard pages whose text is missing, empty, or whitespace-only.
filtered_docs = list(filter(
    lambda page: page.page_content and page.page_content.strip(),
    docs,
))

# Trim each surviving page's metadata to the allowed keys.
# The Document objects are modified in place.
for d in filtered_docs:
    d.metadata = dict(
        (k, v) for k, v in d.metadata.items() if k in ALLOWED_META
    )

# Downstream cells keep using the name `docs`, now pointing at the cleaned pages.
docs = filtered_docs
pprint.pp(docs[1])

Document(metadata={'source': '../data/medical-book.pdf', 'page': 2}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n1\nA-B\nJACQUELINE L. LONGE, PROJECT EDITOR')


In [30]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=500 and chunk_overlap=100.
# NOTE(review): sizes are presumably measured in characters (the splitter's default) — confirm.
chunker = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=100)
# Split every cleaned page into chunks; the result feeds the embedding/indexing cells.
chunks = chunker.split_documents(docs)

In [31]:
# Total number of chunks produced by the splitter (45494 in the recorded run).
print(len(chunks))
# Spot-check one arbitrary chunk; index 200 assumes at least 201 chunks exist.
print(chunks[200])

45494
page_content='<http://www.acg.gi.org>.
American Institute of Ultrasound in Medicine. 14750
Sweitzer Lane, Suite 100, Laurel, MD 20707-5906.
(800) 638-5352. <http://www.aium.org>.
American Society of Radiologic Technologists. 15000
Central Ave., SE, Albuquerque, NM 87123-3917. (505)
298-4500. <http://www.asrt.org>.
Kurt Richard Sternlof
Abdominal wall defects
Definition
Abdominal wall defects are birth (congenital)
defects that allow the stomach or intestines to
protrude.
Description' metadata={'source': '../data/medical-book.pdf', 'page': 34}


In [26]:
# Import from langchain_community directly: `langchain.embeddings` only re-exports
# this class through a deprecated compatibility shim, while langchain_community is
# already a dependency of this notebook (see the PyPDFLoader import).
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model used for both indexing and querying. In the recorded
# run it yields 384-dimensional vectors, which is the dimension the Pinecone index
# is created with.
model_name ="sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [27]:
# Smoke-test the embedding model on a throwaway sentence.
test_embedded = embedding.embed_query("Hello bro what's up ??")
# Vector length (384 in the recorded run); the Pinecone index is created with this same dimension.
print(len(test_embedded))
# First ten components, just to confirm real numbers come back.
print(test_embedded[:10])

384
[-0.04287089407444, -0.06428621709346771, 0.04534759372472763, 0.005460003390908241, 0.0009019588469527662, -0.046489305794239044, 0.04170653596520424, -0.034828752279281616, 0.0437304750084877, -0.012657349929213524]


In [11]:
from dotenv import load_dotenv

# Load settings from a .env file so the API keys read in the next cell are available;
# the call returned True in the recorded run.
load_dotenv()

True

In [12]:
import os

# API keys that later cells read from the environment.
REQUIRED_ENV_VARS = ("PINECONE_API_KEY", "OPENAI_API_KEY")

# Fail fast with a readable message. The previous form,
# os.environ[name] = os.getenv(name), changed nothing when the variable was set
# and raised an opaque "TypeError: str expected, not NoneType" when it was not.
# An empty value is treated as missing because an empty API key cannot authenticate.
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in the environment or in the .env file before running this notebook."
    )

In [28]:
from pinecone import Pinecone
from pinecone import ServerlessSpec

# Pinecone client authenticated with the key taken from the environment.
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

index_name = "medibot-rag"

# Create the index only when it does not exist yet, so re-running the cell is harmless.
index_already_exists = pc.has_index(index_name)
if not index_already_exists:
    # Serverless deployment target for the new index.
    serverless_spec = ServerlessSpec(cloud='aws', region="us-east-1")
    # dimension=384 matches the length of the vectors produced by the embedding model.
    pc.create_index(
        name=index_name,
        spec=serverless_spec,
        metric='cosine',
        dimension=384,
        vector_type='dense',
    )

In [18]:
# Handle to the index named by index_name, obtained from the Pinecone client `pc`.
index = pc.Index(index_name)

In [32]:
from langchain_pinecone import PineconeVectorStore

# Embedding and upserting every chunk is the most expensive step of the notebook.
# The retrieved Documents carry UUID-looking ids, so re-running from_documents
# would presumably add a second copy of every chunk instead of overwriting —
# hence the guard: populate the index only while it is still empty.
# Caveat: a partially populated index (e.g. an interrupted upload) is reused as-is;
# delete the index and re-run to rebuild it from scratch.
index_stats = index.describe_index_stats()

if index_stats.total_vector_count > 0:
    # Vectors are already stored: just attach to the existing index.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )
else:
    # First run: embed all chunks and upload them.
    docsearch = PineconeVectorStore.from_documents(
        documents=chunks,
        embedding=embedding,
        index_name=index_name,
    )

In [33]:
# Only consider chunks whose metadata says they came from the medical book PDF.
source_filter = {"source":"../data/medical-book.pdf"}

# Three nearest chunks for a short probe query.
results = docsearch.similarity_search("An acne is", filter=source_filter, k=3)

# Print each hit's text followed by its metadata.
for res in results:
    print(f"{res.page_content} -- [{res.metadata}]")

Acne is a common skin disease characterized by
pimples on the face, chest, and back. It occurs when
the pores of the skin become clogged with oil, dead
skin cells, and bacteria.
Description
Acne vulgaris, the medical term for common acne,
is the most common skin disease. It affects nearly 17
million people in the United States. While acne can
arise at any age, it usually begins atpuberty and wor-
sens during adolescence. Nearly 85% of people
develop acne at some time between the ages of 12-25 -- [{'page': 54.0, 'source': '../data/medical-book.pdf'}]
KEY TERMS
Acne— A chronic inflammation of the sebaceous
glands that manifests as blackheads, whiteheads,
and/or pustules on the face or trunk.
Psoriasis— A skin disorder of chronic, itchy scaling
most commonly at sites of repeated minor trauma
(e.g. elbows, knees, and skin folds). It affects up to
2% of the population in Western countries—males
and females equally.
Rosacea— A chronic inflammation of the face, with
associated scattered round

In [34]:
# Expose the vector store as a retriever using similarity search with k=3,
# i.e. three chunks per query.
retriever = docsearch.as_retriever(search_type="similarity",
                                   search_kwargs={"k":3})
# Sanity check with the same probe query as the direct similarity_search call (no metadata filter here).
retriever.invoke("An acne is")

[Document(id='1dc7344e-f6aa-4117-ab22-3a585d79f318', metadata={'page': 54.0, 'source': '../data/medical-book.pdf'}, page_content='Acne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when\nthe pores of the skin become clogged with oil, dead\nskin cells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne,\nis the most common skin disease. It affects nearly 17\nmillion people in the United States. While acne can\narise at any age, it usually begins atpuberty and wor-\nsens during adolescence. Nearly 85% of people\ndevelop acne at some time between the ages of 12-25'),
 Document(id='af0dd434-29ca-44f8-bc01-7f56ba8e9658', metadata={'page': 3353.0, 'source': '../data/medical-book.pdf'}, page_content='KEY TERMS\nAcne— A chronic inflammation of the sebaceous\nglands that manifests as blackheads, whiteheads,\nand/or pustules on the face or trunk.\nPsoriasis— A skin disorder of chronic, itchy scaling\nmost commonly at sites of 

In [35]:
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain 
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model. The trailing {context} placeholder
# is a template variable of the resulting prompt.
system_prompt = "".join([
    "You are an assistant for question-answering tasks. ",
    "Use the following pieces of retrieved context to answer ",
    "the question. If you don't know the answer, say that you ",
    "don't know. Use three sentences maximum and keep the ",
    "answer concise.\n\n",
    "{context}",
])

# Two-message chat template: the system instructions, then the user's question
# supplied through the {input} variable.
chat_messages = [("system",system_prompt), ("human","{input}")]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [36]:
# Chat model used to generate answers; temperature 0.0 is the lowest-randomness setting.
llm = ChatOpenAI(
    model='gpt-4o-mini',
    temperature=0.0,
    api_key=os.environ['OPENAI_API_KEY'],
)

# Chain that combines the retrieved documents with the prompt and calls the model.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full pipeline: the retriever's output feeds the question-answering chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [38]:
# Run the retrieval + generation chain on a sample question; the "input" key
# matches the {input} placeholder of the prompt.
response = rag_chain.invoke({"input":"What is acne?"})
# The generated text is stored under the "answer" key of the result.
print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back, resulting from clogged pores due to oil, dead skin cells, and bacteria. It is medically known as acne vulgaris and affects nearly 17 million people in the United States, primarily during puberty and adolescence. The condition manifests as blackheads, whiteheads, and pustules.
