In [3]:
# Pinecone credential placeholders (blank in this copy of the notebook).
PINECONE_API_KEY = ""
# NOTE(review): PINECONE_API_ENV is not referenced anywhere in the visible
# pipeline cell — confirm whether it is still needed.
PINECONE_API_ENV = ""

In [1]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.llms import CTransformers
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
import os

# Configuration
# Take the Pinecone API key from the environment so the secret never has to be
# written into the notebook; falls back to the original empty placeholder when
# the variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
INDEX_NAME = ""  # name of the Pinecone index to create / connect to
REGION = "us-east-1"  # region handed to ServerlessSpec when the index is created
DIM = 384  # vector dimension used when the index is created

# Export the settings so libraries that read them from the environment see the
# same values as this script.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["PINECONE_ENVIRONMENT"] = REGION

# 1. Load PDF documents
def load_pdf(data):
    """Load the PDF files found under ``data``.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are selected
            with the glob ``*.pdf`` and parsed with ``PyMuPDFLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyMuPDFLoader)
    return pdf_loader.load()

# 2. Split text into chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; forwarded to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to 500,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 20, the value that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

# 3. Load documents and create chunks
# PDFs are read from the relative "data/" directory, so the result depends on
# the current working directory.
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)
# Reports how many chunks were produced (a count of chunks, not of characters).
print(f"Length of my chunks: {len(text_chunks)}")

# 4. Initialize embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Embed a throwaway query only to report the length of the resulting vector.
# NOTE(review): this value is printed but never compared with DIM, which is the
# dimension used when the index is created — confirm they agree if the model
# is changed.
print(f"Embedding dimension: {len(embeddings.embed_query('test'))}")

# 5. Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the serverless index only when no listed index is named INDEX_NAME.
if not any(entry["name"] == INDEX_NAME for entry in pc.list_indexes()):
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=REGION),
    )

# Handle used for the stats calls and the upserts that follow.
index = pc.Index(INDEX_NAME)
print(f"Current index stats: {index.describe_index_stats()}")

# 6. Prepare and upsert vectors to Pinecone
texts = [t.page_content for t in text_chunks]
# IDs are positional ("doc-0", "doc-1", ...).
# NOTE(review): re-running with fewer chunks than a previous run presumably
# leaves the higher-numbered vectors from the earlier run in the index —
# confirm whether stale vectors need to be deleted first.
ids = [f"doc-{i}" for i in range(len(texts))]

# Generate embeddings
vectors = embeddings.embed_documents(texts)

# Upsert in batches
BATCH_SIZE = 100
if vectors:
    print(f"✅ Embedding dimensions: {len(vectors[0])}")
    for start in tqdm(range(0, len(vectors), BATCH_SIZE), desc="Upserting to Pinecone"):
        stop = start + BATCH_SIZE
        # Slicing clamps at the end of the lists, so the final batch may be short.
        batch = [
            {
                "id": doc_id,
                # Coerce to plain Python floats before sending to the API.
                "values": [float(x) for x in vector],
                # The chunk text is stored under the "text" metadata key.
                "metadata": {"text": text},
            }
            for doc_id, vector, text in zip(
                ids[start:stop], vectors[start:stop], texts[start:stop]
            )
        ]
        index.upsert(vectors=batch)
else:
    # No chunks were loaded (e.g. no PDFs matched); indexing vectors[0] would
    # raise IndexError, so report the situation and skip the upsert instead.
    print("No text chunks to embed; skipping upsert.")

print(f"Final index stats: {index.describe_index_stats()}")

# 7. Create vector store from existing index
# Wraps the index named INDEX_NAME, using the same embedding model for queries.
# NOTE(review): retrieval presumably reads the chunk text back from the "text"
# metadata key written during the upsert — confirm against the vector store's
# text_key setting.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=INDEX_NAME,
    embedding=embeddings
)

# 8. Test similarity search
# Smoke test: fetch the 3 nearest chunks for a sample question.
query = "What are Allergies?"
docs = docsearch.similarity_search(query, k=3)

print("\n🔍 Test Query Results:")
for i, d in enumerate(docs):
    # Show at most the first 300 characters of each hit; "..." is always appended.
    print(f"\n🔹 Result {i+1}:\n{d.page_content[:300]}...")

# 9. Setup the QA chain
# Prompt text with two placeholders, {context} and {question}, matching the
# input_variables declared on PROMPT below.
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)

# Passed to RetrievalQA.from_chain_type as chain_type_kwargs so the chain uses
# this prompt.
chain_type_kwargs = {"prompt": PROMPT}

# 10. Initialize LLM
# The model file is referenced by a relative path, so it is resolved against
# the current working directory.
llm = CTransformers(
    model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    # Generation settings forwarded to the model.
    # NOTE(review): temperature 0.8 presumably makes answers fairly varied —
    # confirm this is intended for a question-answering bot.
    config={
        'max_new_tokens': 512,
        'temperature': 0.8
    }
)

# 11. Create RetrievalQA chain
# The retriever fetches 2 chunks per question (the test search above used 3).
# return_source_documents=True is requested, although the interactive loop
# below only prints the 'result' field.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

# 12. Interactive loop
# Read questions until the user types a quit command or closes the input.
print("\n💬 Medical Chatbot Ready! (Type 'quit' to exit)")
while True:
    try:
        # Strip once so the blank-input check and the quit check agree
        # (previously " quit " was sent to the model as a question).
        user_input = input("\nInput Prompt: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at the prompt (or exhausted stdin) ends the session
        # cleanly instead of with a traceback.
        print("\nGoodbye!")
        break

    if not user_input:
        continue

    if user_input.lower() in ('quit', 'exit', 'q'):
        print("Goodbye!")
        break

    try:
        result = qa.invoke({"query": user_input})
        print(f"\n✅ Response: {result['result']}")
    except Exception as e:
        # Top-level boundary: report the failure and keep the session alive.
        print(f"❌ Error: {str(e)}")

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


Length of my chunks: 5777
Embedding dimension: 384
Current index stats: {'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5779}},
 'total_vector_count': 5779,
 'vector_type': 'dense'}
✅ Embedding dimensions: 384


Upserting to Pinecone: 100%|██████████| 58/58 [00:51<00:00,  1.13it/s]


Final index stats: {'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5779}},
 'total_vector_count': 5779,
 'vector_type': 'dense'}

🔍 Test Query Results:

🔹 Result 1:
reaction. Allergic rhinitis is characterized by an itchy,
runny nose, often with a scratchy or irritated throat due
to post-nasal drip. Inflammation of the thin membrane
covering the eye (allergic conjunctivitis) causes redness,
irritation, and increased tearing in the eyes. Asthma caus-
es wheezing...

🔹 Result 2:
reactions is triggered by harmless, everyday substances.
This is the condition known as allergy, and the offend-
ing substance is called an allergen. Common inhaled
allergens include pollen, dust, and insect parts from tiny
house mites. Common food allergens include nuts, fish,
and milk.
Allergic re...

🔹 Result 3:
to commonly encountered environmental substances.
Purpose
Allergy is a reaction of the immune system. Nor-
mally, the immune system responds to fore