In [1]:
import os

import pinecone
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

In [8]:
# Pinecone configuration. The API key is read from the environment so that it
# never lives in the notebook (or in version control).
# SECURITY: a real key used to be hard-coded in this cell. Treat that key as
# compromised and rotate it in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Warn rather than raise so the remaining constants are still defined;
    # creating the Pinecone client without a key is expected to fail later.
    print("WARNING: PINECONE_API_KEY is not set; export it before creating the Pinecone client.")
# The environment can be overridden; the previous hard-coded value is the default.
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "us-east-1-aws")
INDEX_NAME = "medicalbot"

In [9]:
# Create the Pinecone client through the already-imported `pinecone` module.
# A `from pinecone import Pinecone` here would rebind the name `Pinecone`, which
# the import cell already uses for LangChain's vector store
# (`from langchain.vectorstores import Pinecone`), and silently break any later
# call such as `Pinecone.from_texts(...)`.
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)


In [10]:
# Handle to the target index. Use the INDEX_NAME constant from the config cell
# instead of repeating the literal, so the index name is defined in one place.
index = pc.Index(INDEX_NAME)


In [11]:
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; only entries
            matching the glob ``*.pdf`` are loaded, each via ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [12]:
# Load the PDFs under ../data (relative to the notebook's working directory).
# NOTE(review): PyPDFLoader presumably yields one document per PDF page, so the
# count printed below is likely pages rather than files — confirm.
extracted_data = load_pdf("../data")
print(f"Loaded {len(extracted_data)} documents from PDF files.")

Loaded 637 documents from PDF files.


In [26]:
def text_split(extracted_data, chunk_size=200, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 200,
            the value this notebook originally hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value this notebook originally hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [27]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = text_split(extracted_data)
print("Length of text chunks:", len(text_chunks))

Length of text chunks: 15616


In [28]:
def download_hugging_face_embeddings():
    """Return a ``HuggingFaceEmbeddings`` instance for the all-MiniLM-L6-v2 model.

    Returns:
        The embeddings object constructed with
        ``model_name="sentence-transformers/all-MiniLM-L6-v2"``.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

# Build the embedding model once; the upload cell passes this same object to
# store_embeddings_in_batches.
embeddings = download_hugging_face_embeddings()

In [25]:
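# Disabled: single-call bulk upload, kept for reference only. The upload is done by
# store_embeddings_in_batches (defined below), which upserts in smaller batches.
# `index_name` was not defined in this notebook; the config constant is INDEX_NAME.
# docsearch = Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=INDEX_NAME)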

In [29]:
def store_embeddings_in_batches(index, text_chunks, embeddings, batch_size=100):
    """
    Embed text chunks and upsert them into a Pinecone index in small batches.

    Each chunk is stored as an ``(id, vector, metadata)`` tuple: the id is
    ``chunk-<position in text_chunks>`` and the metadata is
    ``{"text": <chunk text>}``. A batch whose upsert raises is reported and
    skipped so that the remaining batches are still attempted; if any batch
    failed, a one-line summary is printed at the end.

    Args:
        index: Object exposing ``upsert(vectors)``.
        text_chunks: Sequence of objects with a ``page_content`` attribute.
        embeddings: Object exposing ``embed_documents(texts)`` that returns one
            vector per input text, in input order.
        batch_size: Number of chunks embedded and upserted per request.
            Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if
            ``embed_documents`` returns a different number of vectors than
            the number of texts it was given.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing and silently skip the upload.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print("Storing embeddings in batches...")
    failed_batches = []
    for start in range(0, len(text_chunks), batch_size):
        batch_number = start // batch_size + 1
        texts = [chunk.page_content for chunk in text_chunks[start:start + batch_size]]

        # Embed the whole batch with a single call instead of one call per chunk.
        batch_vectors = embeddings.embed_documents(texts)
        if len(batch_vectors) != len(texts):
            # zip() below would otherwise drop chunks without any indication.
            raise ValueError(
                f"embed_documents returned {len(batch_vectors)} vectors for {len(texts)} texts"
            )

        vectors = [
            (f"chunk-{start + offset}", vector, {"text": text})
            for offset, (text, vector) in enumerate(zip(texts, batch_vectors))
        ]
        try:
            index.upsert(vectors)
            print(f"Stored batch {batch_number} with {len(vectors)} embeddings.")
        except Exception as e:
            # Best effort: report the failure and continue with the next batch.
            failed_batches.append(batch_number)
            print(f"Error storing batch {batch_number}: {e}")

    if failed_batches:
        # Individual error lines are easy to miss in a long log; repeat them once.
        print(f"{len(failed_batches)} batch(es) failed to store: {failed_batches}")


In [30]:
# Embed every chunk and upsert it into the Pinecone index, 50 vectors per request.
store_embeddings_in_batches(index, text_chunks, embeddings, batch_size=50)


Storing embeddings in batches...
Stored batch 1 with 50 embeddings.
Stored batch 2 with 50 embeddings.
Stored batch 3 with 50 embeddings.
Stored batch 4 with 50 embeddings.
Stored batch 5 with 50 embeddings.
Stored batch 6 with 50 embeddings.
Stored batch 7 with 50 embeddings.
Stored batch 8 with 50 embeddings.
Stored batch 9 with 50 embeddings.
Stored batch 10 with 50 embeddings.
Stored batch 11 with 50 embeddings.
Stored batch 12 with 50 embeddings.
Stored batch 13 with 50 embeddings.
Stored batch 14 with 50 embeddings.
Stored batch 15 with 50 embeddings.
Stored batch 16 with 50 embeddings.
Stored batch 17 with 50 embeddings.
Stored batch 18 with 50 embeddings.
Stored batch 19 with 50 embeddings.
Stored batch 20 with 50 embeddings.
Stored batch 21 with 50 embeddings.
Stored batch 22 with 50 embeddings.
Stored batch 23 with 50 embeddings.
Stored batch 24 with 50 embeddings.
Stored batch 25 with 50 embeddings.
Stored batch 26 with 50 embeddings.
Stored batch 27 with 50 embeddings.
Stor