In [4]:
import os
from dotenv import load_dotenv
load_dotenv()

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
import pinecone


def read_doc(directory):
    all_docs = []
    for filename in os.listdir(directory):
        if filename.lower().endswith(".pdf"):
            file_path = os.path.join(directory, filename)
            loader = PyPDFLoader(file_path)
            docs = loader.load()
            all_docs.extend(docs)
    return all_docs

docs = read_doc("documents/")
print("Documents loaded:", len(docs))


def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(docs)

chunks = chunk_data(docs)
print("Chunks created:", len(chunks))


embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Test
vec = embeddings.embed_query("What is one stop centre?")
print("Embedding vector:", vec)
print("Vector dimensions:", len(vec))




Documents loaded: 3
Chunks created: 11
Embedding vector: [0.05176348611712456, -0.03107583522796631, -0.07335510104894638, -0.018481694161891937, -0.007414842955768108, 0.06512859463691711, -0.0017163017764687538, -0.0025140715297311544, 0.006159436888992786, 0.025846419855952263, -0.008622665889561176, -0.028125479817390442, -0.03134050965309143, 0.0029935238417237997, -0.014085759408771992, -0.009684490039944649, -0.0011472911573946476, -0.056019846349954605, 0.05678146332502365, -0.01560422033071518, 0.024302171543240547, 0.04136282205581665, 0.038397591561079025, 0.02910608798265457, -0.14755547046661377, 0.030727727338671684, -0.07850933074951172, 0.09171260893344879, 0.043839067220687866, -0.0769306868314743, -0.0010054270969703794, -0.0029152564238756895, 0.02314409427344799, 0.06398371607065201, -0.056194890290498734, 0.00035725338966585696, 0.0026570619083940983, 0.031058652326464653, 0.053027618676424026, 0.013280579820275307, 0.05356848984956741, -0.026735378429293633, 0.034

In [None]:

pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "pinecone-chatbot"
index = pc.Index(index_name)

#Store embeddings in Pinecone
vectorstore = PineconeVectorStore.from_documents(
    chunks,
    embedding=embeddings,
    index_name=index_name
)

print("Vectorstore ready!")

Vectorstore ready!
