In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from pathlib import Path  


In [2]:
import os

# Project root on the local machine; the PDFs to index live in its "pdfs" subfolder.
BASE_DIR = r"C:\Users\admin\OneDrive\Desktop\llm_pdf"
PDF_DIR = os.path.join(BASE_DIR, "pdfs")

# exist_ok=True turns this into a no-op when the folder already exists,
# so re-running the cell is safe.
os.makedirs(name=PDF_DIR, exist_ok=True)

print("PDF folder created at:", PDF_DIR)


PDF folder created at: C:\Users\admin\OneDrive\Desktop\llm_pdf\pdfs


In [3]:
import os

# Folder that holds the source PDFs.
PDF_DIR = r"C:\Users\admin\OneDrive\Desktop\llm_pdf\pdfs"

# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the listing reproducible across runs and machines.
# The extension check is case-insensitive, so "X.PDF" is picked up as well.
pdf_files = sorted(f for f in os.listdir(PDF_DIR) if f.lower().endswith(".pdf"))

print("PDFs found:", pdf_files)
print("Total PDFs:", len(pdf_files))


PDFs found: ['1706.03762v7.pdf', '2303.08774v6.pdf', '2312.11805v4.pdf', '2407.21783v3 (1).pdf', '2501.12948v1.pdf']
Total PDFs: 5


In [4]:
from langchain.document_loaders import PyPDFLoader
import os

# Folder that holds the source PDFs.
PDF_DIR = r"C:\Users\admin\OneDrive\Desktop\llm_pdf\pdfs"

# Everything returned by the loaders, accumulated across all PDF files.
documents = []

# Iterate over a sorted listing: os.listdir() order is arbitrary, and the
# order of `documents` determines the order of everything derived from it.
for pdf in sorted(os.listdir(PDF_DIR)):
    # Case-insensitive extension check; skip anything that is not a PDF.
    if not pdf.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(PDF_DIR, pdf)
    loader = PyPDFLoader(pdf_path)
    documents.extend(loader.load())

print("Total pages loaded:", len(documents))


Total pages loaded: 319


Chunking

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split the loaded documents into smaller chunks for embedding.
chunks = text_splitter.split_documents(documents)

print(f"Split {len(documents)} pages into {len(chunks)} chunks")


Split 319 pages into 2477 chunks


Filtering and Embedding

In [6]:
# 1. Clean text first
def clean_text(text):
    """Strip special tokens and line breaks from a piece of text.

    The literal markers "<EOS>" and "<pad>" are removed (in that order),
    every newline is replaced by a single space, and leading/trailing
    whitespace is trimmed. Interior runs of spaces are left as they are.
    """
    for marker in ("<EOS>", "<pad>"):
        text = text.replace(marker, "")
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

# Overwrite each chunk's text with its cleaned version (in place).
for chunk in chunks:
    chunk.page_content = clean_text(chunk.page_content)

# 2. Filter meaningful chunks: keep only those whose cleaned text is longer
#    than 50 characters.
filtered_chunks = list(filter(lambda c: len(c.page_content) > 50, chunks))
print("Chunks after filtering:", len(filtered_chunks))  # should match embeddings rows

# 3. Generate embeddings ONLY for filtered chunks
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model by name.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Embed the text of the filtered chunks only, so the number of embedding rows
# can be checked against the number of filtered chunks below.
texts_to_embed = [chunk.page_content for chunk in filtered_chunks]
embeddings = embedder.encode(texts_to_embed, show_progress_bar=True)

print("Embeddings shape:", embeddings.shape)
# Guard: the two collections are stored side by side later, so their lengths must agree.
assert len(filtered_chunks) == embeddings.shape[0], "Chunks and embeddings count mismatch!"


Chunks after filtering: 2471


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 78/78 [05:32<00:00,  4.27s/it]


Embeddings shape: (2471, 384)


Vector store (ChromaDB)

In [7]:
import os
import uuid
import numpy as np
from typing import List, Any
import chromadb

class VectorStore:
    """Manages document embeddings in a ChromaDB vector store.

    Documents are written with deterministic IDs through ``upsert``, so
    ingesting the same chunks again updates the existing rows instead of
    appending duplicates to the collection.
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "data"):
        """Open (or create) the collection persisted under ``persist_directory``.

        Args:
            collection_name: Name of the ChromaDB collection to use.
            persist_directory: Directory passed to ``chromadb.PersistentClient``.

        Raises:
            Exception: Re-raised from ``_initialize_store`` when the store
                cannot be opened.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persist directory, connect the client and get the collection.

        Prints the collection name and its current document count. Any
        exception is reported with ``print`` and then re-raised.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    @staticmethod
    def _stable_id(fingerprint: tuple, occurrence: int) -> str:
        """Build a deterministic ID from a chunk fingerprint and its occurrence index.

        The same (source, page, text, occurrence) always maps to the same ID,
        which is what lets ``upsert`` recognise an already-stored chunk.
        """
        # "\x1f" (unit separator) keeps the joined fields unambiguous.
        identity = "\x1f".join((*fingerprint, str(occurrence)))
        return f"doc_{uuid.uuid5(uuid.NAMESPACE_URL, identity).hex}"

    def add_documents(self, documents: List[Any], embeddings: np.ndarray, batch_size: int = 1000):
        """Store documents and their embeddings, replacing rows that already exist.

        Args:
            documents: Objects exposing ``page_content`` (text) and ``metadata``
                (a mapping); one per embedding.
            embeddings: One embedding per document, in the same order. A NumPy
                array or any sequence of numeric sequences is accepted.
            batch_size: Maximum number of rows sent to the collection per call.

        Raises:
            ValueError: If the two inputs differ in length or ``batch_size`` < 1.
            Exception: Re-raised from the collection when writing fails.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if len(documents) == 0:
            # Nothing to write; avoid calling the collection with empty lists.
            print("No documents to add to vector store")
            return
        print(f"Adding {len(documents)} documents to vector store...")

        ids, metadatas, documents_text, embeddings_list = [], [], [], []
        # How many times each (source, page, text) fingerprint has been seen in
        # this call; keeps IDs unique when identical chunks repeat in one batch.
        occurrences = {}

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            metadata = dict(doc.metadata)

            fingerprint = (
                str(metadata.get("source", "")),
                str(metadata.get("page", "")),
                doc.page_content,
            )
            occurrence = occurrences.get(fingerprint, 0)
            occurrences[fingerprint] = occurrence + 1
            ids.append(self._stable_id(fingerprint, occurrence))

            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            # np.asarray lets plain lists work as well as NumPy rows.
            embeddings_list.append(np.asarray(embedding).tolist())

        try:
            # upsert + deterministic IDs makes re-running the ingestion idempotent.
            for start in range(0, len(ids), batch_size):
                stop = start + batch_size
                self.collection.upsert(
                    ids=ids[start:stop],
                    embeddings=embeddings_list[start:stop],
                    metadatas=metadatas[start:stop],
                    documents=documents_text[start:stop]
                )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

    def show_persist_directory(self):
        """Print the absolute path of the persist directory."""
        print("Persist directory:", os.path.abspath(self.persist_directory))


In [8]:
# Open (or create) the "pdf_documents" collection persisted in the "data"
# directory, then store every filtered chunk together with its embedding.
vectorstore = VectorStore("pdf_documents", "data")
vectorstore.add_documents(documents=filtered_chunks, embeddings=embeddings)

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 8443
Adding 2471 documents to vector store...
Successfully added 2471 documents to vector store
Total documents in collection: 10914


In [9]:
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer


In [10]:
import chromadb

# Reconnect to the on-disk store directly through chromadb and look up the
# collection by name, as a check independent of the VectorStore wrapper.
client = chromadb.PersistentClient(path="data")
collection = client.get_collection(name="pdf_documents")

total_vectors = collection.count()
print("Total vectors in collection:", total_vectors)


Total vectors in collection: 10914


In [13]:
from dotenv import load_dotenv
import os

# Load variables from the project's .env file into the process environment.
load_dotenv(r"C:\Users\admin\OneDrive\Desktop\llm_pdf\.env")

key = os.getenv("GROQ_API_KEY")
# Never print the secret itself: cell output is saved inside the notebook and
# would expose the key to anyone who can read the file. Report only whether
# a value was found.
print("GROQ_API_KEY loaded:", bool(key))


[REDACTED: API key removed from saved output; revoke and rotate this key]


In [15]:
from groq import Groq

# Groq() is constructed without arguments.
# NOTE(review): presumably it reads GROQ_API_KEY from the environment — confirm.
client = Groq()

# Minimal single-turn prompt used as a connectivity smoke test.
hello_messages = [{"role": "user", "content": "Say hello in one sentence"}]

response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=hello_messages,
    max_tokens=50,
)

# Show the text of the first returned choice.
first_choice = response.choices[0]
print(first_choice.message.content)


Hello, how can I assist you today?
