In [43]:
import os
import time

In [142]:
import logging
import numpy as np
from typing import List, Optional
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders.pdf import PyMuPDFLoader
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer


In [107]:
os.environ['PINECONE_API_KEY'] = "2a240c0f-452f-4729-a124-1b8d4640bd7d"

In [67]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def pdf_loader(path: str, glob_pattern: str = "*.pdf", loader_cls=PyMuPDFLoader) -> Optional[List[dict]]:

    try:
        logger.info(f"Starting to load documents from '{path}' with pattern '{glob_pattern}'")
        
        # Check if the directory exists
        if not os.path.isdir(path):
            logger.error(f"The directory '{path}' does not exist.")
            return None
        
        # Load the PDF files
        loader = DirectoryLoader(path, glob=glob_pattern, loader_cls=loader_cls)
        documents = loader.load()

        logger.info(f"Successfully loaded {len(documents)} documents.")

        return documents

    except FileNotFoundError as fnf_error:
        logger.error(f"File not found error: {fnf_error}")
    except AttributeError as attr_error:
        logger.error(f"Attribute error: {attr_error}. Check the structure of loaded documents.")
    except Exception as e:
        logger.error(f"An error occurred while loading PDF: {e}")

    return None


In [68]:
extracted_data = pdf_loader("D:\\Gen_AI\\END-TO-END-GenAI-RAG-APP\\data")

INFO:__main__:Starting to load documents from 'D:\Gen_AI\END-TO-END-GenAI-RAG-APP\data' with pattern '*.pdf'
INFO:__main__:Successfully loaded 2 documents.


In [136]:
def text_split(data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=120, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(data)
    
    return text_chunks

In [144]:
txt_chunks = text_split(extracted_data)

In [145]:
txt = [doc.page_content for doc in txt_chunks]

In [146]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# embeddings = model.encode(txt)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [147]:
# Generate embeddings for the text
embeddings = model.encode(txt, convert_to_tensor=True)  # Use convert_to_tensor if needed

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.05it/s]


In [149]:
metadatas = [{'source': doc.metadata['source']} for doc in txt_chunks]

In [150]:
index_name = "gen-ai-rag"
# DIMENTION = embeddings.shape[1] or 384

In [152]:
vectorstore_from_texts = PineconeVectorStore.from_documents(
        txt,
        embedding=embeddings.cpu().numpy().tolist(),  # Convert to list for Pinecone
        metadatas=metadatas,  # Pass metadata if needed
        index_name=index_name
    )

AttributeError: 'str' object has no attribute 'page_content'