In [None]:
pip install langchain_community

In [None]:
pip install pypdf

In [None]:
pip install langchain_huggingface sentence-transformers

In [None]:
pip install langchain_qdrant

In [None]:
pip install chromadb

In [None]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def create_chunks_from_pdf(data_path, chunk_size, chunk_overlap):
    """Load the files under a directory with PyPDFLoader and split them into chunks.

    Args:
        data_path (str): Directory handed to DirectoryLoader.
        chunk_size (int): Chunk size given to the splitter (measured with ``len``).
        chunk_overlap (int): Overlap between consecutive chunks given to the splitter.

    Returns:
        list: The LangChain Document objects produced by ``load_and_split``,
        one per chunk.
    """
    # The splitter does not depend on the loader, so configure it up front.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    # DirectoryLoader walks the directory and delegates each file to PyPDFLoader.
    pdf_loader = DirectoryLoader(data_path, loader_cls=PyPDFLoader)

    # Load and chunk in a single call.
    return pdf_loader.load_and_split(text_splitter=splitter)

# Raw string keeps the Windows backslashes literal: "\R" and "\d" in a normal
# string literal are invalid escape sequences (SyntaxWarning on Python 3.12+).
# The resulting path value is unchanged.
data_path = r"D:\RAG Project1\data"
chunk_size = 500
chunk_overlap = 50

# Build the chunked Document list used by the rest of the notebook.
docs = create_chunks_from_pdf(data_path, chunk_size, chunk_overlap)

In [70]:
# Display the text of the third chunk as a quick sanity check of the split.
docs[2].page_content

'less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-\nto-German translation task, improving over the existing best results, including\nensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,\nour model establishes a new single-model state-of-the-art BLEU score of 41.0 after\ntraining for 3.5 days on eight GPUs, a small fraction of the training costs of the\nbest models from the literature.\n1 Introduction'

In [72]:
import os
from sentence_transformers import SentenceTransformer

# Directory used to cache downloaded model files.
cache_dir = "C:/Users/Bantu Sagar Kumar/transformers_cache"
# NOTE(review): this variable is set after sentence_transformers was imported;
# if the library reads it at import time it has no effect here — confirm.
# The explicit cache_folder argument below is what this cell relies on.
os.environ["TRANSFORMERS_CACHE"] = cache_dir

# exist_ok=True creates the directory when it is missing and is a no-op when it
# already exists, avoiding the check-then-create race of os.path.exists().
os.makedirs(cache_dir, exist_ok=True)

# Name of the embedding model to load (must exist on Hugging Face).
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'
#embedding_model = 'BAAI/bge-large-en'


# Load the embeddings model, storing its files under cache_dir
embeddings = SentenceTransformer(embedding_model, cache_folder=cache_dir)
print("Embeddings model loaded successfully")


Embeddings model loaded successfully


In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Candidate embedding model names; only the first entry is used below.
embedding_models = ['BAAI/bge-large-en']

# Build the HuggingFaceEmbeddings object for the selected model.
embedding = HuggingFaceEmbeddings(model_name=embedding_models[0])

In [91]:
# Re-create the embeddings object for the same model, this time with an explicit
# cache_folder; this rebinds `embedding`, replacing the instance built earlier.
embedding = HuggingFaceEmbeddings(model_name=embedding_models[0], cache_folder='./cache')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [92]:
# Display the repr of the embeddings object to check its configuration.
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='BAAI/bge-large-en', cache_folder='./cache', model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [94]:
from langchain_qdrant import Qdrant

def index_documents_and_retrieve(docs, embeddings):
    """Index the chunks in an in-memory Qdrant collection and return a retriever.

    Args:
        docs: Documents (chunks) to index, as produced by the LangChain loader.
        embeddings: Embeddings model object handed to ``Qdrant.from_documents``
            (this notebook passes a HuggingFaceEmbeddings instance).

    Returns:
        The retriever obtained by calling ``as_retriever()`` on the Qdrant store.
    """
    # Local mode with in-memory storage only, under a fixed collection name.
    store_options = {
        "location": ":memory:",
        "collection_name": "my_documents",
    }

    vector_store = Qdrant.from_documents(docs, embeddings, **store_options)
    return vector_store.as_retriever()

# Index the chunks with the HuggingFaceEmbeddings object and keep the retriever for the RAG chain.
retriever = index_documents_and_retrieve(docs, embedding)

In [96]:
from langchain_community.chat_models import ChatOllama

# Model tag passed to ChatOllama.
model_id = "llama3:instruct"

# Load the Llama-3 model using the Ollama
# NOTE(review): presumably requires a running Ollama server with this model pulled — confirm.
llm = ChatOllama(model=model_id)

In [97]:
from langchain_core.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute
            (the retriever's output).

    Returns:
        str: The concatenated text; an empty string when ``docs`` is empty.
    """
    return "\n\n".join(doc.page_content for doc in docs)


def build_rag_chain(llm, retriever):
    """Build the RAG chain from an LLM and a retriever.

    The chain performs these steps when invoked with a question string:
    1. Retrieve documents for the question and flatten them to plain text.
    2. Fill the prompt template with that text and the question.
    3. Send the prompt to the LLM.
    4. Parse the LLM output with StrOutputParser.

    Args:
        llm: LLM model object.
        retriever: Retriever object used to fetch context documents.

    Returns:
        rag_chain: Runnable chain that answers a question from the retrieved context.
    """

    template = """
        Answer the question based only on the following context:
        
        {context}
        
        Question: {question}
        """

    prompt = PromptTemplate(
        template=template,
        input_variables=["context", "question"],
    )

    rag_chain = (
        {
            # Pipe the retrieved documents through format_docs so the prompt
            # receives their text rather than the repr of a list of Document
            # objects (which would also leak metadata into the context).
            "context": retriever | format_docs,
            # The raw input string is forwarded unchanged as the question.
            "question": RunnablePassthrough(),
        }
        | prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain

# Assemble the chain from the Ollama chat model and the Qdrant retriever.
rag_chain = build_rag_chain(llm, retriever)

In [None]:
# Run the chain on a sample question; the input string is used both for retrieval and as the prompt's question.
rag_chain.invoke('What is this document about?')