<a href="https://colab.research.google.com/github/shivam110601/llama-2-rag/blob/main/RAG_with_Llama_2_GGML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Libraries and Modules

In [3]:
!pip install langchain transformers torch sentence_transformers chromadb bs4 unstructured langchain-community langchain-huggingface gradio ctransformers



## Setup

In [4]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.llms import CTransformers
from langchain_core.prompts import ChatPromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel
import torch
import gradio as gr

## Initialize Embedding Model, Vector Store, Document loader & splitter and LLM

In [5]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [14]:
# Load documents
def load_documents(source):
    loader = PyPDFLoader(source)
    documents = loader.load()
    print(f"Loaded {len(documents)} documents from {source}")
    return documents

# Split documents
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    splits = text_splitter.split_documents(documents)
    print(f"Split into {len(splits)} chunks")
    return splits

In [7]:
def create_vector_store(splits):
    vector_store = Chroma.from_documents(splits, embeddings)
    print(f"Created vector store with {vector_store._collection.count()} embeddings")
    return vector_store

In [8]:
# Initialize GGML model
llm = CTransformers(
    model = "TheBloke/Llama-2-7B-Chat-GGML",
    model_file = "llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama"
)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

## Prompt Design for RAG

In [9]:
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Don't make up any answer that is not in the "
    "context. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

## Retriever, Chain

In [35]:
def create_qa_chain(vector_store):
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    print("Created retriever")

    question_answer_chain = create_stuff_documents_chain(llm, prompt)
    print("Created question-answer chain")
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)
    print("Created retrieval chain")

    return rag_chain

In [15]:
document = load_documents("/content/8th mid sem REPORT one.pdf")
splits = split_documents(document)
vector_store = create_vector_store(splits)
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 6})
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

response = rag_chain.invoke({"input": "which deep learning models are used here?"})
print(response)


Loaded 28 documents from /content/8th mid sem REPORT one.pdf
Split into 50 chunks
Created vector store with 216 embeddings
{'input': 'which deep learning models are used here?', 'context': [Document(metadata={'page': 11, 'source': '/tmp/gradio/50d72c2827a9279baaf0499cdb2559e0616547eacc259d60c09debb8fa3b6286/8th mid sem REPORT one.pdf'}, page_content='CNN; implementation of RNN and traditional ML algorithms were more in \ncomparison.  \n• Majority of existing studies focused on binary classification.  \n• This project aims to explore the use of different CNN architectures to come \nup with a multi -class classification model.'), Document(metadata={'page': 11, 'source': '/content/8th mid sem REPORT one.pdf'}, page_content='CNN; implementation of RNN and traditional ML algorithms were more in \ncomparison.  \n• Majority of existing studies focused on binary classification.  \n• This project aims to explore the use of different CNN architectures to come \nup with a multi -class classificat

In [1]:
def handle_input(doc_file, user_query):
    if doc_file:
        document = load_documents(doc_file)
    else:
        return "Please provide a document or URL.", None

    if not document:
        return "Document could not be loaded.", None

    splits = split_documents(document)
    print("Split documents")

    vector_store = create_vector_store(splits)
    print("Created vector store")

    # retriever qa
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    print("Created retriever")

    question_answer_chain = create_stuff_documents_chain(llm, prompt)
    print("Created question-answer chain")

    rag_chain = create_retrieval_chain(retriever, question_answer_chain)
    print("Created retrieval chain")

    # Get the answer from the RAG model
    response = rag_chain.invoke({"input": user_query})
    print("Got response from RAG model")

    result = response['answer']
    docs = response["context"]

    return result, docs

In [10]:
# Define Gradio inputs and outputs
with gr.Blocks() as demo:
    gr.Markdown("# LLaMA 2 7B GGML RAG Application")

    doc_file = gr.File(label="Upload Document (PDF)")

    user_query = gr.Textbox(label="Your Question")

    output = gr.Textbox(label="Response")
    docs = gr.Textbox(label="Retrieved Documents")
    submit_button = gr.Button("Submit")


    # Set up Gradio function interaction
    submit_button.click(fn=handle_input, inputs=[doc_file, user_query], outputs=[output, docs])


In [12]:
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://6267425cab55249636.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Loaded 28 documents from /tmp/gradio/50d72c2827a9279baaf0499cdb2559e0616547eacc259d60c09debb8fa3b6286/8th mid sem REPORT one.pdf
Split into 83 chunks
Split documents
Created vector store with 83 embeddings
Created vector store
Created retriever
Created question-answer chain
Created retrieval chain
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://6267425cab55249636.gradio.live




In [25]:
demo.close()

Closing server running on port: 7860


In [11]:
gr.close_all()

In [None]:
# Embeddings and vectorstore setup
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def create_vector_store(texts):
    vector_store = FAISS.from_documents(texts, embeddings)
    return vector_store

def create_qa_chain(vector_store):
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    qa_chain = RetrievalQA.from_chain_type(
        llm=model,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    return qa_chain

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]