# RAG with Llama 2 and LangChain
Retrieval-Augmented Generation (RAG) is a technique that combines a retriever and a generative language model to deliver accurate responses. It involves retrieving relevant information from a large corpus and then generating contextually appropriate responses to queries. Here we use the quantized version of the Llama 2 13B LLM with LangChain to perform generative QA with RAG. This notebook has been tested in Google Colab with a T4 GPU. Please change the runtime type to T4 GPU before running the notebook.

## Install Packages

In [1]:
# Model runtime: pinned transformers/optimum, plus the auto-gptq wheel pulled
# from the cu118 wheel index, and LangChain.
!pip install transformers==4.37.2 optimum==1.12.0 --quiet
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ --quiet
!pip install langchain==0.1.9 --quiet
# !pip install chromadb
# Embeddings, PDF parsing helpers and the FAISS vector index.
!pip install sentence_transformers==2.4.0 --quiet
!pip install unstructured --quiet
!pip install pdf2image --quiet
!pip install pdfminer.six==20221105 --quiet
!pip install unstructured-inference --quiet
!pip install faiss-gpu==1.7.2 --quiet
!pip install pikepdf==8.13.0 --quiet
!pip install pypdf==4.0.2 --quiet
!pip install pillow_heif==0.15.0 --quiet
# PyPDF2, streamlit and python-dotenv are imported by app_final.py.
# NOTE(review): the four installs below are unpinned and not --quiet, unlike
# the ones above; llama-cpp-python is not imported by any cell shown in this
# notebook -- confirm it is still needed.
!pip install PyPDF2
!pip install streamlit
!pip install python-dotenv
!pip install llama-cpp-python

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.6/380.6 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [2]:
# from langchain.document_loaders import UnstructuredPDFLoader
# from langchain.vectorstores.utils import filter_complex_metadata # 'filter_complex_metadata' removes complex metadata that are not in str, int, float or bool format

# pdf_loader = UnstructuredPDFLoader("/content/drive/MyDrive/Colab Notebooks/ISLP_website.pdf")
# pdf_doc = pdf_loader.load()
# updated_pdf_doc = filter_complex_metadata(pdf_doc)


# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
# chunked_pdf_doc = text_splitter.split_documents(updated_pdf_doc)
# len(chunked_pdf_doc)

# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings()

# from langchain.vectorstores import FAISS
# db_pdf = FAISS.from_documents(chunked_pdf_doc, embeddings)

# db_pdf.save_local("faiss_index")
# !cp -r "/content/faiss_index" "/content/drive/MyDrive/faiss_index"

In [3]:
%%writefile app_final.py
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
import os
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores.utils import filter_complex_metadata
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate


class Document:
    """Minimal container pairing a text chunk with its metadata.

    ``page_content`` holds the text passed to the constructor. ``metadata``
    holds the supplied object, or a fresh empty dict when none is given.
    """

    def __init__(self, text, metadata=None):
        # Create the fallback dict per call so instances never share one.
        if metadata is None:
            metadata = {}
        self.page_content = text
        self.metadata = metadata

def get_pdf_text(pdf):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf: The source handed straight to ``PdfReader``.

    Returns:
        One string with each page's extracted text appended in page order.
        A page whose ``extract_text()`` result is falsy contributes an
        empty string.
    """
    reader = PdfReader(pdf)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts)

def get_text_chunks(text):
    """Split text into overlapping chunks using ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, a chunk size of
    1000 and an overlap of 200, both measured with ``len``.

    Args:
        text: The string to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    chunker = CharacterTextSplitter(**splitter_options)
    chunks = chunker.split_text(text)
    return chunks

def load_or_create_vectorstore(vectorstore_path="faiss_index"):
    """Load a previously saved FAISS index and return it as a vector store.

    Despite the name, nothing is created here; an index must already have
    been saved (for example with ``FAISS.save_local``).

    Args:
        vectorstore_path: Directory containing the saved index. It is used
            when it exists on disk; otherwise the copy stored on Google
            Drive at ``/content/drive/MyDrive/faiss_index`` is loaded, which
            is what this function always loaded before.

    Returns:
        The FAISS vector store loaded with default ``HuggingFaceEmbeddings``.
    """
    drive_index_path = "/content/drive/MyDrive/faiss_index"
    if vectorstore_path and os.path.isdir(vectorstore_path):
        index_path = vectorstore_path
    else:
        index_path = drive_index_path

    embeddings = HuggingFaceEmbeddings()
    # NOTE: the flag opts in to deserializing the stored index; only load
    # index directories that come from a trusted source.
    return FAISS.load_local(
        index_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )

def get_conversation_chain(vectorstore):
    """Build the conversational retrieval chain around a GPTQ Llama 2 model.

    Loads the model and tokenizer, wraps them in a ``text-generation``
    pipeline, and wires that pipeline, a buffer memory and the vector
    store's retriever into a ``ConversationalRetrievalChain``.

    Args:
        vectorstore: Object exposing ``as_retriever()``; its retriever
            supplies the context documents for each question.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    # NOTE(review): the notebook introduction says the 13B model is used on a
    # T4 GPU, but this loads the 70B checkpoint -- confirm which is intended.
    model_name = "TheBloke/Llama-2-70B-Chat-GPTQ"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    gen_cfg = GenerationConfig.from_pretrained(model_name)
    gen_cfg.max_new_tokens = 512
    # Sampling stays enabled with a near-zero temperature instead of 0.0.
    gen_cfg.temperature = 0.0000001
    gen_cfg.return_full_text = True
    gen_cfg.do_sample = True
    gen_cfg.repetition_penalty = 1.11

    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        generation_config=gen_cfg,
    )
    llm = HuggingFacePipeline(pipeline=pipe)

    # Llama 2 chat format: system instructions inside <<SYS>>, then the
    # retrieved context and the user's question inside one [INST] block.
    template = """
<s>[INST] <<SYS>>
Use the following context to Answer the question at the end. Do not use any other information. If you can't find the relevant information in the context, just say you don't have enough information to answer the question. Don't try to make up an answer.

<</SYS>>

{context}

Question: {question} [/INST]
    """

    # The declared variables must match the placeholders used in the
    # template ({context} and {question}); the previous ["text"] did not.
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=template,
    )

    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        combine_docs_chain_kwargs={"prompt": prompt},
    )
    return conversation_chain

import base64

def get_base64_encoded_image(image_path):
    """Read a file and return its contents as a Base64 string.

    Args:
        image_path: Path of the file to open in binary mode.

    Returns:
        The Base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")

# Encode both avatar images once, at import time, so they can be inlined
# into the chat HTML as data URIs.
# NOTE: these two calls read from the mounted Drive path as soon as the
# module is imported, so Drive must be mounted before the app starts.
chatbot_icon_base64 = get_base64_encoded_image("/content/drive/MyDrive/Chatbot icon.png")
avatar_cat_base64 = get_base64_encoded_image("/content/drive/MyDrive/Avatar Cat.png")

# handle_user_input substitutes these values for the {CHATBOT_ICON} and
# {AVATAR_CAT} placeholders of the HTML templates when rendering a message.

def handle_user_input(user_question, conversation_chain):
    """Send a question to the chain and render the whole chat history.

    The chain's returned ``chat_history`` replaces the one stored in
    ``st.session_state``, then every message is rendered through the HTML
    templates. Messages at even positions use the user template and those
    at odd positions use the bot template.

    Args:
        user_question: The text the user typed.
        conversation_chain: Callable chain taking ``{'question': ...}`` and
            returning a mapping that contains ``'chat_history'``.
    """
    import html  # local import: only needed for escaping below

    response = conversation_chain({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    avatar_uri = f"data:image/png;base64,{avatar_cat_base64}"
    icon_uri = f"data:image/png;base64,{chatbot_icon_base64}"

    for i, message in enumerate(st.session_state.chat_history):
        # The templates are rendered with unsafe_allow_html=True, so user and
        # model text must be escaped to stop it being interpreted as markup.
        safe_text = html.escape(message.content)
        if i % 2 == 0:
            rendered = user_template.replace("{AVATAR_CAT}", avatar_uri).replace("{{MSG}}", safe_text)
        else:
            rendered = bot_template.replace("{CHATBOT_ICON}", icon_uri).replace("{{MSG}}", safe_text)
        st.markdown(rendered, unsafe_allow_html=True)

def main():
    """Streamlit entry point: set up the page and answer the typed question.

    Configures the page, injects the chat CSS, makes sure ``chat_history``
    and ``conversation`` exist in ``st.session_state``, and forwards a
    non-empty question to ``handle_user_input``.
    """
    load_dotenv()
    st.set_page_config(page_title="CortexML")
    st.markdown(css, unsafe_allow_html=True)
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    st.header("CortexML")
    st.subheader("The only chatbot you need for Machine Learning concepts")
    user_question = st.text_input("Ask a question about Machine Learning concepts", key="user_input")

    if "conversation" not in st.session_state:
        # The vector store is only needed to build the chain, so load it
        # here rather than on every script run; the chain itself is kept in
        # session state and reused afterwards.
        vectorstore_path = "faiss_index"
        vectorstore = load_or_create_vectorstore(vectorstore_path)

        st.info("Initializing conversation chain...")
        st.session_state.conversation = get_conversation_chain(vectorstore)
        st.success("Conversation chain initialized.")

    if user_question:
        handle_user_input(user_question, st.session_state.conversation)


if __name__ == '__main__':
    main()

Writing app_final.py


In [4]:
%%writefile htmlTemplates.py
# Stylesheet for the chat bubbles; app_final.py injects it with
# st.markdown(css, unsafe_allow_html=True).
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex;
}
.chat-message.user {
    background-color: #2b313e;
}
.chat-message.bot {
    background-color: #475063;
}
.chat-message .avatar {
  width: 20%;
}
.chat-message .avatar img {
  max-width: 78px;
  max-height: 78px;
  border-radius: 50%;
  object-fit: cover;
}
.chat-message .message {
  width: 80%;
  padding: 0 1.5rem;
  color: #fff;
}
</style>
'''

# bot_template = '''
# <div class="chat-message bot">
#     <div class="avatar">
#         <img src="Chatbot icon.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
#     </div>
#     <div class="message">{{MSG}}</div>
# </div>
# '''

# user_template = '''
# <div class="chat-message user">
#     <div class="avatar">
#         <img src="Avatar Cat.png">
#     </div>
#     <div class="message">{{MSG}}</div>
# </div>
# '''

# HTML snippets, one per chat message. app_final.py fills them in with
# str.replace: {CHATBOT_ICON} / {AVATAR_CAT} receive a data URI for the
# avatar image, and {{MSG}} (literal double braces) receives the message text.
bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="{CHATBOT_ICON}" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

# Same layout as bot_template, but styled by the .chat-message.user rule and
# without the inline image style.
user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="{AVATAR_CAT}">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

Writing htmlTemplates.py


In [5]:
# Mount Google Drive: app_final.py reads the FAISS index and the two avatar
# images from /content/drive/MyDrive, so this must run before the app starts.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:

# Print this runtime's public IPv4 address.
# NOTE(review): presumably needed to get past the localtunnel landing page
# opened by the next cell -- confirm.
!wget -q -O - ipv4.icanhazip.com

104.155.194.33


In [None]:
# Start the Streamlit app in the background and open a localtunnel to port
# 8501, the port Streamlit reports in its output below.
!streamlit run app_final.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://104.155.194.33:8501[0m
[0m
[K[?25hnpx: installed 22 in 2.741s
your url is: https://tough-cougars-burn.loca.lt

`from langchain_community.embeddings import HuggingFaceEmbeddings`.

To install langchain-community run `pip install -U langchain-community`.

`from langchain_community.vectorstores import FAISS`.

To install langchain-community run `pip install -U langchain-community`.

`from langchain_community.llms import HuggingFacePipeline`.

To install langchain-community run `pip install -U langchain-community`.
2024-03-18 03:54:45.181709: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-1