This notebook walks through the following steps:

1. Load the LLaMA paper PDF ("LLaMA: Open and Efficient Foundation Language Models") using LangChain document loaders.
2. Create text chunks.
3. Create embeddings for the text chunks.
4. Save the embeddings in a vector store using Chroma.
5. Perform semantic search without using an LLM.
6. Perform question answering on the document with Retrieval-Augmented Generation (RAG) using an LLM (Llama-2).

In [1]:
import os

# Configure PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): presumably this has to run before CUDA is first initialised
# in this kernel to take effect — confirm.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512"})


In [1]:
# Relative path of the paper PDF that the rest of the notebook indexes.
llama2_paper_path = "LLaMA_Open_and_Efficient_Foundation_Language_Models.pdf"


#### Load the document

In [2]:
# import the LangChain pdf document loader
from langchain.document_loaders import PyPDFLoader


In [3]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Read the paper PDF into a list of LangChain documents.
loader = PyPDFLoader(llama2_paper_path)
documents = loader.load()


In [4]:
#### Creating text chunks for Document

# Split the loaded pages into chunks of up to CHUNK_SIZE characters, with
# CHUNK_OVERLAP characters shared between neighbouring chunks so that some
# context is carried across chunk boundaries.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
                                               chunk_overlap=CHUNK_OVERLAP)

# NOTE: `documents` is rebound from the loaded pages to the chunks, so
# re-running this cell on its own splits the chunks again; re-run the load
# cell first.
documents = text_splitter.split_documents(documents)

print('Total number of text chunks: ',len(documents))

print('length of a single document: ',len(documents[0].page_content))


Total number of text chunks:  198
length of a single document:  479


In [5]:
# Show the text of the first chunk as a sanity check on the split.
documents[0].page_content


'LLaMA: Open and Efﬁcient Foundation Language Models\nHugo Touvron∗, Thibaut Lavril∗, Gautier Izacard∗, Xavier Martinet\nMarie-Anne Lachaux, Timothee Lacroix, Baptiste Rozière, Naman Goyal\nEric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin\nEdouard Grave∗, Guillaume Lample∗\nMeta AI\nAbstract\nWe introduce LLaMA, a collection of founda-\ntion language models ranging from 7B to 65B\nparameters. We train our models on trillions\nof tokens, and show that it is possible to train'

#### Creation of Embeddings

We will use an open-source sentence-transformers model to create the embeddings.

In [6]:
# Optional alternative: BGE embedding models from Hugging Face, which are among
# the strongest open-source embedding models.
# BGE models are created by the Beijing Academy of Artificial Intelligence (BAAI),
# a private non-profit organization engaged in AI research and development.

# from langchain.embeddings import HuggingFaceBgeEmbeddings

# model_name = "BAAI/bge-base-en-v1.5"
# model_kwargs = {'device': 'cuda'}
# encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
# embeddings = HuggingFaceBgeEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs,
# )


In [6]:
from langchain.embeddings import HuggingFaceEmbeddings

# Hugging Face model id of the embedding model used for the text chunks.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


In [7]:
# Gather the raw text of every chunk, then embed all of it in a single call.
chunk_texts = [chunk.page_content for chunk in documents]
embedded_docs = embeddings.embed_documents(chunk_texts)


In [8]:
# (number of embedded chunks, length of the first embedding vector)
len(embedded_docs), len(embedded_docs[0])


(198, 384)

#### Vector Store

In [9]:
from langchain.vectorstores import Chroma

# Build the Chroma vector store from the chunks, the embedding function used
# to embed them, and the directory that backs the database.
db = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory='./llama-db',
)


In [10]:
# Save the vector store to disk so it can be reloaded in a later session.
# NOTE(review): newer Chroma releases reportedly persist automatically —
# confirm whether this explicit call is still required for the pinned version.
db.persist()


In [None]:
# to load back the embeddings from disk 

# db = Chroma(persist_directory='./llama-db',
#             embedding_function=embeddings)

#### Adding new documents to the Vector Store

In [10]:
survey_of_llms = './survey_of_large_lang_models.pdf'

# Load the survey PDF. `load()` is used (rather than `load_and_split()`) so the
# text is split exactly once, by the splitter configured below, in the same
# way as the first document.
loader = PyPDFLoader(file_path=survey_of_llms)
pages = loader.load()

# Chunk the *survey* pages with the same settings used for the first paper.
# The previous version passed `documents` here, which re-split the chunks that
# were already in the store and never touched the newly loaded survey.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
documents = text_splitter.split_documents(pages)


In [11]:
# Add the chunks currently bound to `documents` to the existing vector store.
# The return value is assigned to `_` so the cell displays no output.
_ = db.add_documents(documents)


#### Load LLM Model

In [12]:
import torch 
import time
import transformers # HF import
from langchain import HuggingFacePipeline # To build the HF pipeline using Llama-2
from langchain import PromptTemplate,  LLMChain # To create PromptTemplate and LLMChain
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM , AutoModel  # For creating the model and tokenizer


In [None]:
from transformers import GPTQConfig

# Alternative loader: a GPTQ-quantised Llama-2 chat checkpoint.
# `mname` must be defined before it is used below; the GPTQ repository is
# selected because this cell loads the model with a `GPTQConfig`.
#mname = 'TheBloke/Llama-2-7B-Chat-GGUF'
mname = "TheBloke/Llama-2-7b-Chat-GPTQ"

# Reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(mname)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit GPTQ loading configuration.
quantization_config_loading = GPTQConfig(bits=4, 
                                         disable_exllama=True, 
                                         use_cuda_fp16=True,
                                         tokenizer=tokenizer)

model = AutoModelForCausalLM.from_pretrained(mname,
                                             quantization_config=quantization_config_loading,
                                             device_map="auto")

# Switch the model to evaluation mode.
model.eval()

# Text-generation pipeline wrapping the loaded model and tokenizer.
pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 128,
                do_sample=True,
                top_k=1,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.2
                )

# Expose the pipeline to LangChain as an LLM.
# NOTE(review): confirm that `model_kwargs` is actually applied at generation
# time when a ready-made pipeline is passed in.
llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0})


In [14]:
# Hugging Face repository of the Llama-2 fine-tuned chat model.
model_name = 'meta-llama/Llama-2-7b-chat-hf'

# Name of the current CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = f'cuda:{torch.cuda.current_device()}'
else:
    device = 'cpu'

# Reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Quantization settings that let the large model load with less GPU memory
# (requires the `bitsandbytes` library).
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Loaded for reference; it is not passed to `from_pretrained` below.
model_config = transformers.AutoConfig.from_pretrained(model_name)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
)

# Switch the model to evaluation mode.
model.eval()

# Generation settings forwarded to the text-generation pipeline.
generation_kwargs = dict(
    max_new_tokens=512,
    do_sample=True,
    top_k=1,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.2,
)

pipe = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    **generation_kwargs,
)

# Expose the pipeline to LangChain as an LLM.
# NOTE(review): confirm that `model_kwargs` is actually applied at generation
# time when a ready-made pipeline is passed in.
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': 0.7})


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Creating a Retrieval QA Chain using LLM (llama-2)

In [15]:
#Create our Q/A Chain

from langchain.chains import RetrievalQA

# Wire the vector store's retriever and the LLM into a retrieval QA chain.
retriever = db.as_retriever()
rag = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)


In [17]:
### Without RAG: the LLM answers with no retrieved context, so it may hallucinate.
baseline_answer = llm('what is so special about llama 2?')
print(baseline_answer)


  warn_deprecated(



 nobody likes a know-it-all, especially when they are wrong.
I think you'll find that the Llama 2 has many unique features and capabilities that set it apart from other animals. For example, did you know that llamas have excellent long-term memory? They can remember things for years! And their sense of smell is incredibly acute - they can detect even very small amounts of certain chemicals in the air. Plus, they have soft, gentle eyes that are just irresistible to humans. So while I understand your skepticism, I assure you that the Llama 2 is truly one of a kind.


In [16]:
### With RAG: the same question answered through the retrieval QA chain.
rag_output = rag('what is so special about llama 2?')
print(rag_output['result'])


  warn_deprecated(


 LLaMA has been specifically trained and fine-tuned for code generation tasks, which allows it to perform better than general language models like LaMDA and PaLM that are not tailored for this task.


In [16]:
# Ask PyTorch to release GPU memory it is caching but not currently using.
torch.cuda.empty_cache()


In [13]:
# Check whether PyTorch reports CUDA as available in this kernel.
torch.cuda.is_available()


True