# RAG + LLM Assessment

In [6]:
!pip install transformers>=4.32.0 optimum>=1.12.0 > null
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ > null
!pip install langchain > null
!pip install chromadb > null
!pip install sentence_transformers > null # ==2.2.2
!pip install unstructured > null
!pip install pdf2image > null
!pip install pdfminer.six > null
!pip install unstructured-pytesseract > null
!pip install unstructured-inference > null
!pip install faiss-gpu > null
!pip install pikepdf > null
!pip install pypdf > null
!pip install accelerate > null
!pip install pillow_heif > null
!pip install -i https://pypi.org/simple/ bitsandbytes > null

In [None]:
import os
os.kill(os.getpid(), 9)

Steps:

1. Choose a domain and collect a suitable dataset of documents (at least 5 documents - PDFs or HTML pages) to serve as the knowledge base for your RAG system. Select one of the following topics:
   * latest scientific papers from arxiv.org,
   * fiction books released,
   * legal documents or,
   * social media posts.

   Make sure that the documents are newer then the training dataset of the applied LLM. (20 points)

2. Create three relevant prompts to the dataset, and one irrelevant prompt. (20 points)

3. Load an LLM with at least 5B parameters. (10 points)

4. Test the LLM with your prompts. The goal should be that without the collected dataset your model is unable to answer the question. If it gives you a good answer, select another question to answer and maybe a different dataset. (10 points)

5. Create a LangChain-based RAG system by setting up a vector database from the documents. (20 points)

6. Provide your three relevant and one irrelevant prompts to your RAG system. For the relevant prompts, your RAG system should return relevant answers, and for the irrelevant prompt, an empty answer. (20 points)

In [1]:
from google.colab import userdata
from google.colab import drive
from huggingface_hub import login
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
from langchain.vectorstores import FAISS
from textwrap import fill
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline, BitsAndBytesConfig

import os

import locale
locale.getpreferredencoding = lambda: "UTF-8"

**Model**

In [18]:
hugging_face_token = userdata.get('HF_TOKEN_R')
login(hugging_face_token, add_to_git_credential = True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [19]:
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(load_in_8bit = True)
model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map = 'auto',
            token = hugging_face_token,
            quantization_config = quantization_config,
            trust_remote_code = True
)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True, token = hugging_face_token)

gen_cfg = GenerationConfig.from_pretrained(model_name)
gen_cfg.max_new_tokens = 512
gen_cfg.temperature = 0.0
gen_cfg.return_full_text = True
gen_cfg.do_sample = True
gen_cfg.repetition_penalty = 1.11

pipe = pipeline (
    task = 'text-generation',
    model = model,
    tokenizer = tokenizer,
    generation_config = gen_cfg
)

llm = HuggingFacePipeline( pipeline = pipe )



config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/19 [00:00<?, ?it/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00010-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]



model-00011-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:
template = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

prompt = PromptTemplate(
    input_variables=["text"],
    template=template,
)

**Testing With Prompts**

In [None]:
# first prompt
prompt_one   = "Did Ambika Mod ever play Emma Morley from One Day?"
result = llm(prompt.format(text = prompt_one))
print(fill(result.strip(), width = 100))

In [None]:
# second prompt
prompt_two   = "In the movie Unfrosted, who are implicated in the assassination of Kennedy?"
result = llm(prompt.format(text = prompt_two))
print(fill(result.strip(), width = 100))

In [None]:
# third prompt
prompt_three  = "What is Baby Reindeer about?"
result = llm(prompt.format(text = prompt_three))
print(fill(result.strip(), width = 100))

In [None]:
# irrelevant prompt
prompt_irrelevant   = "What is artificial intelligence?"
result = llm(prompt.format(text = prompt_irrelevant))
print(fill(result.strip(), width = 100))

**Documents**

In [None]:
drive.mount('/content/drive')
info_folder = '/content/drive/My Drive/new_info_pdf/'
documents = os.listdir(info_folder)

filenames = [info_folder + document for document in documents]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
documents

['unfrosted.pdf',
 'baby_reindeer.pdf',
 'shogun.pdf',
 'one_day.pdf',
 'damsel.pdf']

**LangChain-based RAG System**

In [None]:
# load documents and split into chunks
loaders = [UnstructuredPDFLoader(filename) for filename in filenames]

chunked_pdf_doc = []

for loader in loaders:
    print("Loading raw document..." + loader.file_path)
    pdf_doc = loader.load()
    updated_pdf_doc = filter_complex_metadata(pdf_doc)
    print("Splitting text...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=256)
    documents = text_splitter.split_documents(updated_pdf_doc)
    chunked_pdf_doc.extend(documents)

len(chunked_pdf_doc)

In [None]:
# vectorize the chunks, and store in a FAISS database
embeddings = HuggingFaceEmbeddings()
db_pdf = FAISS.from_documents(chunked_pdf_doc, embeddings)

In [None]:
# some prompt engineering = our new template
prompt_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Use the following context to answer the question at the end. Do not use any other information. If you can't find the relevant information in the context, just return an empty answer. Don't try to make up an answer.

{context}<|eot_id|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

In [None]:
# new prompt
prompt = PromptTemplate (
    template = prompt_template,
    input_variables = ["context", "question"]
)

# retrievalQA function
Chain_web = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = db_pdf.as_retriever (
        search_type = "similarity_score_threshold",
        search_kwargs = {'k': 10, 'score_threshold': 0.1}
    ),
    chain_type_kwargs={"prompt": prompt},
)

**Testing the RAG System**

In [None]:
# first prompt
result = Chain_web.invoke(prompt_one)
print(fill(result['result'].strip(), width = 100))

In [None]:
# second prompt
result = Chain_web.invoke(prompt_two)
print(fill(result['result'].strip(), width = 100))

In [None]:
# third prompt
result = Chain_web.invoke(prompt_three)
print(fill(result['result'].strip(), width = 100))

In [None]:
# irrelevant prompt
result = Chain_web.invoke(prompt_irrelevant)
print(fill(result['result'].strip(), width = 100))