In [2]:
!pip install -qU \
  transformers==4.40.0 \
  langchain==0.1.16 \
  sentence-transformers==2.6.0 \
  faiss-cpu==1.8.0 \
  accelerate==0.29.3 \
  bitsandbytes==0.43.0 \
  datasets==2.19.0 \
  pypdf

In [3]:
import torch

# Report whether a CUDA device is visible and, if so, the name of device 0.
has_cuda = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_cuda else 'None'

print(f"CUDA available: {has_cuda}")
print(f"GPU: {gpu_name}")

CUDA available: True
GPU: Tesla T4


In [4]:
!pip install --upgrade --no-cache-dir "bitsandbytes>=0.43.2"

Collecting bitsandbytes>=0.43.2
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
  Attempting uninstall: bitsandbytes
    Found existing installation: bitsandbytes 0.43.0
    Uninstalling bitsandbytes-0.43.0:
      Successfully uninstalled bitsandbytes-0.43.0
Successfully installed bitsandbytes-0.46.0


In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [6]:
# Source document and chunking parameters, gathered in one place for tuning.
PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

# Load the PDF referenced by URL into LangChain documents.
loader = PyPDFLoader(PDF_URL)
documents = loader.load()

# Break the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = text_splitter.split_documents(documents)


In [7]:
# 4-bit NF4 quantization settings used when loading the language model.
#
# bfloat16 is only natively supported on GPUs with CUDA compute capability
# >= 8.0. On older cards (e.g. the Tesla T4 this notebook reports, which is
# 7.5) float16 is the appropriate compute dtype, so pick it there and keep
# bfloat16 everywhere else.
_pre_ampere_gpu = (
    torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] < 8
)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16 if _pre_ampere_gpu else torch.bfloat16,
)

In [10]:
!pip install --upgrade --no-cache-dir bitsandbytes accelerate

Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.7.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.1/362.1 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.29.3
    Uninstalling accelerate-0.29.3:
      Successfully uninstalled accelerate-0.29.3
Successfully installed accelerate-1.7.0


In [8]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Tokenizer and model weights are both fetched from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The model is loaded with the quantization settings from `bnb_config`, and
# device placement is delegated to `device_map="auto"`.
model_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [16]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
#
# Decoding is explicitly greedy (`do_sample=False`). The previous
# `temperature=0.1` had no effect: transformers only applies `temperature`
# in sampling mode and otherwise just warns that the flag is unused.
# To sample instead, set `do_sample=True` together with a `temperature`.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,       # upper bound on generated tokens per call
    do_sample=False,          # deterministic, greedy decoding
    repetition_penalty=1.15,
)

# Expose the pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=pipe)

In [10]:
# Sentence-embedding model used to vectorise the document chunks.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Build the FAISS index from the chunks in `splits`.
vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
# Number of chunks requested from the vector store for each query.
TOP_K = 3

retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

# "stuff" chain over the retriever; source documents are returned with the
# answer so they can be listed next to it.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [20]:
import logging
import warnings

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Raise the threshold of the transformers loggers so only errors are shown.
for _logger_name in (
    "transformers.generation.configuration_utils",
    "transformers",
):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)

In [26]:
import re


def extract_answer(raw_output: str) -> str:
    """Return only the answer portion of the chain's raw text output.

    If the text contains a ``Helpful Answer:`` marker (case-insensitive),
    everything after the first marker is returned, stripped of surrounding
    whitespace. Otherwise the last non-empty paragraph (blocks separated by
    blank lines) is returned, so multi-line answers are kept intact instead
    of being cut down to their final line. Empty input yields ``""``.
    """
    marker = re.search(r"Helpful Answer:\s*(.*)", raw_output, re.DOTALL | re.IGNORECASE)
    if marker:
        return marker.group(1).strip()

    # Fallback: no marker found, so keep the last paragraph of the output.
    paragraphs = [part.strip() for part in re.split(r"\n\s*\n", raw_output) if part.strip()]
    return paragraphs[-1] if paragraphs else ""


# Ask your question here
query = "What is retrieval-augmented generation?"

response = qa_chain.invoke({"query": query})

raw_output = response["result"]
answer = extract_answer(raw_output)

print(answer)
# List where each retrieved chunk came from; missing metadata prints as '?'.
for doc in response["source_documents"]:
    print(f"{doc.metadata.get('source', '?')} (page {doc.metadata.get('page', '?')})")


Retrieval-augmented generation refers to a method used by large language models where they first retrieve relevant information from external sources using a retrieval system, and then generate responses based on this combined data. This approach can help improve the accuracy and relevance of the generated responses compared to solely relying on the model's internal knowledge.
https://arxiv.org/pdf/2303.08774.pdf (page 55)
https://arxiv.org/pdf/2303.08774.pdf (page 75)
https://arxiv.org/pdf/2303.08774.pdf (page 71)
