## Import library

In [None]:
import os
from dotenv import load_dotenv,find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import ArxivLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS,Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint


In [37]:
_ = load_dotenv(find_dotenv())
# current_dir = os.path.dirname(os.path.abspath(__file__))
CURRENT_DIR = os.getcwd()
print(f"Current directory : {CURRENT_DIR}")
DB_DIR = os.path.join(CURRENT_DIR, "db")

STORE_NAME = "faiss_store"
FILES_PATH = os.path.join(CURRENT_DIR,'docs')
PERSIST_DIR = os.path.join(DB_DIR,STORE_NAME)

# Choose a small HF model for quick embedding
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

Current directory : /home/sudhir/DataScience/genai-2025/RAG_research_assistant


In [43]:
PERSIST_DIR.split('/')[-1]

'faiss_store'

In [44]:
os.path.basename(PERSIST_DIR)

'faiss_store'

In [7]:
def load_document(files_path):

    print("\n--- Load Documents ---")
    if not os.path.exists(files_path):
        raise FileNotFoundError(
            f"The directory {files_path} does not exist"
        )
    
    # List all pdf files in the directory
    pdf_files = [f for f in os.listdir(files_path) if f.endswith(".pdf")]
    print(f"List of pdf files: {pdf_files}")

    # Read the text content from each file and store it with metadata
    documents = []
    for pdf_file in pdf_files:
        file_path = os.path.join(files_path, pdf_file)
        # load docs
        loader = PyPDFLoader(file_path)
        docs = loader.load()
        for doc in docs:
            # Add metadata to each document indicating its source
            doc.metadata = {"source": pdf_file}
            documents.append(doc)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    docs = splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")
    return docs

In [8]:
docs = load_document(FILES_PATH)


--- Load Documents ---
List of pdf files: ['GPT3-2005.14165v4.pdf', 'AttentionAllYouNeed-1706.03762v7.pdf']

--- Document Chunks Information ---
Number of document chunks: 369


## Vector Store

In [20]:
def get_embeddings(model_name):
    """Return HuggingFace embeddings instance."""
    return HuggingFaceEmbeddings(model_name=model_name)

def load_or_create_faiss(documents,store_name,db_dir,model_name):
    """Create a new FAISS store or update an existing one."""
    
    embeddings = get_embeddings(model_name)

    PERSIST_DIR = os.path.join(db_dir,store_name)

    if os.path.exists(PERSIST_DIR):
        print("Existing FAISS DB found. Loading...")
        vectorstore = FAISS.load_local(
            PERSIST_DIR,
            embeddings,
            allow_dangerous_deserialization=True
        )
        print("Adding new documents to existing DB...")
        vectorstore.add_documents(documents)
        vectorstore.save_local(PERSIST_DIR)

    else:
        print("Creating new FAISS DB...")
        vectorstore = FAISS.from_documents(documents, embeddings)
        vectorstore.save_local(PERSIST_DIR)

    print(" Vector store is ready!")
    return vectorstore

In [None]:
vectorstore = load_or_create_faiss(docs,STORE_NAME,DB_DIR,EMBED_MODEL)

Existing FAISS DB found. Loading...
Adding new documents to existing DB...
 Vector store is ready!


'/home/sudhir/DataScience/genai-2025/RAG_research_assistant/db'

In [None]:
def load_faiss():
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    if os.path.exists(PERSIST_DIR):
        return FAISS.load_local(
            PERSIST_DIR,
            embeddings,
            allow_dangerous_deserialization=True
        )
    return None

def file_exists_in_db(vectorstore, filename):
    """Check metadata in DB to see if filename already exists."""
    if vectorstore is None:
        return False

    # metadata is stored inside docstore
    for _id, doc in vectorstore.docstore._dict.items():
        if doc.metadata.get("source") == filename:
            return True

    return False

In [33]:
def vectorstore_retriver(store_name,db_dir,model_name):
    """Get vector store retriver """
    embeddings = get_embeddings(model_name)

    PERSIST_DIR = os.path.join(db_dir,store_name)

    if os.path.exists(PERSIST_DIR):
        vectorstore = FAISS.load_local(
            PERSIST_DIR,
            embeddings,
            allow_dangerous_deserialization=True
        )
        return vectorstore
    else:
        raise FileNotFoundError(f" The vector store {store_name}")

In [34]:
db = vectorstore_retriver(STORE_NAME,DB_DIR,EMBED_MODEL)

In [11]:
vectorstore.similarity_search_with_score('what is rag')

[(Document(id='765026b2-2213-4331-a97b-c88410f482f5', metadata={'source': 'GPT3-2005.14165v4.pdf'}, page_content='Winogrande dev acc 13 77.7 1267 - 0 77.7 1267 100% 0%\nTable C.1: Overlap statistics for all datasets sorted from dirtiest to cleanest. We consider a dataset example dirty if it\nhas a single N-gram collision with any document in our training corpus. “Relative Difference Clean vs All” shows the\npercent change in performance between only the clean examples vs all the examples in the benchmark. “Count” shows\nthe number of examples. “Clean percentage” is the percent of examples that are clean vs total. For “Acc/F1/BLEU” we\nuse the metric speciﬁed in “Metric”. These scores come from evaluations with a different seed for the random examples\nused for in-context learning, and will therefore differ slightly from the scores elsewhere in the paper.\n45'),
  1.5261037),
 (Document(id='80d279d9-d981-4682-81fc-74720132ad66', metadata={'source': 'GPT3-2005.14165v4.pdf'}, page_content

## LLM

In [17]:
# llm
llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    task="text-generation",
    max_new_tokens=512,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.03,
    do_sample=False,
    provider="auto",  # let Hugging Face choose the best provider for you
)

chat_model = ChatHuggingFace(llm=llm)

In [None]:
chat_model.invoke()

In [2]:
# ...existing code...
# from langchain_core.retrievers import RetrievalQA
from langchain.chains import RetrievalQA
# ...existing code...

ModuleNotFoundError: No module named 'langchain_core.memory'

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Initialize embeddings
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Create FAISS vector store
db = FAISS.from_documents(documents, embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Function to create and persist vector store
def create_vector_store(docs, embeddings, store_name,db_dir):
    persistent_directory = os.path.join(db_dir,store_name)
    if not os.path.exists(persistent_directory):
        print(f"Creating vector store: {store_name}")
        Chroma.from_documents(docs,embeddings,persist_directory=persistent_directory)
        print(f"Finished creating vector store: {store_name}")
    else:
        print(f"Appending document to vector store: {store_name}")
        Chroma.add_documents(docs,embeddings)
        print(f"Finished creating vector store: {store_name}")


langchain_community.vectorstores.faiss.FAISS

In [2]:
base_docs = ArxivLoader(query="Retrieval Augmented Generation", load_max_docs=5).load()
len(base_docs)

3

In [10]:
docs1 = base_docs[0]

In [None]:
from langchain.prompts import PromptTemplate,ChatPromptTemplate

template = """ You are a research assistant. 

### Context:
{context}
### Question:
{question}
Use the following pieces of context to answer the question at the end."""
prompt = PromptTemplate(
    input_variables=["context", "question"], template=template
)
print(prompt.format(context=docs1.page_content, question="What is RAG?"))

 You are a research assistant. 

### Context:
arXiv:2506.06962v3  [cs.CV]  14 Jun 2025
AR-RAG: Autoregressive Retrieval Augmentation for
Image Generation
Jingyuan Qi* 1
Zhiyang Xu* 1
Qifan Wang2
Lifu Huang3
1Virginia Tech
2Meta
3 UC Davis
jingyq1@vt.edu
(a) Vanilla Image Generation
Prompt
(c) Patch-based Autoregressive Retrieval Augmentation (Ours)
...
Augmentation
Generation
Prompt
Generated Image
Generated Image
(b) Image-Based Retrieval Augmentation
Prompt
Generated Image
Retrieved Images
...
Query
Key
Query
Key
Query
Value
Key
Retrieval
Value
Value
?
?
Next Image Patch
?
Figure 1: Comparison between Autoregressive Retrieval Augmentation (AR-RAG) for image
generation in (c) and existing image generation paradigms in (a) (b). In AR-RAG, image patches in
red boxes denote retrieval queries and keys, image patches in blue boxes are retrieved values, and
gray boxes with the question mark are next image patches to be predicted. (Caption: A white cat is
playing basketball on the court.)
Ab