# RAG Implementation - Intellihack Scope 03

This notebook implements a Retrieval-Augmented Generation (RAG) system that combines our fine-tuned model with a document retrieval system to provide more accurate and up-to-date answers.

In [None]:
# Import necessary libraries
import os
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm

# Import RAG components
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.schema.document import Document
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_core.runnables import RunnableParallel

# Import Hugging Face components
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline

# For visualizing embeddings (optional)
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Project directories, all relative to the notebook's working directory
models_dir = Path('..') / 'models'
data_dir = Path('..') / 'data' / 'raw'
rag_dir = models_dir / 'rag'

# Make sure the RAG artifact directory exists before anything is saved into it
rag_dir.mkdir(parents=True, exist_ok=True)

## Document Processing for RAG

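We load every Markdown (`.md`) and plain-text (`.txt`) file under `../data/raw`, then split the documents into overlapping chunks so the retriever can match questions against focused passages.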

In [None]:
# Load documents for RAG
def get_documents():
    """Load every Markdown and plain-text file under ``data_dir`` as a Document.

    Files are discovered recursively (``**/*.md`` and ``**/*.txt``) and read as
    UTF-8. Paths are sorted within each extension so the documents always come
    back in the same order, independent of the order the file system happens
    to list them in.

    Returns:
        list[Document]: one Document per file. ``page_content`` is the full
        file text; ``metadata`` carries ``source`` (path as a string),
        ``filename`` (file name) and ``topic`` (file name without extension).
    """
    # glob() yields entries in file-system order; sorting keeps the resulting
    # chunk order (and therefore the vector index) reproducible across runs.
    md_files = sorted(data_dir.glob('**/*.md'))
    txt_files = sorted(data_dir.glob('**/*.txt'))
    all_files = md_files + txt_files

    documents = []
    for file_path in tqdm(all_files, desc="Loading documents"):
        content = file_path.read_text(encoding='utf-8')
        metadata = {
            "source": str(file_path),
            "filename": file_path.name,
            "topic": file_path.stem
        }
        documents.append(Document(page_content=content, metadata=metadata))

    return documents

# Load documents
documents = get_documents()
print(f"Loaded {len(documents)} documents")

# Fail here with a clear message instead of later, when the vector store is
# built from an empty corpus.
assert documents, f"No .md or .txt files found under {data_dir}"

In [None]:
# Chunking settings: target size of 1000 characters (measured with len) and
# 200 characters of overlap between neighbouring chunks. Separators are tried
# in the listed order, from paragraph breaks down to single characters.
splitter_settings = dict(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""],
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break every loaded document into chunks for retrieval
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} document chunks")

## Create Embeddings and Vector Store

In [None]:
# Embedding model used to vectorize both the chunks and the incoming queries
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Efficient model for embedding

# Encode on the GPU when one is available, otherwise on the CPU
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': embedding_device},
    encode_kwargs={'normalize_embeddings': True},
)

In [None]:
# Build the FAISS index from the chunks, then persist it under rag_dir
faiss_index_path = rag_dir / "faiss_index"

vector_store = FAISS.from_documents(chunks, embeddings)
vector_store.save_local(faiss_index_path)

print(f"Vector store created with {len(chunks)} chunks and saved to {faiss_index_path}")

## Load Fine-tuned Model for RAG

In [None]:
# Load the fine-tuned model
model_path = models_dir / "qwen-3b-ai-research-qa"
base_model_id = "Qwen/Qwen2.5-3B-Instruct"

# float16 halves memory on a GPU. Without CUDA fall back to float32, mirroring
# the CPU fallback already used for the embedding model; half-precision
# inference on CPU is slow and may be unsupported for some ops depending on
# the PyTorch build.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Check if model exists
has_adapter = model_path.exists()
if not has_adapter:
    print(f"Fine-tuned model not found at {model_path}. Using base model directly.")

# The tokenizer and the base weights are needed in both cases, so load them once.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run locally - confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=model_dtype,
    trust_remote_code=True
)

if has_adapter:
    # Apply the fine-tuned (PEFT) weights on top of the base model
    model = PeftModel.from_pretrained(base_model, str(model_path))
    print("Fine-tuned model loaded successfully")
else:
    model = base_model

In [None]:
# Sampling settings applied to every generation made through the pipeline.
# return_full_text=False makes the pipeline return only the newly generated text.
sampling_settings = dict(
    temperature=0.7,
    max_new_tokens=512,
    top_p=0.9,
    repetition_penalty=1.1,
    do_sample=True,
    return_full_text=False,
)

# Hugging Face text-generation pipeline around the loaded model and tokenizer
generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    **sampling_settings,
)

# Expose the pipeline to LangChain as an LLM
llm = HuggingFacePipeline(pipeline=generation_pipeline)

# Similarity-search retriever over the FAISS store
retriever_search_kwargs = {"k": 3}  # Retrieve top 3 most similar documents
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)

## Create RAG Prompt Template

In [None]:
# Define the RAG prompt template.
# The text has two placeholders: {context} receives the retrieved passages and
# {question} receives the user's question. It ends with "Answer:" so that the
# model's generation continues from there. The same string is later written to
# rag_template.txt, so any edit here also changes the saved template.
template = """
You are an AI assistant specialized in answering questions about AI research, distributed systems, and file systems.
You have been specifically trained on topics like the DeepSeek-V3 model, the Fire-Flyer File System (3FS), and AI optimization techniques.

Use the following pieces of retrieved context to answer the question. If you don't know the answer based on the context, say that you don't know.

Context: {context}

Question: {question}

Answer:
"""

# Create prompt from template; the input variables are taken from the
# {placeholders} in the template text
rag_prompt = PromptTemplate.from_template(template)

# Format the retrieved documents function
def format_docs(docs):
    """Concatenate the text of the retrieved documents into one context string.

    Each document's ``page_content`` is kept in its original order, and
    consecutive documents are separated by a blank line.
    """
    page_texts = (retrieved.page_content for retrieved in docs)
    return "\n\n".join(page_texts)

## Create RAG Chain

In [None]:
# First stage: turn the incoming question into the two inputs the prompt needs.
# "context" runs the retriever and joins the hits with format_docs;
# "question" passes the original input through unchanged.
prompt_inputs = RunnableParallel(
    context=retriever | format_docs,
    question=RunnablePassthrough(),
)

# Full chain: build inputs -> fill the prompt -> generate -> plain string
rag_chain = prompt_inputs | rag_prompt | llm | StrOutputParser()

## Test RAG System

In [None]:
# Questions used to exercise the chain end to end
test_questions = [
    "What is DualPipe and how does it improve training efficiency?",
    "Explain the architecture of Fire-Flyer File System (3FS).",
    "How does the DeepSeek-V3 model use Mixture-of-Experts architecture?",
    "What are the advantages of the Chain Replication with Apportioned Queries (CRAQ) in 3FS?"
]

# Horizontal rules used to frame each question/answer pair in the output
heavy_rule = "=" * 80
light_rule = "-" * 80

# Run each question through the RAG chain and print the answer
for question in test_questions:
    print(heavy_rule)
    print(f"Question: {question}")
    print(light_rule)
    print("Thinking...")
    response = rag_chain.invoke(question)
    print(f"Response: {response}")
    print(heavy_rule)
    print()

## Compare RAG with Direct Model Answers

In [None]:
# Function to generate direct model responses
def generate_direct_response(question):
    """Answer ``question`` with the model alone, without any retrieved context.

    The question is wrapped as a single user message and rendered with the
    tokenizer's chat template. Sampling settings match the RAG pipeline
    (temperature, top_p, repetition_penalty, max_new_tokens), so the
    comparison between the two differs only in the retrieved context.

    Args:
        question: the user's question as plain text.

    Returns:
        str: the generated continuation only (prompt tokens are stripped),
        decoded with special tokens removed.
    """
    messages = [{"role": "user", "content": question}]

    # add_generation_prompt=True appends the assistant-turn header to the
    # rendered prompt. Without it the prompt ends right after the user turn and
    # the model is not cued to start an assistant reply.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        inputs=inputs,
        # Single, unpadded sequence: every position is a real token.
        attention_mask=torch.ones_like(inputs),
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True
    )

    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_length = inputs.shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response

# One question answered twice: by the model alone, then through the RAG chain
sample_question = "How does Expert Parallelism Load Balancing work in DeepSeek-V3?"
section_rule = "=" * 80

print(section_rule)
print(f"Question: {sample_question}")
print(section_rule)

# Answer from the model alone
print("Direct Model Response:")
direct_response = generate_direct_response(sample_question)
print(direct_response)
print(section_rule)

# Answer grounded in the retrieved context
print("RAG System Response:")
rag_response = rag_chain.invoke(sample_question)
print(rag_response)
print(section_rule)

## Analyze Retrieved Documents

In [None]:
# Function to inspect retrieved documents
def inspect_retrieved_docs(query, top_k=3):
    """Print a preview of the documents the retriever returns for ``query``.

    Args:
        query: the search text passed to the retriever.
        top_k: maximum number of documents to preview. This only limits what
            is printed; how many documents come back is decided by the
            retriever's own configuration.

    Returns:
        The full list of documents returned by the retriever (not only the
        ``top_k`` that were printed).
    """
    # Use the Runnable interface, like the rest of the notebook, instead of the
    # deprecated get_relevant_documents().
    retrieved_docs = retriever.invoke(query)

    print(f"Retrieved {len(retrieved_docs)} documents for query: '{query}'")
    for i, doc in enumerate(retrieved_docs[:top_k]):
        print(f"\nDocument {i+1}:")
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        print("Content preview (first 300 chars):")
        print("-"*80)
        print(doc.page_content[:300], "...")
        print("-"*80)

    return retrieved_docs

# Inspect what the retriever returns for one DualPipe-specific query
test_query = "Explain how DualPipe reduces pipeline bubbles during training"
retrieved_docs = inspect_retrieved_docs(query=test_query)

## Visualize Document Embeddings (Optional)

In [None]:
# Sample a subset of documents for visualization
max_docs = 100  # Limit to prevent cluttering the visualization
sample_chunks = chunks[:max_docs]

# Get embeddings for the documents (as an array, which is what t-SNE expects)
sample_texts = [doc.page_content for doc in sample_chunks]
sample_embeddings = np.asarray(embeddings.embed_documents(sample_texts))

# Extract topics for coloring. Topics are sorted so that the topic -> color
# assignment is the same on every run (set iteration order is not stable).
topics = [doc.metadata.get('topic', 'unknown') for doc in sample_chunks]
unique_topics = sorted(set(topics))
topic_to_color = {topic: i for i, topic in enumerate(unique_topics)}
colors = [topic_to_color[topic] for topic in topics]

# Resolve every topic to one concrete tab10 color and use that same color for
# both the points and the legend, so the legend matches the plot. Indices wrap
# around when there are more topics than colors in the palette.
palette = plt.cm.tab10
topic_to_rgba = {topic: palette(index % palette.N) for topic, index in topic_to_color.items()}
point_colors = [topic_to_rgba[topic] for topic in topics]

if len(sample_chunks) < 3:
    print(f"Only {len(sample_chunks)} chunk(s) available; skipping the t-SNE visualization.")
else:
    # Use t-SNE to reduce dimensionality for visualization. Perplexity must be
    # smaller than the number of samples, so cap it for small corpora (the
    # value stays at the default of 30 whenever there are more than 30 chunks).
    tsne_perplexity = min(30, len(sample_chunks) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
    embeddings_2d = tsne.fit_transform(sample_embeddings)

    # Plot
    plt.figure(figsize=(12, 10))
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], color=point_colors, alpha=0.6, s=100)

    # Add legend
    legend_elements = [plt.Line2D([0], [0], marker='o', color='w',
                                  markerfacecolor=topic_to_rgba[topic],
                                  markersize=10, label=topic)
                       for topic in unique_topics]
    plt.legend(handles=legend_elements, title="Topics")

    plt.title("Document Embeddings Visualization")
    plt.xlabel("t-SNE Dimension 1")
    plt.ylabel("t-SNE Dimension 2")
    plt.grid(alpha=0.3)
    plt.show()

## Save the RAG System Components for Deployment

In [None]:
# Save the RAG template.
# Written explicitly as UTF-8 (the same encoding used to read the source
# documents) so the file does not depend on the platform's default encoding.
with open(rag_dir / "rag_template.txt", "w", encoding="utf-8") as f:
    f.write(template)

# Save a simple config file for the RAG system.
# retriever_k and generation_params repeat the values used when the retriever
# and the generation pipeline were created; keep them in sync when tuning.
rag_config = {
    "embedding_model": embedding_model_name,
    "vector_store_path": str(rag_dir / "faiss_index"),
    "llm_model_path": str(model_path),
    "base_model_id": base_model_id,
    "retriever_k": 3,
    "generation_params": {
        "temperature": 0.7,
        "max_new_tokens": 512,
        "top_p": 0.9,
        "repetition_penalty": 1.1
    }
}

with open(rag_dir / "rag_config.json", "w", encoding="utf-8") as f:
    json.dump(rag_config, f, indent=2)

print(f"RAG system components saved to {rag_dir}")

## Create a Simple RAG Demo Function

In [None]:
def rag_demo(question):
    """
    Demo function for the RAG system that shows the retrieved context and the generated answer.

    The steps of the RAG chain are run one by one (retrieve, format the
    context, fill the prompt, generate) so the intermediate context can be
    printed alongside the answer.

    Args:
        question: the user's question as plain text.

    Returns:
        The generated answer, which is also printed.
    """
    # Get retrieved documents (Runnable interface; get_relevant_documents() is deprecated)
    docs = retriever.invoke(question)
    context = format_docs(docs)

    # Format the prompt
    formatted_prompt = rag_prompt.format(context=context, question=question)

    # Generate the answer (calling the LLM object directly is deprecated in favor of invoke())
    response = llm.invoke(formatted_prompt)

    preview_limit = 500

    print("="*80)
    print(f"Question: {question}")
    print("="*80)
    print("Retrieved Context:")
    print("-"*80)
    # Only announce truncation when the context really was cut short
    if len(context) > preview_limit:
        print(context[:preview_limit], "...\n(truncated)")
    else:
        print(context)
    print("-"*80)
    print("Generated Answer:")
    print("-"*80)
    print(response)
    print("="*80)

    return response

# Test the demo function
demo_question = "What are the key innovations in DeepSeek-V3 that made it more efficient to train?"

# rag_demo() already prints the answer. Binding the return value keeps the
# notebook from echoing the same text a second time as the cell's output.
demo_response = rag_demo(demo_question)

## Conclusion

In this notebook, we've implemented a complete RAG system that combines our fine-tuned model with document retrieval to provide accurate answers to technical questions. This approach enhances the model's capabilities by grounding its responses in the specific technical documents provided.

The RAG system includes:
1. Document processing and chunking
2. Vector embeddings and similarity search
3. Integration with our fine-tuned model
4. A customized prompt template for high-quality responses

This implementation can be further extended with a web interface or API for practical deployment.