### Basic RAG Pipeline

In [None]:
!pip install pandas langchain langchain-openai langchain-community langchain-core openai faiss-cpu python-dotenv

In [25]:
import os

import pandas as pd
from dotenv import load_dotenv

# LangChain components for our RAG system.
# Core abstractions come from `langchain_core` (part of the install list) instead of
# the legacy `langchain.schema` / `langchain.prompts` re-export paths.
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load environment variables from the local "key.env" file
load_dotenv("key.env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with a clear message instead of an authentication error further down.
# The key itself is deliberately never printed: cell outputs are saved with the notebook.
if not OPENAI_API_KEY:
    raise RuntimeError(
        'OPENAI_API_KEY is not set. Add it to "key.env" or export it before running this notebook.'
    )

print("All libraries loaded successfully!")

All libraries loaded successfully!


### Reference - https://www.machinelearningplus.com/gen-ai/build-a-simple-rag-system-with-csv-files-step-by-step-guide-for-beginners/

In [12]:
# Location of the CSV to index -- replace with your actual file path or URL
csv_file_path = "https://raw.githubusercontent.com/selva86/datasets/refs/heads/master/color_srgb.csv"

# Read the file into a DataFrame
data_frame = pd.read_csv(csv_file_path)

# Report row count, column names and overall shape
print(
    f"Successfully loaded {len(data_frame)} rows from CSV",
    f"Columns available: {list(data_frame.columns)}",
    f"Data shape: {data_frame.shape}",
    sep="\n",
)

# Preview the first few rows to understand the data structure
print("First 5 rows of data:", data_frame.head(), sep="\n")

Successfully loaded 16 rows from CSV
Columns available: ['Name', 'HEX', 'RGB']
Data shape: (16, 3)
First 5 rows of data:
     Name      HEX               RGB
0   White  #FFFFFF  rgb(100,100,100)
1  Silver  #C0C0C0     rgb(75,75,75)
2    Gray  #808080     rgb(50,50,50)
3   Black  #000000        rgb(0,0,0)
4     Red  #FF0000      rgb(100,0,0)


In [13]:
def create_readable_text_from_row(row):
    """
    Render one CSV row as a single line of "column: value" sentences.

    Works with any column layout (the sample data has Name, HEX and RGB).
    Cells for which ``pd.notna`` is false are left out; the remaining
    ``"<column>: <value>"`` fragments are joined with ". " and the result
    always ends with a full stop.
    """
    populated_fields = (
        f"{label}: {cell}" for label, cell in row.items() if pd.notna(cell)
    )
    return ". ".join(populated_fields) + "."

In [14]:

# Build one LangChain Document per CSV row from its readable description
text_documents = [
    Document(page_content=create_readable_text_from_row(row))
    for _, row in data_frame.iterrows()
]

print(f"Created {len(text_documents)} document objects")

# Show up to three of the converted documents as a sanity check
print("\nExamples of converted documents:")
for position, sample in enumerate(text_documents[:3], start=1):
    print(f"Document {position}: {sample.page_content}")


Created 16 document objects

Examples of converted documents:
Document 1: Name: White. HEX: #FFFFFF. RGB: rgb(100,100,100).
Document 2: Name: Silver. HEX: #C0C0C0. RGB: rgb(75,75,75).
Document 3: Name: Gray. HEX: #808080. RGB: rgb(50,50,50).


In [15]:

# Embedding client, constructed with the library's default settings
embedding_model = OpenAIEmbeddings()

print(
    "Embedding system initialized",
    "This will convert our text into numerical vectors that capture meaning",
    sep="\n",
)


Embedding system initialized
This will convert our text into numerical vectors that capture meaning


In [17]:
# Embed the documents and index them in a FAISS vector store
print("Creating vector store from documents...")
vector_search_store = FAISS.from_documents(text_documents, embedding_model)

print(
    f"Vector store created with {len(text_documents)} documents",
    "Each document is now represented as a vector for fast similarity search",
    sep="\n",
)

Creating vector store from documents...
Vector store created with 16 documents
Each document is now represented as a vector for fast similarity search


In [18]:
# Sanity-check retrieval with a sample query (top 3 matches)
test_query = "blue color"
similar_documents = vector_search_store.similarity_search(test_query, k=3)

print(
    f"Testing search for: '{test_query}'",
    f"Found {len(similar_documents)} similar documents:",
    sep="\n",
)

for rank, match in enumerate(similar_documents, start=1):
    print(f"\nResult {rank}: {match.page_content}")



Testing search for: 'blue color'
Found 3 similar documents:

Result 1: Name: Blue. HEX: #0000FF. RGB: rgb(0,0,100).

Result 2: Name: Purple. HEX: #800080. RGB: rgb(50,0,50).

Result 3: Name: Green. HEX: #008000. RGB: rgb(0,50,0).


In [19]:
# Chat model that will generate the final answers
ai_assistant = ChatOpenAI(
    model="gpt-4o-mini",  # Good balance of quality and cost
    temperature=0,  # Low temperature = more focused, consistent answers
)

print(
    "AI assistant initialized",
    "Temperature set to 0 for consistent, factual responses",
    sep="\n",
)

AI assistant initialized
Temperature set to 0 for consistent, factual responses


In [20]:
# Wrap the vector store in a retriever that returns the top 3 most similar documents
retriever_settings = {"k": 3}
document_retriever = vector_search_store.as_retriever(search_kwargs=retriever_settings)

print(
    "Document retriever created",
    "It will find the 3 most relevant pieces of information for each question",
    sep="\n",
)

Document retriever created
It will find the 3 most relevant pieces of information for each question


In [21]:
# Prompt text sent to the model; {context} and {question} are filled in per query
prompt_text = """
You are a helpful data analyst. Use the following information from the CSV data to answer the user's question accurately.

Important instructions:
- Only use information from the provided context
- If you can't find the answer in the context, say "I don't have that information in the data"
- Be specific and include relevant details from the data
- Keep your answer clear and concise

Context from CSV data:
{context}

User question: {question}
Answer:
"""

answer_prompt = PromptTemplate.from_template(prompt_text)

print("Prompt template created")

Prompt template created


In [22]:
def _format_context(documents):
    """Join the retrieved documents' text into one newline-separated block for the prompt."""
    return "\n".join(document.page_content for document in documents)


# Build the complete RAG chain using LCEL
rag_pipeline = (
    {
        # Retrieve relevant documents, then flatten them to plain text; without this
        # step the prompt would receive the raw list of Document objects.
        "context": document_retriever | _format_context,
        "question": RunnablePassthrough(),  # Pass the question through unchanged
    }
    | answer_prompt  # Format everything into our prompt
    | ai_assistant  # Generate the answer
    | StrOutputParser()  # Return the model reply as a plain string
)
print("Complete RAG pipeline created!")

Complete RAG pipeline created!


In [23]:
# Run several different kinds of question through the pipeline
test_questions = [
    "What colors are similar to blue?",
    "What is the RGB value for red?",
    "Are there any dark colors in the data?",
]

for question in test_questions:
    print(f"\nQuestion: {question}", "-" * 50, sep="\n")
    try:
        answer = rag_pipeline.invoke(question)
        print(f"Answer: {answer}")
    except Exception as error:
        # Report the failure and carry on with the remaining questions
        print(f"Error: {error}")
    print("=" * 60)


Question: What colors are similar to blue?
--------------------------------------------------
Answer: I don't have that information in the data.

Question: What is the RGB value for red?
--------------------------------------------------
Answer: The RGB value for red is rgb(100,0,0).

Question: Are there any dark colors in the data?
--------------------------------------------------
Answer: Yes, there are dark colors in the data. The colors listed include:

1. Black - HEX: #000000, RGB: rgb(0,0,0)
2. Gray - HEX: #808080, RGB: rgb(50,50,50)
3. Maroon - HEX: #800000, RGB: rgb(50,0,0)

All of these colors can be considered dark.
