This code implements a basic Retrieval-Augmented Generation (RAG) system for processing and querying CSV documents. The system encodes the document content into a vector store, which can then be queried to retrieve relevant information.

The CSV file contains dummy customer data with attributes such as first name, last name, and company. This dataset is used for a RAG use case: building a customer-information Q&A system.

In [1]:
# Standard library
import os
import sys

# Load variables from a .env file into the process environment.
# The call is kept ahead of the LangChain imports below, preserving the
# original execution order.
from dotenv import load_dotenv
load_dotenv()

# LangChain components used throughout the notebook
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
# FAISS is provided by langchain_community; the old `langchain.vectorstores`
# import path is a deprecated re-export.
from langchain_community.vectorstores import FAISS

In [3]:
# Peek at the raw CSV with pandas. In the cells shown here this DataFrame is
# only inspected; `data` is rebound by the CSVLoader cell further down.
import pandas as pd

file_path = "data/customers-100.csv"  # reused by the CSVLoader cell
data = pd.read_csv(file_path)

# Display just the first two rows as a quick sanity check.
data.head(n=2)

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
1,2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/


Indexing

In [14]:
## Document loaders
# Point the loader at the same CSV path that was previewed with pandas.
loader = CSVLoader(file_path=file_path)

# NOTE: `data` is rebound here, from the pandas DataFrame to the loader's output.
data = loader.load()

In [15]:
# Inspect the text content of the first loaded document.
data[0].page_content

'Index: 1\nCustomer Id: DD37Cf93aecA6Dc\nFirst Name: Sheryl\nLast Name: Baxter\nCompany: Rasmussen Group\nCity: East Leonard\nCountry: Chile\nPhone 1: 229.077.5154\nPhone 2: 397.884.0519x718\nEmail: zunigavanessa@smith.info\nSubscription Date: 2020-08-24\nWebsite: http://www.stephenson.com/'

In [16]:
#Embeddings
# The embedding model is passed to the FAISS store as its embedding function
# and is also used to size the FAISS index.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
#Vectorstore
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss

# Embed a throwaway string once to learn the embedding length, which the
# flat L2 index needs at construction time.
embedding_dim = len(embeddings.embed_query(" "))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: empty docstore and empty index-to-id mapping.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [24]:
# Add every loaded document to the vector store.
# The value echoed below is the list of ids returned by the call.
# NOTE(review): re-running this cell appears to add the same documents again — confirm before re-executing.
vector_store.add_documents(documents=data)

['1a5c0716-71ed-417a-bb12-7b12b410c30d',
 'c6df4474-187f-42dd-917a-95d34a89d634',
 '3eda98cb-2a4d-4279-8464-412f8d1e2b16',
 '59577084-adbd-4f2a-93a8-4975c681cfc3',
 'ff3242f9-8a85-4eb1-a2c4-61ecbaae569b',
 '6f88298f-b1e4-467c-9d92-fe4bd57e98b4',
 'f2fdec0b-fa9f-402f-9879-6993d3583111',
 '85f296b9-9468-4f89-9b24-ee42b02196e6',
 '46c8997f-b312-4fe4-a585-0fa26cdf8865',
 '9f847712-61a0-49a5-afee-32d84df79655',
 'ec4ebe78-06e2-4446-a77e-eaf47a9be75e',
 'b393dc25-dbd1-44b7-9335-8a6d1bcd0e90',
 'f5d78139-6c15-46dc-b993-b7d79ede082b',
 'df3b6711-fb41-4590-a05b-aa2765993fd5',
 '67f34c4e-c3a2-45c3-b61c-0012129d1e9b',
 'b96ecaf8-c67d-4320-994c-fefa71793246',
 '63e81946-8cde-497e-9e6e-44ed32479e8e',
 'd4e018ed-df50-4d0e-a6da-fb2732c95f4e',
 'fa628b66-58aa-4554-a3c7-a119adf0ae81',
 '8e614acf-6e90-429f-b536-66d638601f4f',
 'b2843bbc-fe11-4faa-9415-9571276b5a2e',
 '7658eca3-88c7-4c0c-9094-fb9fd8f4ac3a',
 '17494165-f9d2-4a2f-9099-5447c4b35263',
 'c66fa184-e121-4789-9598-b5babf393a24',
 '92717a25-f5fb-

Retrieval

In [25]:
#Retriever
# Expose the vector store as a retriever; k=2 is forwarded as a search keyword argument.
retriever = vector_store.as_retriever(search_kwargs=dict(k=2))

Augmentation

In [26]:
# Read the Groq API key from the environment (load_dotenv() was called in the first cell).
groq_api_key = os.environ.get("GROQ_API_KEY")

# Chat model used as the generation step of the chain.
llm = ChatGroq(
    model_name="Llama3-8b-8192",
    groq_api_key=groq_api_key,
)

In [27]:
from langchain_core.prompts import PromptTemplate

# Prompt text kept in its own variable; wording and whitespace are unchanged.
rag_template=""" 
    You are assistant for question answering tasks.
    Use the following piece of retreived context to answer
    the question.If you don't know the answer, say that you don't know.
    keep the answer concise.
    {context}
    Question:{question}
    """

# The template exposes two placeholders: {context} and {question}.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=rag_template,
)

In [28]:
#Building chain
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

def format_docs(retrieved_docs):
    """Combine retrieved documents into a single context string.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` of every document, in order, joined by a single
        newline character. An empty iterable yields an empty string.
    """
    contents = [doc.page_content for doc in retrieved_docs]
    return "\n".join(contents)

# Context branch: retrieve documents for the incoming question, then flatten
# them into one string with format_docs.
context_branch = retriever | RunnableLambda(format_docs)

# Run both branches on the same input: the formatted context and the
# question itself, passed through unchanged.
parllel_chain = RunnableParallel(
    context=context_branch,
    question=RunnablePassthrough(),
)

# Converts the chat model's message output into a plain string.
parser = StrOutputParser()

# Full pipeline: build prompt inputs -> fill the prompt -> call the LLM -> parse to str.
rag_chain = parllel_chain | prompt | llm | parser


Query the RAG chain with a question based on the CSV data.

In [30]:
# The plain question string is the chain's input: it is sent to the retriever
# to fetch context and is also passed through unchanged as {question}.
answer=rag_chain.invoke('which company does sheryl Baxter work for?')

In [31]:
# Display the string returned by the chain.
answer

'Sheryl Baxter works for Rasmussen Group.'