In [1]:
#install modules
!pip install openai langchain-community langchain python-dotenv pandas numpy faiss-cpu

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Dow

In [16]:
# importing modules
import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
# FAISS is open-source software (OSS), so LangChain supports it out of the box
import os
from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough



In [4]:
# Load environment variables (e.g. from a local .env file) via python-dotenv.
load_dotenv()
# Create an OpenAI SDK client with no explicit arguments.
# NOTE(review): this only constructs the client; no request is made here to
# verify the API key, and `client` is not referenced by the other cells shown.
client = OpenAI()

In [5]:
# Load the employee records from the CSV file, then print the schema and
# display a three-row preview.
df = pd.read_csv("user.csv")
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          10 non-null     int64 
 1   name        10 non-null     object
 2   email       10 non-null     object
 3   department  10 non-null     object
 4   position    10 non-null     object
 5   salary      10 non-null     int64 
 6   hire_date   10 non-null     object
dtypes: int64(2), object(5)
memory usage: 692.0+ bytes


Unnamed: 0,id,name,email,department,position,salary,hire_date
0,1,John Doe,john1@example.com,Engineering,Software Engineer,95000,2020-05-15
1,2,Jane Smith,jane@example.com,Marketing,Marketing Manager,85000,2019-11-20
2,3,Robert Johnson,robert@example.com,HR,HR Specialist,75000,2021-03-10


In [6]:
# Convert the DataFrame into LangChain documents.
# Using only the "name" column as page content means the embedding of each row
# knows nothing about department, position, salary, etc., so similarity search
# on those fields cannot work. Render the whole row as text and embed that
# instead; the original columns remain attached to each document as metadata.
row_texts = [
    "; ".join(f"{column}: {value}" for column, value in record.items())
    for record in df.to_dict(orient="records")
]
df_for_docs = df.assign(text=row_texts)  # new frame; `df` itself is left untouched
loader = DataFrameLoader(df_for_docs, page_content_column="text")
data = loader.load()

In [7]:
# Split the documents into chunks. The rows previewed above are far shorter
# than chunk_size, so each row is expected to remain a single chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # The third separator is a regular expression (split after ". ").
    # It must be a raw string ("\." is not a valid string escape) and regex
    # handling must be switched on, otherwise the splitter escapes it and
    # searches for the literal characters "(?<=\. )", which never match.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)
chunks = text_splitter.split_documents(data)


In [8]:
# Create the embedding model used to vectorize the chunks.
# The API key is passed explicitly; os.getenv returns None if OPENAI_API_KEY
# is not set in the environment.
# NOTE(review): the saved output shows a warning emitted for this call —
# presumably a deprecation notice for this import path; confirm and migrate
# when a replacement package is available in the environment.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY")  # read from the process environment
)

  embeddings = OpenAIEmbeddings(


In [10]:
# Build a FAISS vector store from the chunks, using `embeddings` to vectorize them.
vector_db = FAISS.from_documents(chunks, embeddings)

In [13]:
# Define the RAG prompt template.
# {context} and {question} are the two placeholders filled in by the chain:
# the retrieved employee records and the user's query respectively. The model
# is instructed to answer only from that context and to use a fixed
# per-employee field layout.
template = """Answer the question based only on the following employee database context:
{context}

Question: {question}

Format your answer with these details:
- Name: [full name]
- Email: [email]
- Department: [department]
- Position: [position]
- Salary: [salary]
- Hire Date: [hire_date]

If multiple employees match, list them all."""

prompt = ChatPromptTemplate.from_template(template)

In [15]:
# Chat model that answers the question from the retrieved context.
# temperature=0 makes the answers as repeatable as the API allows, which is
# what a factual lookup over employee records needs; without it the library's
# non-zero default sampling temperature applies and answers vary between runs.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=os.getenv("OPENAI_API_KEY"),  # None if the variable is unset
)


  llm = ChatOpenAI(


In [24]:
# Build the RAG chain: retrieve -> format context -> prompt -> LLM.
def format_docs(docs):
    """Render retrieved documents as plain text, one record per line.

    Each line contains the document's page content followed by its metadata
    fields, so the prompt receives readable text instead of the Python repr
    of a list of Document objects.
    """
    lines = []
    for doc in docs:
        fields = ", ".join(f"{key}: {value}" for key, value in doc.metadata.items())
        lines.append(f"{doc.page_content} ({fields})" if fields else doc.page_content)
    return "\n".join(lines)


rag_chain = (
    {
        # k=11 is larger than the 10 rows in the dataset, so every record is
        # retrieved; aggregate questions (e.g. "top 5 salaries") need all rows.
        "context": vector_db.as_retriever(search_kwargs={"k": 11}) | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

In [31]:
# Ask a question: the chain retrieves employee records and the LLM answers
# from them; the answer text is printed.
# NOTE(review): the ranking is produced by the LLM, not computed. In the saved
# output a salary of 90000 is listed ahead of 95000, so numeric answers should
# be verified against the DataFrame.
response = rag_chain.invoke("provide only top 5 salaries figures")
print(response.content)

- Name: Michael Brown
- Email: michael@example.com
- Department: Engineering
- Position: Senior Developer
- Salary: 110000
- Hire Date: 2017-09-05

- Name: Thomas Moore
- Email: thomas@example.com
- Department: Engineering
- Position: DevOps Engineer
- Salary: 105000
- Hire Date: 2021-06-25

- Name: Emily Davis
- Email: emily@example.com
- Department: Finance
- Position: Financial Analyst
- Salary: 90000
- Hire Date: 2018-07-22

- Name: John Doe
- Email: john1@example.com
- Department: Engineering
- Position: Software Engineer
- Salary: 95000
- Hire Date: 2020-05-15

- Name: Jennifer Clark
- Email: jennifer@example.com
- Department: Finance
- Position: Accountant
- Salary: 85000
- Hire Date: 2019-04-18
