In [20]:
# installing modules
!pip install openai python-dotenv langchain-community langchain scikit-learn
# python-dotevn to load .env files
# by default .env file will be loaded



In [25]:
# importing module
from openai import OpenAI
from dotenv import load_dotenv
from langchain.prompts import  PromptTemplate
from langchain_community.chat_models import ChatOpenAI
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np

In [22]:
# loading .env file
load_dotenv() # auto load .env file

True

In [23]:
client = OpenAI()

response = client.responses.create(
    model="gpt-5",
    input="hello how are you ?"
)

print(response.output_text)

Hi! I’m doing well, thanks for asking. How are you? What can I help you with today?


In [14]:
# prompt template example
myprompt = "tell me a jobe about {topic}"

mytemplate = PromptTemplate(
    input_variables=["topic"],
    template=myprompt
)

# calling
my_final_prompt = mytemplate.format(topic="donkey")
print(my_final_prompt)

tell me a jobe about donkey


In [15]:
# method 1 using openai module
response1 = client.responses.create(
    model="gpt-5",
    input=my_final_prompt
)

print(response1.output_text)

What do you call a donkey with one leg? A wonky donkey.


In [17]:
# method 2 using langchain_community
llm=ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.5
)

  llm=ChatOpenAI(


In [19]:
# invoke llm
response2 = llm.invoke(my_final_prompt)
print(response2.content)

Donkey caretaker: A donkey caretaker is responsible for the daily care and well-being of donkeys. This includes feeding, grooming, and exercising the animals, as well as monitoring their health and providing any necessary medical care. Donkey caretakers may also be responsible for cleaning and maintaining the animals' living quarters, such as a barn or pasture. Additionally, they may interact with visitors and educate them about donkeys and their care. Overall, a donkey caretaker plays a crucial role in ensuring the happiness and health of these gentle animals.


In [26]:
# semantic similarity and cosine distance in vector
# define embedding
# Step 2: Function to get embeddings (updated for v1.0+)
def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    response = client.embeddings.create(
        input=[text],
        model=model
    )
    return np.array(response.data[0].embedding)

In [27]:
# take text examples to do embedding
# Step 3: Example texts
text1 = "I love programming in Python."
text2 = "Python coding is my passion."
text3 = "I enjoy playing football."

In [28]:
# do embedding of above texts

embedding1 = get_embedding(text1)
embedding2 = get_embedding(text2)
embedding3 = get_embedding(text3)

In [29]:
# using cosine distance find similarty (-1,0,+1)-- more higher --> more similar
# using cosine_similarity module
similarity1 = cosine_similarity([embedding1], [embedding2])
similarity2 = cosine_similarity([embedding1], [embedding3])
similarity3 = cosine_similarity([embedding2], [embedding3])


In [30]:
#priting similarty
print("Similarity between text1 and text2:", similarity1[0][0])
print("Similarity between text1 and text3:", similarity2[0][0])
print("Similarity between text2 and text3:", similarity3[0][0])


Similarity between text1 and text2: 0.6387135524202222
Similarity between text1 and text3: 0.26782892926643365
Similarity between text2 and text3: 0.26047997986852434


In [None]:
# Implementing RAG with openai GPT model


# Implementing RAG with openai GPT model


In [31]:
# installing required modules
!pip install pandas  openai python-dotenv langchain-community langchain scikit-learn faiss-cpu numpy

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1


In [47]:
# importing required modules
from openai import OpenAI
from dotenv import load_dotenv
from langchain.prompts import  PromptTemplate
from langchain_community.chat_models import ChatOpenAI
from sklearn.metrics.pairwise import cosine_similarity
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough


import numpy as np
import pandas as pd
import os

In [34]:
# reading the file
df=pd.read_csv('user.csv')
print(df.head())
print(f"Loaded {len(df)} employee records")


   id            name                email   department           position  \
0   1        John Doe     john@example.com  Engineering  Software Engineer   
1   2      Jane Smith     jane@example.com    Marketing  Marketing Manager   
2   3  Robert Johnson   robert@example.com           HR      HR Specialist   
3   4     Emily Davis    emily@example.com      Finance  Financial Analyst   
4   5   Michael Brown  michael@example.com  Engineering   Senior Developer   

   salary   hire_date  
0   95000  2020-05-15  
1   85000  2019-11-20  
2   75000  2021-03-10  
3   90000  2018-07-22  
4  110000  2017-09-05  
Loaded 10 employee records


In [36]:
# Step 2: Convert DataFrame to LangChain Documents
loader = DataFrameLoader(df, page_content_column="name")  # Using 'name' as main content
docs = loader.load()

In [38]:
# before embedding tokenization splitdata into smaller chunks then --
# Step 3: Split into chunks (though small, we'll treat each row as a chunk)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
chunks = text_splitter.split_documents(docs)


In [41]:
# lets use openAI text embedding to convert data into vector format
# Step 5: Create embeddings with explicit API key
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY")  # or use the direct variable
)



  embeddings = OpenAIEmbeddings(


In [43]:
# creating vector store
# Step 6: Create vector store
vector_db = FAISS.from_documents(chunks, embeddings)


In [45]:
# Step 7: Define RAG prompt
template = """Answer the question based only on the following employee database context:
{context}

Question: {question}

Format your answer with these details:
- Name: [full name]
- Email: [email]
- Department: [department]
- Position: [position]
- Salary: [salary]
- Hire Date: [hire_date]

If multiple employees match, list them all."""

prompt = ChatPromptTemplate.from_template(template)


In [46]:
# use any GPT LLM with apiKey to use RAG
chat_model = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.5,
    openai_api_key=os.getenv("OPENAI_API_KEY")  # or use the direct variable
)

In [48]:
# creating RAG chain to use LLM + prompt
# Step 9: Create RAG chain
rag_chain = (
    {"context": vector_db.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 matches
    "question": RunnablePassthrough()
} | prompt | chat_model)




In [49]:
# final lets try query using rag_chain
response2 = rag_chain.invoke("print all the employees positions")
print(response2.content)


- Name: John Doe
- Email: john@example.com
- Department: Engineering
- Position: Software Engineer
- Salary: 95000
- Hire Date: 2020-05-15

- Name: Jennifer Clark
- Email: jennifer@example.com
- Department: Finance
- Position: Accountant
- Salary: 85000
- Hire Date: 2019-04-18

- Name: David Lee
- Email: david@example.com
- Department: IT
- Position: System Administrator
- Salary: 80000
- Hire Date: 2020-08-14
