In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from pathlib import Path
from dotenv import load_dotenv
import os

In [2]:
_=load_dotenv()

# Set the OpenAI API key environment variable
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

In [3]:
llm = ChatOpenAI(model='gpt-3.5-turbo-0125')

In [9]:
# Download the PDF document and CSV file used in this notebook
import urllib.request
import os

os.makedirs('data', exist_ok=True)

# Download files using Python's urllib
urllib.request.urlretrieve(
    'https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf',
    'data/Understanding_Climate_Change.pdf'
)
urllib.request.urlretrieve(
    'https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/customers-100.csv',
    'data/customers-100.csv'
)


('data/customers-100.csv', <http.client.HTTPMessage at 0x13297720050>)

In [12]:
import pandas as pd

data = pd.read_csv('data/customers-100.csv')
data.head()

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
1,2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
2,3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
3,4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
4,5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/


In [13]:
loader = CSVLoader(file_path='data/customers-100.csv')
docs = loader.load()

In [15]:
docs[0]

Document(metadata={'source': 'data/customers-100.csv', 'row': 0}, page_content='Index: 1\nCustomer Id: DD37Cf93aecA6Dc\nFirst Name: Sheryl\nLast Name: Baxter\nCompany: Rasmussen Group\nCity: East Leonard\nCountry: Chile\nPhone 1: 229.077.5154\nPhone 2: 397.884.0519x718\nEmail: zunigavanessa@smith.info\nSubscription Date: 2020-08-24\nWebsite: http://www.stephenson.com/')

In [25]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

embedding = OpenAIEmbeddings()
index = faiss.IndexFlatL2(len(OpenAIEmbeddings().embed_query(" ")))
vector_store = FAISS(
    embedding_function=embedding,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [26]:
vector_store.add_documents(documents=docs)

['2d5cc49c-4ee5-4187-99cc-d1c5b4d6749a',
 'b52faa20-1355-455f-a226-410f2077d877',
 '6e826633-2f2a-4c27-a015-6a145041ea1c',
 'a09265b4-8acd-40d3-be94-fbf74f9c31e8',
 '038e5082-fca5-4215-842a-733d2c30001e',
 '31cd2d07-ee71-4089-bba2-292e851cc8a3',
 'd34c2100-6542-40d2-acc4-e7bb1fc7a510',
 '595c6d53-d6fb-4dd8-81f1-edb2fe68e81c',
 '3ef3293e-df70-4ae4-92ba-cd1a9f6d095c',
 '9a50a01d-8071-4674-b5b7-17ec8f20062a',
 'b1412a72-ea36-4e17-a0f1-3eda491cdfbc',
 '4c4de4a4-3f02-49d2-9f64-ab9bfc97bfa1',
 'c5a66249-9cd3-488d-8aaa-46dde7412e89',
 'f24697d2-fa0f-4bf7-a33c-f1bdde5b5837',
 '8a21475e-1b2f-4e60-afa1-599c38b804aa',
 '693ba9c7-f371-4e65-a417-9978ac651636',
 '997a4c9e-186e-4040-bd94-d8c22a4a03c4',
 'c3ba0026-7cab-4c8e-9b8a-17f5bc497737',
 '10a4ba86-dd12-4d31-8479-e2b2daebdcc7',
 '1b0b32ca-70ba-4221-bd33-f8ad2475e6f7',
 '8cd677ba-6fa3-4442-8be7-00fe42b0fbca',
 '54171cfe-6f7c-453b-98df-f797589f8127',
 'f08f270a-a2e4-4b3c-b8f8-b27d54ffbecc',
 '0fb34a9d-714f-44d8-a8a9-4f87ffbfeb91',
 'c3e60aa5-57e6-

In [27]:
retriever = vector_store.as_retriever()

In [28]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [29]:
# Set up system prompt
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

In [30]:
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
    
])

In [31]:
# Create the question-answer chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [32]:
answer= rag_chain.invoke({"input": "which company does sheryl Baxter work for?"})
answer['answer']

'Sheryl Baxter works for the Rasmussen Group.'

In [33]:
answer = rag_chain.invoke({"input": "Which city Roy	Berry is living"})

In [34]:
answer

{'input': 'Which city Roy\tBerry is living',
 'context': [Document(id='6e826633-2f2a-4c27-a015-6a145041ea1c', metadata={'source': 'data/customers-100.csv', 'row': 2}, page_content='Index: 3\nCustomer Id: 6F94879bDAfE5a6\nFirst Name: Roy\nLast Name: Berry\nCompany: Murillo-Perry\nCity: Isabelborough\nCountry: Antigua and Barbuda\nPhone 1: +1-539-402-0259\nPhone 2: (496)978-3969x58947\nEmail: beckycarr@hogan.com\nSubscription Date: 2020-03-25\nWebsite: http://www.lawrence.com/'),
  Document(id='b26d07fc-0c74-4bff-9558-e7fd226afecf', metadata={'source': 'data/customers-100.csv', 'row': 71}, page_content='Index: 72\nCustomer Id: Ef859092FbEcC07\nFirst Name: Richard\nLast Name: Roth\nCompany: Conway-Mcbride\nCity: New Jasmineshire\nCountry: Morocco\nPhone 1: 581-440-6539\nPhone 2: 9857827463\nEmail: aharper@maddox-townsend.org\nSubscription Date: 2020-02-23\nWebsite: https://www.brooks.com/'),
  Document(id='54171cfe-6f7c-453b-98df-f797589f8127', metadata={'source': 'data/customers-100.csv'