# LangChain: Q&A over Documents

An example might be a tool that would allow you to query a product catalog for items of interest.

In [1]:
#pip install --upgrade langchain

In [2]:
import os

from dotenv import load_dotenv, find_dotenv
# Load settings from a local .env file; the call's return value is not needed,
# so it is bound to the throwaway name `_`.
# NOTE(review): presumably this supplies the OpenAI API key used by later cells — confirm.
_ = load_dotenv(find_dotenv()) # read local .env file

Note: LLMs do not always produce the same results. When executing the code in your notebook, you may get slightly different answers than those in the video.

In [3]:
# Choose the chat model by date to account for the deprecation of the dated LLM snapshot
import datetime

# Cut-over date: strictly after it, the "gpt-3.5-turbo" alias is used
target_date = datetime.date(2024, 6, 12)

# Today's date (local time)
current_date = datetime.date.today()

# After the cut-over -> alias; on or before it -> the dated "0301" snapshot
llm_model = "gpt-3.5-turbo" if current_date > target_date else "gpt-3.5-turbo-0301"

In [4]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader, PyPDFLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.llms import OpenAI

In [6]:
# Path of the CSV dataset, given relative to the notebook's working directory.
file = 'My_Movie_Dataset.csv'
# Loader for that CSV file; the rows are only read when the loader is consumed later.
loader = CSVLoader(file_path=file)

In [10]:
pip install pypdf

Defaulting to user installation because normal site-packages is not writeable
Collecting pypdf
  Obtaining dependency information for pypdf from https://files.pythonhosted.org/packages/b8/1e/071b6684ee2b299a74a0bcdbf9a5441a1002920c72b6990b445d45c2b956/pypdf-4.1.0-py3-none-any.whl.metadata
  Downloading pypdf-4.1.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.1.0-py3-none-any.whl (286 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.1/286.1 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
from langchain.indexes import VectorstoreIndexCreator

In [8]:
#pip install docarray

In [9]:
# Configure an index creator backed by the in-memory DocArray vector store ...
index_creator = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch)
# ... and build the index from the CSV loader.
index = index_creator.from_loaders([loader])

In [10]:
# Question to ask against the indexed movie dataset
query = "Please list all the movies that belong to the drama genre."

**Note**:
- The notebook uses `langchain==0.0.179` and `openai==0.27.7`
- For these library versions, `VectorstoreIndexCreator` uses `text-davinci-003` as the base model, which has been deprecated since 1 January 2024.
- The replacement model, `gpt-3.5-turbo-instruct`, will be used instead for the `query`.
- The `response` format might differ from the one shown in the video because of this replacement model.

In [11]:
# Completion model used in place of the deprecated default, with temperature 0
llm_replacement_model = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

# Ask the index the question, answering with the replacement model
response = index.query(query, llm=llm_replacement_model)

In [12]:
# Render the model's answer as Markdown in the cell output.
display(Markdown(response))

 Showgirls (1995), Total Eclipse (1995), Cry, the Beloved Country (1995), Piano, The (1993)

## Step By Step

In [13]:
# Step-by-step walkthrough: recreate the CSV loader for the same `file`.
# CSVLoader is already imported near the top of the notebook; the import is repeated
# here so this section reads on its own.
from langchain.document_loaders import CSVLoader
loader = CSVLoader(file_path=file)

In [14]:
# Load the CSV into Document objects (the sample output below shows one Document per row).
docs = loader.load()

In [15]:
# Inspect the first loaded document.
docs[0]

Document(page_content='\ufeffmovieId: 1\ntitle: Toy Story (1995)\ngenres: Adventure|Animation|Children|Comedy|Fantasy', metadata={'source': 'My_Movie_Dataset.csv', 'row': 0})

In [16]:
# Embedding model used below to vectorise both the documents and the queries.
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [17]:
# Embed a sample sentence so the resulting vector can be inspected in the next cells.
embed = embeddings.embed_query("Hi my name is Sarthak Pattnaik")

In [18]:
# Number of components in the embedding vector.
print(len(embed))

1536


In [19]:
# Peek at the first five components of the embedding vector.
print(embed[:5])

[-0.0059837764129042625, -0.005321728065609932, -0.0072160097770392895, -0.0014959441032260656, -0.023048149421811104]


In [20]:
# Build an in-memory vector store from the loaded documents, embedding them with `embeddings`
db = DocArrayInMemorySearch.from_documents(docs, embeddings)

In [21]:
# Query used for the similarity search against the vector store.
query = "Please suggest a movie in the fantasy genre"

In [22]:
# Retrieve the documents most similar to the query.
# NOTE: this rebinds `docs`, which until now held every document loaded from the CSV
# and was used to build `db`. Re-running the `db` cell after this one would index
# only these search results, so re-run `docs = loader.load()` first.
docs = db.similarity_search(query)

In [23]:
# How many documents the similarity search returned.
len(docs)

4

In [24]:
# Inspect the first search result.
docs[0]

Document(page_content='\ufeffmovieId: 653\ntitle: Dragonheart (1996)\ngenres: Action|Adventure|Fantasy', metadata={'source': 'My_Movie_Dataset.csv', 'row': 549})

In [25]:
# Expose the vector store as a retriever; it is passed to RetrievalQA further down.
retriever = db.as_retriever()

In [26]:
# Chat model selected by the date check near the top of the notebook, with temperature 0.0
llm = ChatOpenAI(model=llm_model, temperature=0.0)

In [27]:
# Combine the text of the retrieved documents into one context string for the prompt.
# The records are separated by a blank line: joining with "" would glue the last field
# of one record (its genres) directly onto the `movieId` of the next, blurring the
# boundary between movies for the model.
qdocs = "\n\n".join(doc.page_content for doc in docs)


In [28]:
# Prepend the retrieved document text to the question and send it as a single prompt
prompt = (
    f"{qdocs} Question: Please list all the "
    "films that belong to the action genre in a table in markdown."
)
response = llm.call_as_llm(prompt)


In [29]:
# Render the model's Markdown table in the cell output.
display(Markdown(response))

| movieId | title                | genres                            |
|---------|----------------------|-----------------------------------|
| 533     | Shadow, The (1994)   | Action\|Adventure\|Fantasy\|Mystery |

In [30]:
# Retrieval-augmented QA chain of type "stuff", fed by the vector-store retriever
# and answered by the chat model; verbose=True logs the chain's progress.
qa_stuff = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    verbose=True,
)

In [31]:
# Question for the RetrievalQA chain, split across lines with implicit concatenation
query = (
    "Please list all movies in the romance genre "
    "in markdown."
)

In [32]:
# Run the RetrievalQA chain on the query and keep its answer.
response = qa_stuff.run(query)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [33]:
# Render the chain's answer as Markdown in the cell output.
display(Markdown(response))

Here are all the movies in the romance genre:

- Mad Love (1995)
- Total Eclipse (1995)
- Love Affair (1994)

Note: True Romance (1993) is not included in this list as it is categorized as a Crime/Thriller movie.

In [34]:
# Same question asked through the index wrapper, answered with the chat model.
# NOTE(review): in top-to-bottom order this uses the `index` built earlier in the
# notebook; the cell that rebuilds `index` with explicit embeddings comes after this one.
response = index.query(query, llm=llm)

In [35]:
# Rebuild the index, this time passing the `embeddings` object explicitly.
# NOTE(review): no later cell in the visible notebook queries this rebuilt index —
# the `index.query` call in the preceding cell runs before it. Confirm the intended order.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings,
).from_loaders([loader])

Reminder: Download your notebook to your local computer to save your work.

In [36]:
# Show the raw response string (unrendered, so the Markdown markup is visible).
response

'Here are all the movies in the romance genre:\n\n- Mad Love (1995)\n- Total Eclipse (1995)\n- Love Affair (1994)\n\nNote: True Romance (1993) is not included in this list as it is categorized as a Crime/Thriller movie.'