In [1]:
# !pip install --upgrade langchain

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain
  Downloading langchain-0.0.189-py3-none-any.whl (975 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.6/975.6 KB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m[31m7.1 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: langchain
  Attempting uninstall: langchain
    Found existing installation: langchain 0.0.188
    Uninstalling langchain-0.0.188:
      Successfully uninstalled langchain-0.0.188
Successfully installed langchain-0.0.189


In [2]:
%run ./01-basic.ipynb

In [3]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown

In [4]:
# DocArrayInMemorySearch -- search is done in memory.

In [5]:
file = 'OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file)

In [6]:
from langchain.indexes import VectorstoreIndexCreator

In [8]:
# !pip install docarray

In [9]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

In [10]:
query ="Please list all your shirts with sun protection \
in a table in markdown and summarize each one."

In [14]:
response = index.query(query)

# Note
Language models can handle only short documents, small number of tokens.
Embeddings: a numberical representation of the piece of text, that refers to the semantics/meanings.
LLM can create an embedding for any document or query. 

An embedding is created for each chunk of the document as you cannot pass the whole document to the LLM.
The index consists of the embeddings of all the chunks. 


In [12]:
display(Markdown(response))



| Name | Description |
| --- | --- |
| Men's Tropical Plaid Short-Sleeve Shirt | UPF 50+ rated, 100% polyester, wrinkle-resistant, front and back cape venting, two front bellows pockets |
| Men's Plaid Tropic Shirt, Short-Sleeve | UPF 50+ rated, 52% polyester and 48% nylon, machine washable and dryable, front and back cape venting, two front bellows pockets |
| Men's TropicVibe Shirt, Short-Sleeve | UPF 50+ rated, 71% Nylon, 29% Polyester, 100% Polyester knit mesh, machine wash and dry, front and back cape venting, two front bellows pockets |
| Sun Shield Shirt by | UPF 50+ rated, 78% nylon, 22% Lycra Xtra Life fiber, handwash, line dry, wicks moisture, fits comfortably over swimsuit, abrasion resistant |

All four shirts provide UPF 50+ sun protection, blocking 98% of the sun's harmful rays. The Men's Tropical Plaid Short-Sleeve Shirt is made of 100% polyester and is wrinkle-resistant

In [15]:
loader = CSVLoader(file_path=file)

In [16]:
docs = loader.load()

In [17]:
docs[0]


Document(page_content=": 0\nname: Women's Campside Oxfords\ndescription: This ultracomfortable lace-to-toe Oxford boasts a super-soft canvas, thick cushioning, and quality construction for a broken-in feel from the first time you put them on. \n\nSize & Fit: Order regular shoe size. For half sizes not offered, order up to next whole size. \n\nSpecs: Approx. weight: 1 lb.1 oz. per pair. \n\nConstruction: Soft canvas material for a broken-in feel and look. Comfortable EVA innersole with Cleansport NXT® antimicrobial odor control. Vintage hunt, fish and camping motif on innersole. Moderate arch contour of innersole. EVA foam midsole for cushioning and support. Chain-tread-inspired molded rubber outsole with modified chain-tread pattern. Imported. \n\nQuestions? Please contact us for any inquiries.", metadata={'source': 'OutdoorClothingCatalog_1000.csv', 'row': 0})

In [18]:
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [19]:
# As an example, try embedding a query using the 'embeddings' object
embed = embeddings.embed_query("Hi my name is Harrison")

In [21]:
print(len(embed))

1536


In [22]:
print(embed[:10])

[-0.021900920197367668, 0.006746490020304918, -0.018175246194005013, -0.039119575172662735, -0.014097143895924091, 0.016891399398446083, 0.002292359247803688, -0.008175084367394447, 0.012901403941214085, 9.587552631273866e-05]


In [23]:
db = DocArrayInMemorySearch.from_documents(
    docs, 
    embeddings
)

In [24]:
"""
db is now the entire vector store that takes a lsit of documents, and then the embeddings from the LLM
"""

'\ndb is now the entire vector store that takes the document chunks, and then the embeddings from the LLM\n'

In [25]:
query = "Please suggest a shirt with sunblocking"

In [26]:
docs = db.similarity_search(query)

# Note
similarity_search - cosine similarity,  

Cosine similarity is a metric used to measure the similarity between two vectors in a multi-dimensional space. It determines the cosine of the angle between the vectors, hence the name "cosine similarity."

To calculate the cosine similarity between two vectors A and B, the formula is:

cosine_similarity(A, B) = (A . B) / (||A|| * ||B||)

Here, (A . B) represents the dot product of vectors A and B, and ||A|| and ||B|| represent the Euclidean norms (also known as the lengths or magnitudes) of vectors A and B, respectively.

The resulting value of cosine similarity ranges from -1 to 1. A value of 1 indicates that the vectors are in the same direction, meaning they are identical. A value of -1 indicates that the vectors are in opposite directions, and a value of 0 indicates that the vectors are orthogonal or completely dissimilar.

Cosine similarity is commonly used in various fields, including information retrieval, natural language processing, and recommendation systems, where it helps to measure the similarity or distance between textual documents or high-dimensional feature vectors.



# Note
docs is small enough and we don't need to chunk it. 
similarity_search - through all the vectors.. 
Each element of the docs array is one product. 

In [29]:
docs[0]

Document(page_content=': 255\nname: Sun Shield Shirt by\ndescription: "Block the sun, not the fun – our high-performance sun shirt is guaranteed to protect from harmful UV rays. \n\nSize & Fit: Slightly Fitted: Softly shapes the body. Falls at hip.\n\nFabric & Care: 78% nylon, 22% Lycra Xtra Life fiber. UPF 50+ rated – the highest rated sun protection possible. Handwash, line dry.\n\nAdditional Features: Wicks moisture for quick-drying comfort. Fits comfortably over your favorite swimsuit. Abrasion resistant for season after season of wear. Imported.\n\nSun Protection That Won\'t Wear Off\nOur high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun\'s harmful rays. This fabric is recommended by The Skin Cancer Foundation as an effective UV protectant.', metadata={'source': 'OutdoorClothingCatalog_1000.csv', 'row': 255})

In [30]:
len(docs)

4

In [31]:
retriever = db.as_retriever()

In [32]:
# Retriever is an interface, and can be implemented by any LLM call.

In [33]:
llm = ChatOpenAI(temperature = 0.0)


In [34]:
qdocs = "".join([docs[i].page_content for i in range(len(docs))])


In [46]:
qdocs

': 255\nname: Sun Shield Shirt by\ndescription: "Block the sun, not the fun – our high-performance sun shirt is guaranteed to protect from harmful UV rays. \n\nSize & Fit: Slightly Fitted: Softly shapes the body. Falls at hip.\n\nFabric & Care: 78% nylon, 22% Lycra Xtra Life fiber. UPF 50+ rated – the highest rated sun protection possible. Handwash, line dry.\n\nAdditional Features: Wicks moisture for quick-drying comfort. Fits comfortably over your favorite swimsuit. Abrasion resistant for season after season of wear. Imported.\n\nSun Protection That Won\'t Wear Off\nOur high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun\'s harmful rays. This fabric is recommended by The Skin Cancer Foundation as an effective UV protectant.: 374\nname: Men\'s Plaid Tropic Shirt, Short-Sleeve\ndescription: Our Ultracomfortable sun protection is rated to UPF 50+, helping you stay cool and dry. Originally designed for fishing, this lightest hot-weather shirt offers UPF 50+ c

In [35]:
response = llm.call_as_llm(f"{qdocs} Question: Please list all your \
shirts with sun protection in a table in markdown and summarize each one.") 


In [36]:
display(Markdown(response))

| Name | Description |
| --- | --- |
| Sun Shield Shirt | High-performance sun shirt with UPF 50+ sun protection, moisture-wicking, and abrasion-resistant fabric. Fits comfortably over swimsuits. Recommended by The Skin Cancer Foundation. |
| Men's Plaid Tropic Shirt | Ultracomfortable shirt with UPF 50+ sun protection, wrinkle-free fabric, and front/back cape venting. Made with 52% polyester and 48% nylon. |
| Men's TropicVibe Shirt | Men's sun-protection shirt with built-in UPF 50+ and front/back cape venting. Wrinkle-resistant and made with 71% nylon and 29% polyester. |
| Men's Tropical Plaid Short-Sleeve Shirt | Lightest hot-weather shirt with UPF 50+ sun protection, front/back cape venting, and two front bellows pockets. Made with 100% polyester and wrinkle-resistant. |

All of these shirts provide UPF 50+ sun protection, blocking 98% of the sun's harmful rays. They are made with high-performance fabrics that are moisture-wicking, wrinkle-resistant, and abrasion-resistant. The Men's Plaid Tropic Shirt and Men's Tropical Plaid Short-Sleeve Shirt have front/back cape venting for added breathability. The Sun Shield Shirt is recommended by The Skin Cancer Foundation.

# Note
The above steps of finding similar documents, creating the qdocs context, and then callingthe llm can be simplified with the retrievers as shown below. 
'retriever' includes the entire vector store, and will create the embedding for the query, find similar documents, create the context. 'run' method
then passes the query with the context.  Somewhat simplified 


In [37]:
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [38]:
query =  "Please list all your shirts with sun protection in a table \
in markdown and summarize each one."

In [39]:
response = qa_stuff.run(query)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [42]:
display(Markdown(response))

| Shirt ID | Name | Description |
| --- | --- | --- |
| 618 | Men's Tropical Plaid Short-Sleeve Shirt | Rated UPF 50+ for superior protection from the sun's UV rays. Made of 100% polyester and is wrinkle-resistant. With front and back cape venting that lets in cool breezes and two front bellows pockets. |
| 374 | Men's Plaid Tropic Shirt, Short-Sleeve | Rated to UPF 50+ and offers sun protection. Made with 52% polyester and 48% nylon, this shirt is machine washable and dryable. Additional features include front and back cape venting, two front bellows pockets. |
| 535 | Men's TropicVibe Shirt, Short-Sleeve | Built-in UPF 50+ sun-protection shirt with the lightweight feel. Made with 71% Nylon, 29% Polyester. Wrinkle-resistant with front and back cape venting that lets in cool breezes and two front bellows pockets. |
| 255 | Sun Shield Shirt | High-performance sun shirt that protects from harmful UV rays. Made with 78% nylon, 22% Lycra Xtra Life fiber. Wicks moisture for quick-drying comfort and fits comfortably over your favorite swimsuit. |

All of the shirts listed above provide sun protection with a UPF rating of 50+ and block 98% of the sun's harmful rays. They are all designed to be lightweight and comfortable in hot weather, with features such as front and back cape venting and wrinkle-resistant fabric. The Men's Tropical Plaid Short-Sleeve Shirt and Men's TropicVibe Shirt have a traditional fit, while the Men's Plaid Tropic Shirt and Sun Shield Shirt have a slightly fitted shape.

In [41]:
response = index.query(query, llm=llm)

In [43]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings,
).from_loaders([loader])

In [44]:
# Here we customize the index by using a different type of vector store. 

# stuffing - all data into the prompt as context. Cons: context length is limited.

In [45]:
"""
other methods are:-
- map_reduce - get a number of responses, and then summarize all responses by having another call.
- refine - loop over the chunks, but pass the previous response to the new call.. sequential as a call depends on the previous
- map_rerank - single call for each document and ask to return a score,
"""

'\nother methods are:-\n- map_reduce - get a number of responses, and then summarize all responses by having another call.\n- refine - loop over the chunks, but pass the previous response to the new call.. sequential as a call depends on the previous\n- map_rerank - single call for each document and ask to return a score,\n'