In [16]:
import ast

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from scipy.spatial.distance import cosine, euclidean, cityblock

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()
# Model name passed to the embeddings endpoint when a search query is embedded.
# NOTE(review): should match the model that produced the stored embeddings — confirm.
EMBEDDING_MODEL = "text-embedding-3-small"
# Shared OpenAI client, created with default arguments and reused by later cells.
client = OpenAI()

In [7]:
df = pd.read_csv("embeddings_5k.csv")
# Each embedding is stored in the CSV as text; this assumes it is the text of a
# plain Python list of numbers. Parse it with ast.literal_eval, which accepts
# literals only, rather than eval, which would execute any expression found in
# the file. np.float32 is the same dtype as the "f" type code.
df["embedding"] = df["embedding"].apply(
    lambda text: np.array(ast.literal_eval(text), dtype=np.float32)
)
df

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined,embedding
0,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...,"[0.02088439, -0.00022463033, -0.0019172364, -0..."
1,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...,"[-0.0044591213, 0.00078397157, -0.022424141, 0..."
2,1219017600,B000LQOCH0,ABXLMWJIXXAIN,4,"""Delight"" says it all",This is a confection that has been around a fe...,"Title: ""Delight"" says it all; Content: This is...","[0.019770848, -0.032190442, -0.06242821, -0.00..."
3,1307923200,B000UA0QIQ,A395BORC6FGVXV,2,Cough Medicine,If you are looking for the secret ingredient i...,Title: Cough Medicine; Content: If you are loo...,"[-0.013130724, -0.021559557, -0.0760047, 0.023..."
4,1350777600,B006K2ZZ7K,A1UQRSCLF8GW1T,5,Great taffy,Great taffy at a great price. There was a wid...,Title: Great taffy; Content: Great taffy at a ...,"[0.00579996, -0.051506482, -0.0690192, 0.01586..."
...,...,...,...,...,...,...,...,...
4995,1341273600,B00653KJ5M,AGE64SY3HP9RV,2,The cavemen must have been wealthy,I really wanted to like these.<br /><br />Firs...,Title: The cavemen must have been wealthy; Con...,"[0.006345336, 0.004690341, -0.050477345, 0.017..."
4996,1341100800,B00653KJ5M,A1MPEPNJ2NKWM,1,These cookies need work; you can make your own...,I was not impressed with these cookies when I ...,Title: These cookies need work; you can make y...,"[0.01030105, -0.02405074, -0.011326668, 0.0159..."
4997,1336694400,B00653KJ5M,A1T6I625FGQS20,3,Okay in a pinch - not great,The cookies came sealed and seem to be high qu...,Title: Okay in a pinch - not great; Content: T...,"[-0.017131463, -0.017847527, -0.04590908, 0.01..."
4998,1342137600,B00653KJ5M,A3POAWC2JPQQQP,4,"they are good (except for the ""rainforest"" fla...","These taste very good, but aren't like the BES...","Title: they are good (except for the ""rainfore...","[0.020078687, -0.039267983, -0.028810894, -0.0..."


In [18]:
# search function
def find_kNN(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - cosine(x, y),
    top_n: int = 100
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """Return the rows of `df` most related to `query`, best match first.

    The query is embedded with `EMBEDDING_MODEL` through the module-level
    `client`, then scored against the stored embedding of every row.

    Args:
        query: Free-text search string to embed.
        df: Frame with a "combined" column (the text handed back to the
            caller) and an "embedding" column (one vector per row).
        relatedness_fn: Callable `(query_embedding, row_embedding) -> score`
            where a higher score means more related. Defaults to cosine
            similarity (1 - cosine distance).
        top_n: Upper bound on the number of results; applied as a slice.

    Returns:
        `(strings, relatednesses)`: two equal-length tuples ordered from most
        to least related. Both are empty when there is nothing to return.
    """
    # Nothing to rank: skip the embedding request entirely. Previously an
    # empty frame crashed on the final unpacking of `zip(*[])`.
    if len(df) == 0:
        return (), ()

    query_embedding = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    ).data[0].embedding

    # Walk the two needed columns in lockstep; this yields the same values as
    # iterrows() without materialising a Series for every row.
    scored = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["combined"], df["embedding"])
    ]
    # list.sort is stable, so rows with equal scores keep their frame order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top = scored[:top_n]
    if not top:
        # e.g. top_n == 0: keep returning two empty tuples.
        return (), ()
    strings, relatednesses = zip(*top)
    return strings, relatednesses

### Testing
Below are some questions related to the Amazon Fine Food Reviews dataset that users might ask. We'll use the 5k vector database to test them.

In [24]:
str_to_search = "What is the most popular type of pasta?"

# Fetch the three closest reviews, then print each hit's rank, score and text.
strings, relatednesses = find_kNN(str_to_search, df, top_n=3)
for i, (string, relatedness) in enumerate(zip(strings, relatednesses), start=1):
    print(
        f"Result {i}:",
        f"Relatedness: {relatedness:.3f}",
        f"Content: {string}\n",
        '-'*50,
        sep="\n",
    )


Result 1:
Relatedness: 0.494
Content: Title: Superb Pasta!; Content: *****<br /><br />Oh, I love this pasta! It is thick, chewy, absolutely extraordinary! And made with authentic flavorings to boot! The red are flavored with tomato and red beet powder. The green are flavored with spinach powder and basil powder. The black pastas are flavored with squid ink, the off-white are au naturale. The yellow pasta are flavored with turmeric, the orange with carrot powder. A rainbow of beauty and natural delectable flavor.<br /><br />The pasta are shaped like little cresent moons with a rough exterior to absorb whatever sauce or oil you might use. Because this pasta is so very flavorful, I enjoy it best with a good quality olive oil and some Celtic sea salt. Sometimes I add some tarragon or fresh rosemary, usually not, so I can just savor the pasta alone. It also is excellent with olive oil and fresh grated Parmesan chees on top.<br /><br />With only 1g of fat, 10g of sodium, this can also be a h

In [25]:
str_to_search = "Which brand of chocolate do customers prefer?"

# Fetch the three closest reviews, then print each hit's rank, score and text.
strings, relatednesses = find_kNN(str_to_search, df, top_n=3)
for i, (string, relatedness) in enumerate(zip(strings, relatednesses), start=1):
    print(
        f"Result {i}:",
        f"Relatedness: {relatedness:.3f}",
        f"Content: {string}\n",
        '-'*50,
        sep="\n",
    )


Result 1:
Relatedness: 0.595
Content: Title: Dark chocolate cocoa; Content: This seems to be the best brand that I have tried.  Others have been very weak in flavor, but not htis one.  I would truly recommend this for Moms who have small children and need the hot chocolate after playing in the snow.  Grab the marshmellows and enjoy!  Grammie Smith (6 grandkids)

--------------------------------------------------
Result 2:
Relatedness: 0.577
Content: Title: There is none greater.; Content: The best chocolate in the world, in this critic's humble opinion, is made in the United States. And the best chocolate in the United States is made in California, by the Ghirardelli Chocolate Company. It is surely the best chocolate that one can find in so many stores, malls, etc. throughout the United States. Their actual stores are rare, but Kroger, Wal-Mart, Martin's (formerly Ukrops) and a long list of other grocery stores and many malls stock their products.<br /><br />I have never encountered mi

In [26]:
str_to_search = "What do customers say about gluten-free products?"

# Fetch the three closest reviews, then print each hit's rank, score and text.
strings, relatednesses = find_kNN(str_to_search, df, top_n=3)
for i, (string, relatedness) in enumerate(zip(strings, relatednesses), start=1):
    print(
        f"Result {i}:",
        f"Relatedness: {relatedness:.3f}",
        f"Content: {string}\n",
        '-'*50,
        sep="\n",
    )


Result 1:
Relatedness: 0.610
Content: Title: Gluten free isn't  always taste free.; Content: Due to health issues, my husband is newly required to eat a gluten free diet. He acted as if he was on death row. Finding this product has help immensely. I can make him pancakes with fresh fruit and he doesn't feel he is being unfairly punished. Also, my neighbor was faced with a similar circumstance. I shared one of the boxes with her and she is now a fan! <a href="http://www.amazon.com/gp/product/B004391DK0">Bisquick Pancake and Baking Mix, Gluten-Free, 16-Ounce Boxes (Pack of 3)</a>

--------------------------------------------------
Result 2:
Relatedness: 0.604
Content: Title: Great product; Content: I have to eat gluten free, but my family doesn't.  This is the only pasta so far that my kids can't see a real difference.  They like it a lot.  This is one I fix when I don't have time or energy to fix separate things for them and myself.  Because it tastes yummy for all of us.

-------------

In [27]:
str_to_search = "How do customers feel about organic food?"

# Fetch the three closest reviews, then print each hit's rank, score and text.
strings, relatednesses = find_kNN(str_to_search, df, top_n=3)
for i, (string, relatedness) in enumerate(zip(strings, relatednesses), start=1):
    print(
        f"Result {i}:",
        f"Relatedness: {relatedness:.3f}",
        f"Content: {string}\n",
        '-'*50,
        sep="\n",
    )


Result 1:
Relatedness: 0.513
Content: Title: I agree with the previous reviewer, the name is deceptive.; Content: When one purchases a product that is called Newman's Organics, one would expect organic food.  Not just the vegetables, which are much less expensive to be organic than the meat or chicken.  I was very, very disappointed to find out that the chicken is not organic.  For a couple of dollars more, just buy real organic dog food. This dog food isn't exactly cheap, so you expect that they will provide what they are advertising.  This whole "greenwashing" thing needs to be regulated better.  If someone uses the word organic in their description, it better be organic.  Otherwise the consumer is being deceived which is unfortunate.  Newmans has a great reputation, I buy a lot of their products, but this was intentionally deceptive and it makes me reconsider buying anything from them in the future.<br /><br />On the positive side, my dog loved it, she would lick the bowl around the

In [28]:
str_to_search = "What are the best snacks for children?"

# Fetch the three closest reviews, then print each hit's rank, score and text.
strings, relatednesses = find_kNN(str_to_search, df, top_n=3)
for i, (string, relatedness) in enumerate(zip(strings, relatednesses), start=1):
    print(
        f"Result {i}:",
        f"Relatedness: {relatedness:.3f}",
        f"Content: {string}\n",
        '-'*50,
        sep="\n",
    )


Result 1:
Relatedness: 0.599
Content: Title: amazing snack! My KIDS love them, too!; Content: Feel like a protein boost?  Got the nibbles?  Don't want to succumb to potato chips?  Here's a great alternative!  Just don't let your toddler try to open the package or your peas will be all over the floor making  for an expensive snack for the dog and a grumpy mommy.

--------------------------------------------------
Result 2:
Relatedness: 0.590
Content: Title: Great Healthy Snack; Content: This is a very good snack that I feel great about offering my 10 month old.  She can easily  pick up the small pieces and definitely enjoys the apple cinnamon taste.  My 5 year old loves them as well!  I would definitely recommend these to anyone with small (or larger) kids.  :)

--------------------------------------------------
Result 3:
Relatedness: 0.584
Content: Title: Great snack; Content: My daughter loves these snacks and so does everyone else. I try to feed my daughter organic stuff and I'm so g