In [11]:
from openai import OpenAI
import pandas as pd
from dotenv import load_dotenv
import numpy as np
import concurrent.futures
from scipy.spatial.distance import cosine, euclidean, cityblock

In [2]:
load_dotenv()
EMBEDDING_MODEL = "text-embedding-3-small"

client = OpenAI()

In [4]:
df = pd.read_csv("Reviews_1k.csv")

In [5]:
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.head(2)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined
0,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
1,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [6]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [138]:
# An example of how sqrt chunking works
nrow = 167
sqrt_chunk_n = int(np.sqrt(nrow)) 
chunk_size = int(nrow/sqrt_chunk_n)
lst = [list(range(i,min(i+chunk_size,nrow))) for i in range(0,nrow,chunk_size)]
print("sqrt_chunk_n: ", sqrt_chunk_n)
print("chunk_size: ", chunk_size)
lst

sqrt_chunk_n:  12
chunk_size:  13


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
 [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38],
 [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51],
 [52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64],
 [65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77],
 [78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90],
 [91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103],
 [104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116],
 [117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129],
 [130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142],
 [143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155],
 [156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166]]

In [139]:
# Chunking the documents
nrow = df.shape[0]
sqrt_chunk_n = int(np.sqrt(nrow)) 
chunk_size = int(nrow/sqrt_chunk_n)
[list(df.combined[i:i+chunk_size]) for i in range(0,nrow,chunk_size)]
lst = [list(df.combined[i:min(i+chunk_size,nrow)]) for i in range(0,nrow,chunk_size)]
print("sqrt_chunk_n: ", sqrt_chunk_n)
print("chunk_size: ", chunk_size)

sqrt_chunk_n:  31
chunk_size:  32


In [141]:
# define a function that process a list of strs
def process_chunk(chunk):
    embeddings = []
    for text in chunk:
        embedding = get_embedding(text)
        embeddings.append(embedding)
    return embeddings

In [144]:
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_chunk, chunk) for chunk in lst]
    embeddings = []
    for future in concurrent.futures.as_completed(futures):
        embeddings.extend(future.result())
df['embedding'] = embeddings
df.to_csv('parallel_embeddings.csv', index=False)

In [145]:
df.embedding[0]

[0.02088439092040062,
 -0.0002246303338324651,
 -0.0019172363681718707,
 -0.017448609694838524,
 0.014939199201762676,
 -0.021939750760793686,
 -0.013109909370541573,
 0.05281487852334976,
 0.034123290330171585,
 -0.0180349200963974,
 0.07636111974716187,
 -0.04174533113837242,
 -0.01960623450577259,
 -0.002708755899220705,
 -0.011679311282932758,
 0.04312902316451073,
 -0.02586803399026394,
 0.01561931986361742,
 -0.04538045823574066,
 0.0031162418890744448,
 0.05150154232978821,
 -0.0053061130456626415,
 -0.04831201210618019,
 0.03801639378070831,
 -0.0098089799284935,
 -0.005185919348150492,
 -0.022256359457969666,
 0.02943280339241028,
 0.041252829134464264,
 -0.0318014994263649,
 -0.030558519065380096,
 -0.01693265698850155,
 -0.0009490906377322972,
 -0.007539957296103239,
 -0.024320172145962715,
 0.003558906726539135,
 0.025117555633187294,
 -0.04296485707163811,
 0.048687249422073364,
 -0.03560079261660576,
 -0.03295066952705383,
 0.03330245614051819,
 0.03377150371670723,
 0.01

In [None]:
df['ada_embedding'] = df.combined.apply(lambda x: get_embedding(x))
df.to_csv('embedded_1k_reviews.csv', index=False)
#df = pd.read_csv('embedded_1k_reviews.csv')
#df['ada_embedding'] = df.ada_embedding.apply(eval).apply(np.array)

In [17]:
df.ada_embedding.head(3)

0    [0.02088439092040062, -0.0002246303338324651, ...
1    [-0.004459121264517307, 0.0007839715690352023,...
2    [0.019770847633481026, -0.03219044208526611, -...
Name: ada_embedding, dtype: object

In [None]:
EMBEDDING_MODEL = "text-embedding-3-small"

# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    strings_and_relatednesses = [
        (row["combined"], relatedness_fn(query_embedding, row["ada_embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

In [33]:
str_to_search = "what flover of pizza do people most like?"

strings, relatednesses = strings_ranked_by_relatedness(str_to_search, df, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    display(string)

relatedness=0.361


'Title: Music to My Palate; Content: We like Plockys<br />They like us.<br />Plockys Plockys.<br />Mean no fuss.<br />Taste so good.<br />And healthy too.<br />Plockys Plockys.<br />Grab a bag or two.<br />Thank you Plockys!!'

relatedness=0.359


'Title: Best sauce around; Content: I was browsing around on Amazon looking for gifts and was very pleased to see my favorite sauce on here. I will just order and have shipped to my friends. Habanero is my favorite, I love the heat and the tangy spiciness. I also love the pineapple. You will not go wrong ordering this up.'

relatedness=0.353


'Title: Looking for a different flavor?; Content: I order these olives a lot. They are un-like any other olives out there. The subtle flavors of the brine go great with cheeses. One of my favorite appetizer treats for guests.'

relatedness=0.327


"Title: Best ever latice tart; Content: I ordered two of these and two of raspberry latice tarts directly from FantasiCakes website for a dinner party I was hosting. It arrived fresh and intact. Very good size. I froze half for later use. I am a pastry lover and these were the best I've ever tasted. The pastry was soft, the jam was really good and the taste was great. They were gone in no time. My guests were really impressed."

relatedness=0.324


"Title: TART!; Content: The crust on these tarts are perfect.  My husband loves these, but I'm not so crazy about them.  They are just too sour/tart for my taste.  I'll eat the crust and hubby takes my filling.  My kids think they're great, so maybe it's just me."