In [1]:
from pinecone import Pinecone, ServerlessSpec
import os
from openai import OpenAI
import pandas as pd
from time import time
import dotenv
dotenv.load_dotenv()

True

In [2]:
token= os.getenv("RUNPOD_TOKEN") 
open_ai_base_url = os.getenv("RUNPOD_EMBEDDING_URL") 
model_name= os.getenv("MODEL_NAME") 
pinecone_api_key = os.getenv("PINECONE_API_KEY") 

In [3]:
pc = Pinecone(api_key=pinecone_api_key)

client = OpenAI(
  api_key=token, 
  base_url=open_ai_base_url
)

# Try out embeddings

In [4]:
output = client.embeddings.create(input = ["helloo there"],model=model_name)
embedings = output.data[0].embedding
print(embedings)

[-0.055366937071084976, -0.056556638330221176, 0.08584163337945938, -0.06238313391804695, 0.01755574159324169, -0.01140131801366806, 0.052316416054964066, 0.05259096249938011, 0.028903676196932793, -0.022573847323656082, -0.013201124966144562, -0.04783215373754501, 0.029452770948410034, 0.031847428530454636, 0.056312598288059235, -0.008991407230496407, 0.012819809839129448, -0.05423824489116669, -0.10365666449069977, -0.019553832709789276, 0.02913246490061283, 0.05204186961054802, -0.02882741391658783, -0.034775927662849426, 0.001531932968646288, -0.00663869408890605, 0.019736863672733307, 0.03556906431913376, -0.008488072082400322, -0.07382258027791977, -0.0005252613918855786, -0.012873194180428982, 0.046581439673900604, -0.00035176306846551597, 0.05137075483798981, 0.0004532882012426853, 0.05960715934634209, -0.025029515847563744, -0.07791028171777725, -0.005586264654994011, 0.060491811484098434, -0.02512103132903576, -0.0016205887077376246, -0.016579575836658478, 0.02251283638179302

In [5]:
len(embedings)

384

# Wrangle dataset

In [6]:
df=pd.read_json('products/products.jsonl',lines=True)

In [7]:
df.head(2)

Unnamed: 0,name,category,description,ingredients,price,rating,image_path
0,Cappuccino,Coffee,A rich and creamy cappuccino made with freshly...,"[Espresso, Steamed Milk, Milk Foam]",4.5,4.7,cappuccino.jpg
1,Jumbo Savory Scone,Bakery,"Deliciously flaky and buttery, this jumbo savo...","[Flour, Butter, Cheese, Herbs, Baking Powder, ...",3.25,4.3,SavoryScone.webp


In [8]:
df['text'] =  df['name']+" : "+df['description'] + \
                " -- Ingredients: " + df['ingredients'].astype(str) + \
                " -- Price: " + df['price'].astype(str) + \
                " -- rating: " + df['rating'].astype(str) 

In [9]:
df['text'].head()

0    Cappuccino : A rich and creamy cappuccino made...
1    Jumbo Savory Scone : Deliciously flaky and but...
2    Latte : Smooth and creamy, our latte combines ...
3    Chocolate Chip Biscotti : Crunchy and delightf...
4    Espresso shot : A bold shot of rich espresso, ...
Name: text, dtype: object

In [10]:
texts = df['text'].tolist()

In [11]:
with open('products/Merry\'s_way_about_us.txt') as f:
    Merry_way_about_section = f.read()
    
Merry_way_about_section = "Coffee shop Merry's Way about section: " + Merry_way_about_section
texts.append(Merry_way_about_section)

In [12]:
with open('products/menu_items_text.txt') as f:
    menue_items_text = f.read()
    
menue_items_text = "Menu Items: " + menue_items_text
texts.append(menue_items_text)

# Generate Embeddings

In [13]:
output = client.embeddings.create(input = texts,model=model_name)

In [14]:
embeddings = output.data

# Push data to database

In [15]:
index_name = "coffeeshop"

pc.create_index(
    name=index_name,
    dimension=384, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [16]:
# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

vectors = []
for text, e in zip(texts, embeddings):
    entry_id = text.split(":")[0].strip()
    vectors.append({
        "id": entry_id,
        "values": e.embedding,
        "metadata": {'text': text}
    })
    
index.upsert(
    vectors=vectors,
    namespace="ns1"
)

{'upserted_count': 20}

# Get Closest documents

In [20]:
output = client.embeddings.create(input = ["Is Cappuccino lactose-free?"],model=model_name)
embedding = output.data[0].embedding

In [21]:
results = index.query(
    namespace="ns1",
    vector=embedding,
    top_k=3,
    include_values=False,
    include_metadata=True
)


In [22]:
results

{'matches': [{'id': 'Cappuccino',
              'metadata': {'text': 'Cappuccino : A rich and creamy cappuccino '
                                   'made with freshly brewed espresso, steamed '
                                   'milk, and a frothy milk cap. This '
                                   'delightful drink offers a perfect balance '
                                   'of bold coffee flavor and smooth milk, '
                                   'making it an ideal companion for relaxing '
                                   'mornings or lively conversations. -- '
                                   "Ingredients: ['Espresso', 'Steamed Milk', "
                                   "'Milk Foam'] -- Price: 4.5 -- rating: 4.7"},
              'score': 0.734829128,
              'values': []},
             {'id': 'Sugar Free Vanilla syrup',
              'metadata': {'text': 'Sugar Free Vanilla syrup : Enjoy the sweet '
                                   'flavor of vanilla without the 