In [1]:
import pandas as pd
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import os
from pinecone import ServerlessSpec
import time
import uuid
import torch
from groq import Groq
from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv

load_dotenv(dotenv_path="../.env")
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") 
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)


openrouter_client = OpenAI(
  api_key = OPENROUTER_API_KEY,
  base_url = "https://openrouter.ai/api/v1",
)

groq_client = Groq(
  api_key=GROQ_API_KEY
)

### Cleaning

In [31]:
df = pd.read_csv("../data/unclean_2.csv", delimiter=';')

In [32]:
df.head()

Unnamed: 0,Name,Country,Ingredients,Instructions,Meal Type,Spice Level,Cooking Time (minutes),Vegetarian,Main Cooking Method,Serving Temperature,How to Make
0,Doro Wat,Ethiopia,"Chicken, onions, garlic, ginger, berbere spice...","In a pot, sauté onions, garlic, and ginger in ...",Main,High,90,No,Stewing,Hot,"Start by preparing niter kibbeh, a spiced clar..."
1,Injera,Ethiopia,"Teff flour, water, salt",Mix teff flour with water to create a batter. ...,Side,,10,Yes,Griddle,Room,Combine teff flour and water to form a batter....
2,Sushi,Japan,"Rice, nori, raw fish, vegetables",Prepare sushi rice and let cool. Place a sheet...,Main,,30,No,No-cook,Cold,"Rinse and cook sushi rice, season with vinegar..."
3,Tacos,Mexico,"Tortillas, beef, lettuce, tomato, cheese, salsa","Cook beef with spices. Warm tortillas, fill wi...",Main,Medium,20,No,Grilling,Hot,"Brown the ground beef with spices, warm the to..."
4,Paella,Spain,"Rice, saffron, chicken, seafood, vegetables","Cook chicken in a pan, add vegetables, then ri...",Main,Medium,45,No,Sautéing,Hot,"Sauté chicken until browned, add vegetables, t..."


In [33]:
df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]


In [34]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [35]:
df["how_to_make"].isna().value_counts()

how_to_make
False    235
Name: count, dtype: int64

In [36]:
df.shape

(235, 12)

In [37]:
df.drop_duplicates(subset=['name'], inplace=True)

In [41]:
df.head()

Unnamed: 0,name,country,ingredients,instructions,meal_type,spice_level,cooking_time_(minutes),vegetarian,main_cooking_method,serving_temperature,how_to_make,id
0,Doro Wat,Ethiopia,"Chicken, onions, garlic, ginger, berbere spice...","In a pot, sauté onions, garlic, and ginger in ...",Main,High,90,No,Stewing,Hot,"Start by preparing niter kibbeh, a spiced clar...",f2ff4980-fda2-4bfe-8a05-aca63be38345
1,Injera,Ethiopia,"Teff flour, water, salt",Mix teff flour with water to create a batter. ...,Side,,10,Yes,Griddle,Room,Combine teff flour and water to form a batter....,78c8ff6c-af26-4012-a637-af925fda17f7
2,Sushi,Japan,"Rice, nori, raw fish, vegetables",Prepare sushi rice and let cool. Place a sheet...,Main,,30,No,No-cook,Cold,"Rinse and cook sushi rice, season with vinegar...",3437bb23-0c42-4548-9165-f31f586b968f
3,Tacos,Mexico,"Tortillas, beef, lettuce, tomato, cheese, salsa","Cook beef with spices. Warm tortillas, fill wi...",Main,Medium,20,No,Grilling,Hot,"Brown the ground beef with spices, warm the to...",ec101289-7120-4818-99ec-40345acf99b8
4,Paella,Spain,"Rice, saffron, chicken, seafood, vegetables","Cook chicken in a pan, add vegetables, then ri...",Main,Medium,45,No,Sautéing,Hot,"Sauté chicken until browned, add vegetables, t...",8395a5cd-7bd2-45f7-a19d-c4a232d02fa4


In [42]:
df.shape

(166, 12)

In [40]:
df.to_csv('../data/clean.csv', index=False)

In [11]:
data = df.to_dict('records')
result = []
for row in data:
    country = row["Country"]

food = {
"Name": row["Name"],
"Ingredients": row["Ingredients"],
"Instructions": row["Instructions"],
"Meal Type": row["Meal Type"],
"Spice Level": row["Spice Level"],
"Cooking Time (minutes)": row["Cooking Time (minutes)"],
"Vegetarian": row["Vegetarian"],
"Main Cooking Method": row["Main Cooking Method"],
"Serving Temperature": row["Serving Temperature"],
"How to Make": row["How to Make"]
}

country_index = next((i for i, d in enumerate(result) if d["Country"] == country), None)
if country_index is not None:
    result[country_index]["food"].append(food)
else:
    result.append({"Country": country, "food": [food]})

print(result)

[{'Country': 'France', 'food': [{'Name': 'Creme Brulee', 'Ingredients': 'Cream, sugar, vanilla, egg yolks', 'Instructions': 'Bake custard, chill, top with sugar, and caramelize with a torch.', 'Meal Type': 'Dessert', 'Spice Level': nan, 'Cooking Time (minutes)': 45, 'Vegetarian': 'Yes', 'Main Cooking Method': 'Baking', 'Serving Temperature': 'Cold', 'How to Make': 'Whisk cream, sugar, vanilla, and egg yolks, bake in a water bath, chill, sprinkle with sugar, caramelize with a kitchen torch.'}]}]


### Embedding

In [4]:
df = pd.read_csv('/home/sam/Documents/projects/practice/courses/ChefAssistAI/data/clean.csv')


In [5]:
df.head()


Unnamed: 0,name,country,ingredients,instructions,meal_type,spice_level,cooking_time_(minutes),vegetarian,main_cooking_method,serving_temperature,how_to_make,id
0,Doro Wat,Ethiopia,"Chicken, onions, garlic, ginger, berbere spice...","In a pot, sauté onions, garlic, and ginger in ...",Main,High,90,No,Stewing,Hot,"Start by preparing niter kibbeh, a spiced clar...",f2ff4980-fda2-4bfe-8a05-aca63be38345
1,Injera,Ethiopia,"Teff flour, water, salt",Mix teff flour with water to create a batter. ...,Side,,10,Yes,Griddle,Room,Combine teff flour and water to form a batter....,78c8ff6c-af26-4012-a637-af925fda17f7
2,Sushi,Japan,"Rice, nori, raw fish, vegetables",Prepare sushi rice and let cool. Place a sheet...,Main,,30,No,No-cook,Cold,"Rinse and cook sushi rice, season with vinegar...",3437bb23-0c42-4548-9165-f31f586b968f
3,Tacos,Mexico,"Tortillas, beef, lettuce, tomato, cheese, salsa","Cook beef with spices. Warm tortillas, fill wi...",Main,Medium,20,No,Grilling,Hot,"Brown the ground beef with spices, warm the to...",ec101289-7120-4818-99ec-40345acf99b8
4,Paella,Spain,"Rice, saffron, chicken, seafood, vegetables","Cook chicken in a pan, add vegetables, then ri...",Main,Medium,45,No,Sautéing,Hot,"Sauté chicken until browned, add vegetables, t...",8395a5cd-7bd2-45f7-a19d-c4a232d02fa4


In [6]:
# configure client
pc = Pinecone(api_key=PINECONE_API_KEY)

In [7]:

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [11]:

# index_name = "langchain-retrieval-agent"
# index_name = 'semantic-search'
index_name = 'semantic-search-4'

existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384, 
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [12]:
from tqdm.auto import tqdm
import json

batch_size = 10

for i in tqdm(range(0, len(df), batch_size)):
    # get end of batch
    i_end = min(len(df), i+batch_size)
    batch = df.iloc[i:i_end]
    
    # Combine all columns into a single text for each row
    documents = batch.apply(lambda row: ' '.join(row.astype(str).replace('nan', '')), axis=1).tolist()
    
    # Create document embeddings
    embeds = model.encode(documents)
    
    # Get IDs
    ids = batch['id'].tolist()
    # batch = batch.drop('id', axis=1)

    metadatas = batch.where(batch.notnull(), None).to_dict('records')


    for i, doc in enumerate(documents):
        metadatas[i]['text'] = doc
    
    # Create metadata, replacing NaN with None
    
    # Ensure all values are JSON serializable
    metadatas = [{k: (v if v is not None else "") for k, v in m.items()} for m in metadatas]

    vectors = [
        {
            'id': id,
            'values': embed.tolist(),
            'metadata': metadata
        }
        for id, embed, metadata in zip(ids, embeds, metadatas)
    ]
    
    # Add everything to pinecone
    # index.upsert(vectors=zip(ids, embeds, metadatas))
    index.upsert(vectors=vectors)

100%|██████████| 17/17 [01:11<00:00,  4.22s/it]


In [13]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 176}},
 'total_vector_count': 176}

In [114]:
query = "dish from anywhere that doesn't have spice and takes the longest"

# create the query vector
xq = model.encode(query).tolist()

# now query
xc = index.query(vector=xq, top_k=5, include_metadata=True)
xc

{'matches': [{'id': 'd846a7bf-4cf2-4880-97a0-c0d38066db24',
              'metadata': {'Cooking Time (minutes)': 30.0,
                           'Country': 'Malaysia',
                           'How to Make': 'Knead flour and water into a dough, '
                                          'flatten and fold with butter, cook '
                                          'on a hot skillet, serve with curry.',
                           'Ingredients': 'Flour, water, butter, curry',
                           'Instructions': 'Knead dough, flatten and cook on a '
                                           'skillet, serve with curry.',
                           'Main Cooking Method': 'Griddle',
                           'Meal Type': 'Side',
                           'Name': 'Roti Canai',
                           'Serving Temperature': 'Hot',
                           'Spice Level': '',
                           'Vegetarian': 'Yes'},
              'score': 0.431228131,
              'v

In [83]:
from langchain.vectorstores import Pinecone
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize the HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

# Initialize the Pinecone vector store
text_field = "Name"  # the metadata field that contains our text
vectorstore = Pinecone(
    index,
    embeddings.embed_query,
    text_field
)

# Now you can use the vectorstore with LangChain
query = "dish from anywhere that doesn't have spice and takes the longest"
results = vectorstore.similarity_search_with_score(query, k=5)

for doc, score in results:
    print(f"Score: {score}")
    print(f"Name: {doc.metadata.get('Name', 'Unknown')}")
    print(f"Instructions: {doc.metadata.get('Instructions', 'No instructions available')}")
    print("---")

Score: 0.431514174
Name: Unknown
Instructions: Knead dough, flatten and cook on a skillet, serve with curry.
---
Score: 0.427877
Name: Unknown
Instructions: Marinate chicken in garlic, lemon, spices, and yogurt, grill.
---
Score: 0.424424708
Name: Unknown
Instructions: Cook chicken and sausage with vegetables and spices, add rice and broth, simmer until tender.
---
Score: 0.421465099
Name: Unknown
Instructions: Simmer kimchi with pork and tofu in broth until flavors meld.
---
Score: 0.42067948
Name: Unknown
Instructions: Blend chickpeas with herbs, spices, onion, and garlic. Form into balls and fry until golden brown.
---


In [85]:
query = "dish from anywhere that doesn't have spice and takes the longest"

# create the query vector
xq = model.encode(query).tolist()

# now query
xc = index.query(vector=xq, top_k=5, include_metadata=True)

# Print results
for result in xc['matches']:
    print(f"Score: {round(result['score'], 4)}")
    print(f"Name: {result['metadata'].get('Name', 'Unknown')}")
    print(f"Instructions: {result['metadata'].get('Instructions', 'No instructions available')}")
    print("---")

Score: 0.4315
Name: Roti Canai
Instructions: Knead dough, flatten and cook on a skillet, serve with curry.
---
Score: 0.4279
Name: Shish Taouk
Instructions: Marinate chicken in garlic, lemon, spices, and yogurt, grill.
---
Score: 0.4244
Name: Jambalaya
Instructions: Cook chicken and sausage with vegetables and spices, add rice and broth, simmer until tender.
---
Score: 0.4215
Name: Kimchi Jjigae
Instructions: Simmer kimchi with pork and tofu in broth until flavors meld.
---
Score: 0.4207
Name: Falafel
Instructions: Blend chickpeas with herbs, spices, onion, and garlic. Form into balls and fry until golden brown.
---


In [74]:
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['Instructions']}")

0.43: Knead dough, flatten and cook on a skillet, serve with curry.
0.43: Marinate chicken in garlic, lemon, spices, and yogurt, grill.
0.42: Cook chicken and sausage with vegetables and spices, add rice and broth, simmer until tender.
0.42: Simmer kimchi with pork and tofu in broth until flavors meld.
0.42: Blend chickpeas with herbs, spices, onion, and garlic. Form into balls and fry until golden brown.


In [78]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# chat completion llm
llm = ChatOpenAI(
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url='https://openrouter.ai/api/v1',
    model="meta-llama/llama-3.1-8b-instruct:free",
    temperature=0.0
)
# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [16]:
from langchain.llms import OpenAI
import requests

# First, let's define a function to query Pinecone
def query_pinecone(query, top_k=15):
    xq = model.encode(query).tolist()
    xc = index.query(vector=xq, top_k=top_k, include_metadata=True)
    results = []
    for match in xc.matches:
        result = {
            "id": match.id,
            "score": match.score,
            "metadata": match.metadata
        }
        print(result["score"])
        if result["score"] > 0.2:
            results.append(result)
    # print(results)
    return results


# Function to format dish info
def format_dish_info(dish):
    return "\n".join([f"{key}: {value}" for key, value in dish['metadata'].items() if value])

def query_openrouter(prompt):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}"
    }
    
    data = {
        "model": "meta-llama/llama-3.1-8b-instruct:free",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 150,
        "temperature": 0.0
    }
    
    response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=data)
    # print(response.json())
    return response.json()['choices'][0]['message']['content']

def llm(prompt, model_choice):
    start_time = time.time()
    if model_choice.startswith('openrouter'):  
        response = openrouter_client.chat.completions.create(
            messages=[{"role": "user","content": prompt}],
            model="meta-llama/llama-3.1-8b-instruct:free",
            )
        answer = response.choices[0].message.content
        tokens = {
            'prompt_tokens': response.usage.prompt_tokens,
            'completion_tokens': response.usage.completion_tokens,
            'total_tokens': response.usage.total_tokens
        }
    elif model_choice.startswith('groq'):
        response = groq_client.chat.completions.create(
            model="llama-3.1-70b-versatile",
            messages=[{"role": "user", "content": prompt}]
        )
        answer = response.choices[0].message.content
        tokens = {
            'prompt_tokens': response.usage.prompt_tokens,
            'completion_tokens': response.usage.completion_tokens,
            'total_tokens': response.usage.total_tokens
        }
    else:
        raise ValueError(f"Unknown model choice: {model_choice}")
    
    end_time = time.time()
    response_time = end_time - start_time
    
    return answer, tokens, response_time


def qa_function(question):
    # Query Pinecone
    results = query_pinecone(question)
    # print(results)
    if not results:
        return "I'm sorry, I couldn't find any relevant information to answer your question."
    
    # Format the dish information
    all_dish_info = "\n\n".join([format_dish_info(dish) for dish in results])

    print(all_dish_info)
    
    # Create the prompt
    prompt = f"""
    Based on the following information about a dish, please answer the question: {question}

    Dish information:
    {all_dish_info}

    Answer:
    """
    
    # Use OpenRouter to generate an answer
    response = llm(prompt, "groq")
    
    return response

# Test the QA function
question = "What's the dish that takes the longest from ethiopia"
answer = qa_function(question)
print(f"Question: {question}")
print(f"Answer: {answer}")

0.572524846
0.520896554
0.51650095
0.478666425
0.478319049
0.465610057
0.465157926
0.458971769
0.44278127
0.433303028
0.430688202
0.421155035
0.418404967
0.409491956
0.406508654
cooking_time_(minutes): 20.0
country: Ethiopia
how_to_make: Sauté green beans, carrots, onions, and tomatoes with garlic and spices until vegetables are tender.
ingredients: Green beans, carrots, onions, tomatoes, garlic
instructions: Sauté vegetables with spices until tender, serve as a side dish.
main_cooking_method: Sautéing
meal_type: Side
name: Fasolia
serving_temperature: Hot
vegetarian: Yes

cooking_time_(minutes): 30.0
country: Ethiopia
how_to_make: Chop tripe, liver, and beef finely, sauté with onions, jalapenos, and spices until cooked through.
ingredients: Tripe, liver, minced beef, onions, jalapenos
instructions: Cook meats with spices and vegetables, serve hot.
main_cooking_method: Sautéing
meal_type: Main
name: Dulet
serving_temperature: Hot
spice_level: High
vegetarian: No

cooking_time_(minutes)

In [135]:
question = "What's is 2+2"
answer = qa_function(question)
print(f"Question: {question}")
print(f"Answer: {answer}")

0.0771762729
0.0683583245
0.0574751757
0.047796838
0.0447519273
Question: What's is 2+2
Answer: I'm sorry, I couldn't find any relevant information to answer your question.
