# Introduction to Embedding Vectors and the RAG Pattern

## Embeddings Basics

In [None]:
# Install the OpenAI client library and the Levenshtein package
!pip install -q openai levenshtein > /dev/null

In [None]:
from openai import OpenAI
from google.colab import userdata

# Build the OpenAI client used by every cell below. The API key is looked up
# under the name 'openaiKey' via Colab's `userdata` helper.
client = OpenAI(api_key=userdata.get('openaiKey'))

# Define a helper function to calculate embeddings
def get_embedding_vec(input, model="text-embedding-ada-002"):
    """Return the embedding vector for a given input.

    Args:
        input: The text to embed. It is passed unchanged to the embeddings
            endpoint. (The name shadows the ``input`` builtin, but it is kept
            so that existing keyword callers continue to work.)
        model: Name of the embedding model to request. Defaults to the model
            this notebook was originally written against.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` list.
    """
    response = client.embeddings.create(input=input, model=model)
    # Only the first result is returned, so this helper is meant to be called
    # with a single piece of text at a time.
    return response.data[0].embedding

In [None]:
import numpy as np

# Calculate the embedding vector for a sample word and show its first
# ten components.
vec = get_embedding_vec("King")
print(vec[:10])

# Calculate the magnitude of the vector. It should be 1, as
# embedding vectors from OpenAI are always normalized.
magnitude = np.linalg.norm(vec)
magnitude

In [None]:
import numpy as np
from Levenshtein import distance

# Sentence pairs to compare. Each entry is (label, first sentence, second
# sentence). For every pair we print the embedding similarity (dot product of
# the two embedding vectors) next to the purely character-based Levenshtein
# distance, to show that embeddings capture meaning rather than spelling.
cases = [
    # Two sentences with the same meaning, but in different languages
    ("a", "The king is dead, long live the king", "Der König ist tot, lang lebe der König"),
    ("b", "The king who is alive is better than a dead one", "Ein König der lebt, ist besser als ein toter König"),
    # Two sentences with similar words, but different meaning
    ("c", "The king is dead, long live the king", "The king is alive, that's better than a dead king"),
    # Two sentences with different meaning AND in different languages
    ("d", "The king is dead, long live the king", "Ein König der lebt, ist besser als ein toter König"),
]

for label, s1, s2 in cases:
    score = np.dot(get_embedding_vec(s1), get_embedding_vec(s2))
    print(f"{label}) Similarity: {score: .4f}, Levenshtein distance: {distance(s1, s2)}")


## Vector Search

In [None]:
# Short descriptions of (fictional) cities, one string per city in the form
# "<name>: <description>". A travel agency could use such texts to find
# suitable vacation spots for a customer.
cities = [
    "Emeraldine: A bustling metropolis surrounded by lush forests, known for its towering skyscrapers and vibrant night markets.",
    "Solara: A small, sun-drenched coastal town famous for its golden beaches, seafood cuisine, and laid-back lifestyle.",
    "Nebulae: A futuristic city with floating buildings and neon lights, renowned for its advanced technology and AI-driven services.",
    "Auroria: A serene mountain city, hidden in misty peaks, with ancient monasteries and breathtaking hiking trails.",
    "Thalassa: An island city with a rich maritime history, surrounded by crystal clear waters, perfect for scuba diving and sailing.",
    "Cinderpeak: A city built around an active volcano, known for its unique architecture, geothermal energy, and vibrant arts scene.",
    "Vespera: A city that never sleeps, with a bustling nightlife, cultural festivals, and a diverse culinary scene, under a starlit sky.",
    "Windmere: A small town on the plains, famous for its windmills, open fields, and a tight-knit community with traditional values.",
    "Polaria: An isolated city in the far north, known for its ice castles, aurora borealis views, and resilient, warm-hearted residents.",
    "Glimmerdale: A city in a valley, illuminated by bioluminescent plants, known for its sustainable living and harmony with nature.",
]


In [None]:
# Compute one embedding vector per city description and keep each vector
# paired with its text as a (description, vector) tuple.
# NOTE: a real-world application would persist these vectors in a vector
# database (Pinecone, Qdrant, Azure Search, ...) instead of a Python list.
embeddings = [(city, get_embedding_vec(city)) for city in cities]

In [None]:
# Enter the search text of the customer who is looking for a vacation spot.
# Uncomment one of the alternatives below to try a different request.
#query = "For vacation, I want to go hiking in the mountains. I want to really feel nature."
#query = "In my vacation, I want to party, party, party!"
#query = "I want to travel to an underwater city"
# NOTE(review): the active query is unrelated to travel; presumably it is
# meant to demonstrate how the assistant reacts to off-topic input.
query = "What is the name of Bart Simpson's sister?"

# Calculate the embedding vector of the search text
query_embedding = get_embedding_vec(query)

In [None]:
import numpy as np

# Score every city against the search text. The similarity is the dot product
# of the city's embedding vector and the query's embedding vector. Each entry
# is a (description, embedding, similarity) tuple.
sorted_result = [
    (city, embedding, np.dot(embedding, query_embedding))
    for city, embedding in embeddings
]

# Sort descending by similarity so that the top elements are probably more
# relevant than the last ones.
sorted_result.sort(key=lambda entry: entry[2], reverse=True)

# The loop variable is deliberately NOT called `tuple`: in a notebook that
# name would stay bound after this cell and shadow the builtin `tuple` type
# for every cell executed afterwards.
for entry in sorted_result:
    print(entry[2], entry[0])


In [None]:
import matplotlib.pyplot as plt
import math

# Axis labels: the city name is the part of each description before the
# first colon. The labels are stored under their own name so that the
# `cities` list of full descriptions is not overwritten; re-running the
# embedding cell after this one would otherwise embed only the bare names.
city_names = [item[0].split(":", 1)[0] for item in sorted_result]
similarities = [item[2] for item in sorted_result]

# Visualize the sorted result in a bar chart
plt.bar(city_names, similarities, color='lime')
plt.ylabel('Similarity')
# Clamp the y-axis to the observed range (rounded outwards to two decimals)
# so that the small differences between the bars remain visible.
plt.ylim(
    math.floor(min(similarities) * 100) / 100,
    math.ceil(max(similarities) * 100) / 100)
plt.xticks(rotation=90)
plt.show()

## Generate Response

In [None]:
from string import Template

# System prompt template. `$options` is replaced with the city descriptions
# the model is allowed to draw on when answering.
t = Template("""
You are a helpful assistant in a travel agency. Customers are describing
what they want to do in their vacation. Make suggestions based on the
city descriptions provided below. ONLY use the provided city descriptions.
Do NOT use other information sources.

If you cannot generate a meaningful answer based on the given city description,
say "Sorry, I cannot help". If the user's input is not related to finding
a travel location, say "Sorry, I can only help with vacation locations".

===========
$options
===========
""")

# Fill the template with the descriptions of the three best-matching cities
# (the first three entries of the similarity-sorted result), separated by
# blank lines, and show the final prompt.
system_prompt = t.substitute(
    options="\n\n".join(entry[0] for entry in sorted_result[:3])
)
print(system_prompt)

# Send the system prompt (instructions plus selected city descriptions) and
# the customer's original query to the chat model, then display the text of
# the first returned choice.
chat_completion = client.chat.completions.create(
    model="gpt-4-1106-preview",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ],
)
chat_completion.choices[0].message.content