# Introduction to Embedding Vectors

## Embeddings Basics

In [None]:
# Install the OpenAI Python SDK plus tree-sitter and its Python grammar
!pip install openai
!pip install tree-sitter tree-sitter-python

In [None]:
from openai import OpenAI
from google.colab import userdata

# Create the OpenAI client. The API key is looked up at runtime through
# Colab's `userdata.get('openaiKey')`, keeping the key out of the notebook
# source. The resulting `client` is shared by the cells below.
client = OpenAI(
    api_key=userdata.get('openaiKey'),
)

# Define a helper function to calculate embeddings
def get_embedding_vec(input):
    """Embed `input` with OpenAI and return the resulting vector.

    Sends `input` to the "text-embedding-3-large" model through the
    module-level `client` and returns the `embedding` of the first item in
    the response's `data` list.
    """
    response = client.embeddings.create(
        model="text-embedding-3-large",
        input=input,
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
import numpy as np

# Calculate the embedding vector for a sample sentence and print its first
# ten components
vec = get_embedding_vec("The quick brown fox jumps over the lazy dog")
print(vec[:10])

# Calculate the magnitude (Euclidean norm) of the vector. It should be
# (approximately) 1, as embedding vectors from OpenAI are always normalized.
magnitude = np.linalg.norm(vec)
magnitude

In [None]:
sentence1 = "The weather is nice"
sentence2 = "What a wonderful sunny day at the beach"
sentence3 = "I am going to the gym today"

# Embed the three sentences, one request per sentence, in order
vec1, vec2, vec3 = [
    get_embedding_vec(sentence)
    for sentence in (sentence1, sentence2, sentence3)
]

# Dot product of sentence1 with each of the other two sentences; the larger
# value marks the sentence whose embedding is closer to sentence1
similarity1 = np.dot(vec1, vec2)
similarity2 = np.dot(vec1, vec3)

print(f"similarity1 = {similarity1}")
print(f"similarity2 = {similarity2}")
if not similarity1 > similarity2:
    print(f"The similarity between '{sentence1}' and '{sentence3}' is higher")
else:
    print(f"The similarity between '{sentence1}' and '{sentence2}' is higher")

## Tree Sitter

In [None]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

# Initialize parser with Python grammar: wrap the grammar provided by
# tree_sitter_python in a Language object and create a Parser for it.
# The module-level `parser` is the one extract_methods() uses.
PY_LANGUAGE = Language(tspython.language())
parser = Parser(PY_LANGUAGE)

def extract_methods(source_code: str) -> list[dict]:
    """
    Collect every function definition found in a piece of Python source.

    The source is parsed with the module-level tree-sitter `parser` and the
    whole syntax tree is walked in pre-order, so definitions nested inside
    other nodes are reported as well.

    Returns a list with one dict per `function_definition` node:
      - "name": text of the node's `name` field
      - "body": full source text of the definition node
      - "start_line" / "end_line": node start/end row + 1
      - "start_byte" / "end_byte": the node's byte offsets in the
        UTF-8 encoded source
    """
    encoded = source_code.encode("utf8")
    syntax_tree = parser.parse(encoded)

    def text_of(node) -> str:
        # Node offsets index into the encoded bytes, so slice there and
        # decode the slice back to text
        return encoded[node.start_byte:node.end_byte].decode("utf8")

    found = []
    # Explicit stack instead of recursion. Children are pushed in reverse so
    # they are popped left-to-right, which yields the same pre-order visit.
    pending = [syntax_tree.root_node]
    while pending:
        current = pending.pop()
        if current.type == "function_definition":
            first_row = current.start_point[0]
            last_row = current.end_point[0]
            found.append({
                "name": text_of(current.child_by_field_name("name")),
                "body": text_of(current),
                "start_line": first_row + 1,
                "end_line": last_row + 1,
                "start_byte": current.start_byte,
                "end_byte": current.end_byte,
            })
        pending.extend(reversed(current.children))

    return found

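# Demo source code: a string holding several standalone functions (a few of
# them deliberately buggy) that the parser will split into individual methods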
sample_code = '''
def calculate_pizza_slices_per_person(num_people, num_pizzas, slices_per_pizza=8):
    """
    Calculate fair pizza distribution at a party.
    Handles the classic edge case: what if someone is vegetarian?
    """
    total_slices = num_pizzas * slices_per_pizza
    slices_per_person = total_slices // num_people
    leftover_slices = total_slices % num_people
    return slices_per_person, leftover_slices

def is_vampire_number(n):
    """
    Check if a number is a vampire number (1260 = 21 × 60, using digits 1,2,6,0).
    A vampire number has an even number of digits and can be factored into two
    "fangs" that together use exactly the original digits.
    """
    s = str(n)
    if len(s) % 2 != 0:
        return False

    half_len = len(s) // 2
    for i in range(10**(half_len-1), 10**half_len):
        if n % i == 0:
            j = n // i
            if len(str(j)) == half_len:
                if sorted(s) == sorted(str(i) + str(j)):
                    return True, i, j
    return False, None, None

def common_off_by_one_error_example(items):
    """
    CLASSIC BUG: This function demonstrates the infamous off-by-one error
    when iterating. It attempts to process pairs of consecutive items but
    will crash with IndexError!
    """
    result = []
    for i in range(len(items)):  # BUG: should be range(len(items) - 1)
        pair = (items[i], items[i + 1])  # IndexError on last iteration!
        result.append(pair)
    return result

def calculate_doomsday_for_year(year):
    """
    Conway's Doomsday algorithm: calculate what day of week any date falls on.
    The "doomsday" is a day that lands on the same weekday every year
    (e.g., 4/4, 6/6, 8/8, 10/10, 12/12, and last day of February).
    """
    # Anchor: Tuesday (2) for 2000-2099
    anchor = 2
    y = year % 100
    doomsday = (anchor + y + y // 4) % 7
    return ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"][doomsday]

def solve_fizzbuzz_with_walrus(n):
    """
    FizzBuzz using Python's walrus operator (:=) for extra style points.
    Demonstrates the := operator which assigns and returns value.
    """
    return [
        result if (result := (
            "FizzBuzz" if i % 15 == 0 else
            "Fizz" if i % 3 == 0 else
            "Buzz" if i % 5 == 0 else
            str(i)
        )) else result
        for i in range(1, n + 1)
    ]

def decode_leetspeak(text):
    """
    Decode 1337 speak (leetspeak) to normal text.
    Classic internet culture: 1=i, 3=e, 4=a, 7=t, 0=o, etc.
    """
    leet_map = {
        '0': 'o', '1': 'i', '3': 'e', '4': 'a',
        '5': 's', '7': 't', '8': 'b', '9': 'g'
    }
    return ''.join(leet_map.get(c, c) for c in text.lower())

def banana_index_confusion():
    """
    GOTCHA: String indexing confusion that catches beginners.
    In "banana", finding 'a' returns index 1 (first occurrence),
    but there are THREE 'a's at indices 1, 3, 5!
    """
    word = "banana"
    # This only finds FIRST occurrence
    first_a = word.index('a')  # returns 1
    # To find ALL occurrences, you need:
    all_a_indices = [i for i, c in enumerate(word) if c == 'a']
    return first_a, all_a_indices  # 1, [1, 3, 5]

def calculate_sleepy_programmer_coffee(hours_coding, coffee_strength="medium"):
    """
    Calculate required coffee intake for late-night coding sessions.
    Uses the "Ballmer Peak" coefficient (https://xkcd.com/323/).
    """
    base_cups = hours_coding / 2
    strength_multiplier = {"weak": 1.5, "medium": 1.0, "strong": 0.7}
    required_cups = base_cups * strength_multiplier.get(coffee_strength, 1.0)

    if required_cups > 10:
        return "⚠️  WARNING: Entering jitter zone. Please sleep instead."
    return f"{required_cups:.1f} cups needed"

def reverse_string_with_extended_slice():
    """
    The Pythonic way to reverse a string using extended slice [::-1].
    Many beginners try loops or reversed() when this is the idiomatic way.
    """
    text = "Hello, World!"
    # The "step" parameter of -1 reverses the sequence
    return text[::-1]  # "!dlroW ,olleH"

def mutable_default_argument_gotcha(item, my_list=[]):
    """
    CLASSIC PYTHON GOTCHA: Mutable default arguments are shared between calls!
    The empty list [] is created ONCE at function definition, not per call.
    This causes bizarre bugs where the list "remembers" previous calls.
    """
    my_list.append(item)
    return my_list
    # Correct fix: use my_list=None, then: if my_list is None: my_list = []
'''

# Extract all methods from the demo source; each entry is a dict holding the
# function's name, source text and position (see extract_methods)
methods = extract_methods(sample_code)

# Print a numbered overview: name, line range and the first 100 characters
# of each extracted function
print(f"Found {len(methods)} methods:\n")
for i, method in enumerate(methods, 1):
    print(f"{i}. {method['name']} (lines {method['start_line']}-{method['end_line']})")
    print(f"   First 100 chars: {method['body'][:100]}...")
    print()


## Vector Search

In [None]:
# Compute one embedding vector per extracted method body and keep every body
# paired with its vector as a (body, embedding) tuple.
# NOTE that in real-world applications, you would store the embeddings
# in a vector DB like Pinecone, Qdrant, Azure Search, etc.
method_bodies = [method['body'] for method in methods]
embeddings = [(body, get_embedding_vec(body)) for body in method_bodies]

In [None]:
# Enter the search text of the customer who is asking a code-related question.
# Alternative queries to try (uncomment one and comment out the active one):
# query = "Somewhere in this code we use the walrus operator. Describe where and how"
# query = "What are vampire numbers and how do I check for them?"
# query = "What are the names of Bart Simpson's sisters"
query = "How does the Dijkstra algorithm work"

# Calculate the embedding vector of the search text
query_embedding = get_embedding_vec(query)

In [None]:
import numpy as np

# Score every method against the query. The similarity is the dot product of
# the method's embedding and the query embedding. Each entry keeps the
# (body, embedding, similarity) layout that the following cells index into.
sorted_result = [
    (body, embedding, np.dot(embedding, query_embedding))
    for body, embedding in embeddings
]

# We sort the result descending based on the similarity so that the top
# elements are probably more relevant than the last ones.
sorted_result.sort(key=lambda entry: entry[2], reverse=True)

# Unpack each entry instead of binding it to a variable named `tuple`, which
# would shadow the builtin for the rest of the notebook session.
for body, _embedding, similarity in sorted_result:
    print(similarity, body[:100])


In [None]:
import matplotlib.pyplot as plt
import math

# Use the text before the first ":" of each method body as the bar label
# (for the demo functions this is the `def name(params)` signature).
# The labels get their own name so that the global `methods` list of dicts
# returned by extract_methods() is not overwritten; overwriting it would
# break re-running the embedding cell, which reads method['body'].
method_labels = [item[0].split(":", 1)[0] for item in sorted_result]
similarities = [item[2] for item in sorted_result]

# Visualize the sorted result in a bar chart
plt.bar(method_labels, similarities, color='lime')
plt.ylabel('Similarity')
# Limit the y-axis to the observed range, rounded outwards to two decimals,
# so that small differences between the scores remain visible
plt.ylim(
    math.floor(min(similarities) * 100) / 100,
    math.ceil(max(similarities) * 100) / 100)
plt.xticks(rotation=90)
plt.show()

## Generate Response

In [None]:
from string import Template
from IPython.display import display, Markdown

# System prompt template. `$options` is the placeholder that receives the
# method bodies the model is told to answer from.
t = Template("""
You are a helpful coding assistant. Users are having questions regarding
their codebase. Come up with an answer based on the method bodies below.
ONLY use the provided method bodies. Do NOT use other information sources.

If you cannot generate a meaningful answer based on the given methods,
say "Sorry, I cannot help". If the user's input is not related to answering
code-related questions, say "Sorry, I can only help with coding".

===========
$options
===========
""")

# Fill the placeholder with the bodies of the first three entries of
# sorted_result (the three highest similarities), separated by blank lines.
system_prompt = t.substitute(options = "\n\n".join([item[0] for item in sorted_result[:3]]))
# print(system_prompt)

# Send the request: the system message carries the instructions plus the
# selected method bodies, the user message carries the original query.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": query,
        }
    ],
    model="gpt-5.2",
)
# Render the content of the first returned choice as Markdown
display(Markdown(chat_completion.choices[0].message.content))