# Embeddings

This script demonstrates how to visualise the semantic relationships between words using word embeddings and Principal Component Analysis (PCA). First, each word from the input list is converted into a high-dimensional numerical vector (an embedding) using a language model (here, a BERT model run through llama.cpp). These embeddings capture the word's meaning, so words that occur in similar contexts are represented by similar vectors. Because it's impossible to plot data with hundreds or thousands of dimensions directly, PCA is then used to compress this information into just two dimensions. The final scatter plot displays these 2D representations, illustrating how the model organises language: words with similar meanings (e.g., all the fruits or all the animals) are expected to appear clustered together.

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

from llama_cpp import Llama

In [None]:
# Directory holding the local GGUF model files (a relative path, so it is
# resolved against the notebook's current working directory).
MODEL_ROOT = Path("../llama-cpp-python/models")

# Fail fast with an explicit, descriptive error. A bare `assert` gives no
# message and is removed entirely when Python runs with -O.
if not MODEL_ROOT.is_dir():
    raise FileNotFoundError(f"Model directory not found: {MODEL_ROOT.resolve()}")

In [None]:
# GGUF weights used to produce the embeddings, located under MODEL_ROOT.
model_path = MODEL_ROOT / "text_gen/bert-base-uncased-Q8_0.gguf"

# Check for a regular file up front so a missing or misplaced model is
# reported here with its full path instead of failing inside the loader.
if not model_path.is_file():
    raise FileNotFoundError(f"Model file not found: {model_path.resolve()}")

In [None]:
# Load the model in embedding mode. The loader takes the path as a string;
# n_gpu_layers=-1 requests that all layers be offloaded to the GPU.
llm = Llama(
    model_path=str(model_path),
    embedding=True,
    n_gpu_layers=-1,
)

In [None]:
embeddings = llm.create_embedding("Hello, world!")

In [None]:
np.array(embeddings["data"][0]["embedding"]).shape

In [None]:
# Vocabulary to embed, grouped by theme. Insertion order of the groups (and of
# the words inside each group) defines the order of `words_and_phrases`.
_word_groups = {
    "Fruits": ("apple", "banana", "orange", "strawberry", "grape"),
    "Animals": ("dog", "cat", "lion", "tiger", "elephant"),
    "Technology": ("computer", "smartphone", "internet", "keyboard", "software"),
    "Weather": ("rain", "sunshine", "snow", "wind", "cloud"),
    "Musical Instruments": ("guitar", "piano", "violin", "drums", "trumpet"),
    "Clothing": ("shirt", "trousers", "jacket", "shoes", "hat"),
}

# Flatten the groups into the single list consumed by the rest of the notebook.
words_and_phrases = [word for group in _word_groups.values() for word in group]

In [None]:
# Build one fixed-length vector per entry in `words_and_phrases`.
all_embeddings = []

for item in words_and_phrases:
    # Each entry is embedded on its own.
    embedding_data = llm.create_embedding(item)
    embedding_vector = np.array(embedding_data["data"][0]["embedding"])

    # Depending on the model's pooling configuration the "embedding" field is
    # either a list of per-token vectors (2-D) or an already pooled vector
    # (1-D). Only the per-token form needs averaging over the token axis;
    # calling mean(0) on a 1-D vector would collapse it to a single scalar
    # and leave PCA with nothing to work on.
    if embedding_vector.ndim == 2:
        embedding_vector = embedding_vector.mean(axis=0)

    all_embeddings.append(embedding_vector)

# Stack the per-word vectors into an array with one row per word.
embeddings = np.array(all_embeddings)

print(f"Embeddings generated with shape: {embeddings.shape}")

In [None]:
# --- 4. Apply PCA to Reduce Dimensions ---
# Project every embedding onto its first two principal components. The input
# width is whatever the embedding model produced (embeddings.shape[1]); the
# output has one 2-D point per word, ready for a scatter plot.
n_plot_dims = 2

print("--- Applying PCA to reduce dimensions to 2D ---")
pca = PCA(n_components=n_plot_dims)
embeddings_2d = pca.fit_transform(embeddings)
print(f"Embeddings reduced to shape: {embeddings_2d.shape}")

In [None]:
# --- 5. Plot the Results ---
print("--- Plotting results ---")
fig, ax = plt.subplots(figsize=(14, 10))

# One marker per word: first principal component on x, second on y.
pc1 = embeddings_2d[:, 0]
pc2 = embeddings_2d[:, 1]
ax.scatter(pc1, pc2)

# Label every marker with the word it represents. Rows of `embeddings_2d`
# are in the same order as `words_and_phrases`, so the two are paired up.
for word, (x, y) in zip(words_and_phrases, embeddings_2d):
    ax.annotate(word, (x, y), fontsize=12)

ax.set_title('2D PCA of Word and Phrase Embeddings', fontsize=16)
ax.set_xlabel('Principal Component 1', fontsize=12)
ax.set_ylabel('Principal Component 2', fontsize=12)
ax.grid(True)
plt.show()