# Visualize Embeddings

It is fun to visualize raw numbers

In [1]:
import os, sys
import pprint
import json
import time

## Runing on Colab?

The next will tell you.

In [2]:
import os

if os.getenv("COLAB_RELEASE_TAG"):
   print("Running in Colab")
   RUNNING_IN_COLAB = True
else:
   print("NOT in Colab")
   RUNNING_IN_COLAB = False

NOT in Colab


if you are running on COLAB, uncomment the following cell and execute.

We only have to do it once

In [3]:
# !pip install --upgrade openai  python-dotenv
# !pip show openai | grep Version

In [4]:
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

## Load Settings

In [5]:
## Load Settings from .env file

_ = load_dotenv(find_dotenv()) # read local .env file

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Manual override
# OPENAI_API_KEY="abc123"

# print ("OPENAI_API_KEY : ", OPENAI_API_KEY )


if not OPENAI_API_KEY:
    raise Exception ("'OPENAI_API_KEY' is not set.  Please set it above to continue...")


## Initialize OpenAI Clien

In [6]:
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key= os.environ.get("OPENAI_API_KEY"),
)


## Embeddings

Embeddings convert a word, say 'apple' into bunch of vectors/numbers, like [0.21,  -0.12,  0.33, ....]

Why? Because ML models expect numbers.


In [7]:
# https://platform.openai.com/docs/guides/embeddings/what-are-embeddings

def generate_embedding(text: str,  model="text-embedding-ada-002") -> list[float]:
    text = text.replace("\n", " ")
    resp = client.embeddings.create (
		input=[text],
		model=model  )

    return resp.data[0].embedding

In [8]:
word = 'apple'
model = 'text-embedding-ada-002'
embedding = generate_embedding(word, model)
print (f"Embedding for word='{word}' using model='{model}', embeddding_length={len(embedding)}, printing first few numbers... :\n", embedding [:10] )

Embedding for word='apple' using model='text-embedding-ada-002', embeddding_length=1536, printing first few numbers... :
 [0.00777884665876627, -0.023069249466061592, -0.007360776886343956, -0.02774341218173504, -0.00457478454336524, 0.012891639955341816, -0.021863015368580818, -0.00858757272362709, 0.01892966963350773, -0.029854323714971542]


In [9]:
word = 'apple'
model = 'text-similarity-babbage-001'
embedding = generate_embedding(word, model)
print (f"Embedding for word='{word}' using model='{model}', embeddding_length={len(embedding)}, printing first few numbers... :\n", embedding [:10] )

Embedding for word='apple' using model='text-similarity-babbage-001', embeddding_length=2048, printing first few numbers... :
 [0.010828019119799137, -0.002828809432685375, -0.00380324712023139, 0.017472675070166588, 0.018480714410543442, 0.00637164618819952, 0.010273597203195095, 0.006980670150369406, 0.006073434837162495, -0.03971673548221588]


## Visualize Embeddings

Let's try to visualize numbers

In [10]:
import pandas as pd

df_words = pd.DataFrame({'word' : ['apple', 'banana', 'earth', 'sun', 'violin', 'flute',  'mars', 'drums', 'cake', 'ice cream', 'pie']})

df_words

Unnamed: 0,word
0,apple
1,banana
2,earth
3,sun
4,violin
5,flute
6,mars
7,drums
8,cake
9,ice cream


In [15]:
# find embeddings to words
import numpy as np

file='word-embeddings.csv'

if os.path.exists(file):
    df_words = pd.read_csv(file)
    print (df_words.dtypes)
    # df_words['embedding'] = df_words['embedding'].to_numpy()
    # df_words['embedding'] = df_words['embedding'].astype(float).values
    # print (df_words.dtypes)
    df_words['embedding'] = df_words['embedding'].apply(eval).apply(np.array)
    # df_words['embedding'] = df_words['embedding'].apply(lambda x: np.array(eval(x)), 0)
    print (df_words.dtypes)
else:
    # df_words['embedding'] = df_words.apply (lambda row: row['word'].upper(), axis=1)
    df_words['embedding'] = df_words.apply(lambda row: generate_embedding(row['word'], model='text-embedding-ada-002'), axis=1)
    # df_words['embedding'] = df_words['embedding'].to_numpy()
    df_words['embedding'] = df_words['embedding'].apply(eval).apply(np.array)
    # print (df_words.dtypes)
    df_words.to_csv(file, index=False)

# print (df_words.dtypes)
df_words

word         object
embedding    object
dtype: object
word         object
embedding    object
dtype: object
         word                                          embedding
0       apple  [0.00777884665876627, -0.023069249466061592, -...
1      banana  [-0.013926557265222073, -0.03288617357611656, ...
2       earth  [0.017013944685459137, -0.012805378064513206, ...
3         sun  [0.02472955547273159, -0.0024875381495803595, ...
4      violin  [-0.025621239095926285, -0.004035905934870243,...
5       flute  [-0.03577222302556038, -0.008089736104011536, ...
6        mars  [0.0049131098203361034, -0.0017529274336993694...
7       drums  [-0.023547517135739326, -0.0038533478509634733...
8        cake  [-0.013684120960533619, -0.016856111586093903,...
9   ice cream  [0.00232032616622746, -0.01688331924378872, 0....
10        pie  [-0.006975209806114435, -0.025179490447044373,...


Unnamed: 0,word,embedding
0,apple,"[0.00777884665876627, -0.023069249466061592, -..."
1,banana,"[-0.013926557265222073, -0.03288617357611656, ..."
2,earth,"[0.017013944685459137, -0.012805378064513206, ..."
3,sun,"[0.02472955547273159, -0.0024875381495803595, ..."
4,violin,"[-0.025621239095926285, -0.004035905934870243,..."
5,flute,"[-0.03577222302556038, -0.008089736104011536, ..."
6,mars,"[0.0049131098203361034, -0.0017529274336993694..."
7,drums,"[-0.023547517135739326, -0.0038533478509634733..."
8,cake,"[-0.013684120960533619, -0.016856111586093903,..."
9,ice cream,"[0.00232032616622746, -0.01688331924378872, 0...."


## Visualize

In [19]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Using PCA to reduce to 2 dimensions
pca = PCA(n_components=2)

matrix = df_words ['embedding'].apply(eval).to_list()

df_words ['reduced_embedding'] = pca.fit_transform(df_words ['embedding'])

# Alternatively, you can use t-SNE
# from sklearn.manifold import TSNE
# reduced_embeddings = TSNE(n_components=2).fit_transform(embeddings)

df_words

ValueError: setting an array element with a sequence.

In [22]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib

matrix = df_words['embedding'].apply(eval).to_list()
print (matrix)
# Create a t-SNE model and transform the data
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(matrix)

# Plotting the reduced embeddings
plt.figure(figsize=(10, 10))
plt.scatter(vis_dims[:, 0], vis_dims[:, 1])
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.title('2D visualization of Embeddings')
plt.show()


ValueError: source code string cannot contain null bytes