# Tokens & Embeddings

## Getting started with text embeddings in Ollama

In [1]:
from langchain_community.embeddings import OllamaEmbeddings

In [None]:
HOST = "http://localhost:11434"
MODEL = "gemma:7b"

In [4]:
embeddings_model = OllamaEmbeddings(base_url=HOST, model=MODEL)

In [5]:
text = "What is the meaning of life?"

In [7]:
embedding = embeddings_model.embed_query(text)

In [12]:
vector = embedding
print(f"Length = {len(vector)}")

# print the first 10 values of the embeding vector
print(vector[:10])

Length = 3072
[-5.027745246887207, 0.5985317230224609, -2.0808115005493164, 2.8571624755859375, 2.2468223571777344, -0.19264957308769226, 0.4538598656654358, 0.11579719185829163, 0.08520935475826263, 2.153249502182007]


In [17]:
doc_result = embeddings_model.embed_documents([text])
print(doc_result)

[[-6.120396137237549, 1.484634518623352, -1.1042202711105347, 3.513516902923584, 1.6334097385406494, -0.7936108112335205, 1.6353179216384888, -0.5915756821632385, 0.8711243271827698, 1.4186745882034302, 1.3845107555389404, -0.26937273144721985, -6.5539021492004395, 1.429795503616333, 4.799867630004883, -1.3334236145019531, -0.5572676062583923, -1.330641508102417, -3.6010806560516357, -0.6466148495674133, -4.935181617736816, -8.72292709350586, -0.07556198537349701, -0.9008309245109558, 1.314850091934204, -1.8597841262817383, -1.3284200429916382, -3.7722904682159424, -1.0477819442749023, -18.511066436767578, -4.935744762420654, -1.5323985815048218, 1.220404863357544, 2.3818893432617188, 4.07875919342041, 0.7853652834892273, -0.718835711479187, -1.5524001121520996, 1.214336633682251, 0.5795365571975708, 3.153724193572998, 0.35583847761154175, 0.07414151728153229, 0.5738590955734253, 4.053811073303223, 2.6252329349517822, 3.172889471054077, 2.256565570831299, 2.6123969554901123, -0.7758904

In [13]:
text_2 = "I like vegetarian food"
embedding_2 = embeddings_model.embed_query(text_2)
text_3 = "The red car drives faster than the white one"
embedding_3 = embeddings_model.embed_query(text_3)


## Vector similarity

In data analysis, cosine similarity is a measure of similarity between two non-zero vectors defined in an inner product space. Cosine similarity is the cosine of the angle between the vectors.
The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same, with 0 indicating orthogonality or decorrelation, while in-between values indicate intermediate similarity or dissimilarity.

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [27]:
vector =  np.array(embedding).reshape(1, -1)
vector_2 = np.array(embedding_2).reshape(1, -1)
vector_3 = np.array(embedding_3).reshape(1, -1)

In [28]:
print(cosine_similarity(vector,vector_2)) 
print(cosine_similarity(vector_2,vector_3))
print(cosine_similarity(vector,vector_3))

[[0.69865743]]
[[0.96995033]]
[[0.73704656]]


## Visualizing embeddings

In [30]:
embeddings_array = np.array([vector, vector_2, vector_3])

In [31]:
print("Shape: " + str(embeddings_array.shape))
print(embeddings_array)

Shape: (3, 1, 3072)
[[[ -5.02774525   0.59853172  -2.0808115  ...   1.20029747   4.27652216
     0.45921668]]

 [[-11.7235918    0.6632418   -0.92427802 ...   0.16894771   0.59959799
     0.57232803]]

 [[-11.93867111  -0.97636181  -2.05400562 ...   0.04967426  -0.89528406
     1.57067466]]]


In [32]:
from sklearn.decomposition import PCA

In [33]:
PCA_model = PCA(n_components = 2)
PCA_model.fit(embeddings_array)
new_values = PCA_model.transform(embeddings_array)

ValueError: Found array with dim 3. PCA expected <= 2.

In [None]:
print("Shape: " + str(new_values.shape))
print(new_values)

In [None]:
import matplotlib.pyplot as plt
import mplcursors
%matplotlib ipympl

from utils import plot_2D
plot_2D(new_values[:,0], new_values[:,1], input_text_lst_news)