# Similarity
Reference: <https://cohere.com/llmu/what-is-similarity-between-sentences>

In [None]:
import numpy as np
import seaborn as sns
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity

import os
from dotenv import load_dotenv
import cohere
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl

# Load variables from a local .env file (if any) into the process
# environment, then build the Cohere client from the API key stored there.
# os.environ[...] raises KeyError when COHERE_APIKEY is not set.
load_dotenv()
co = cohere.ClientV2(api_key=os.environ["COHERE_APIKEY"])

In [None]:
# Three sample sentences: the first two are paraphrases of each other,
# the third is about an unrelated topic.
texts = [
    "I like to be in my house",
    "I enjoy staying home",
    "the isotope 238u decays to 206pb",
]

# Ask Cohere for one float embedding per sentence.
response = co.embed(
    model="embed-v4.0",
    input_type="search_document",
    embedding_types=["float"],
    texts=texts,
)

In [None]:
# The float embeddings requested from the embed call, one vector per sentence.
embeddings = response.embeddings.float

# Exactly three sentences were embedded; unpacking fails loudly otherwise.
sentence1, sentence2, sentence3 = embeddings

# Show each vector as a NumPy array (NumPy abbreviates long arrays when printing).
for position, vector in enumerate((sentence1, sentence2, sentence3), start=1):
    print(f"Embedding for sentence {position}", np.array(vector))

# Dot Product Similarity

In [None]:
# Dot product for every distinct pair of sentences.
distinct_pairs = (
    (1, 2, sentence1, sentence2),
    (1, 3, sentence1, sentence3),
    (2, 3, sentence2, sentence3),
)
for left, right, left_vec, right_vec in distinct_pairs:
    print(f"Similarity between sentences {left} and {right}:",
          np.dot(left_vec, right_vec))

In [None]:
# Dot product of each sentence embedding with itself (its squared norm),
# as a baseline for comparing against the cross-sentence values.
# The second label now correctly reads "2 and 2", matching the vectors used.
print("Similarity between sentences 1 and 1:", np.dot(sentence1, sentence1))
print("Similarity between sentences 2 and 2:", np.dot(sentence2, sentence2))
print("Similarity between sentences 3 and 3:", np.dot(sentence3, sentence3))

In [None]:
# Plotting with matplotlib
# Gram matrix of the embeddings: data[i, j] is the dot product of
# embeddings[i] and embeddings[j], so the matrix is symmetric.
vectors = np.asarray(embeddings, dtype=float)
data = vectors @ vectors.T

fig, ax = plt.subplots()
im = ax.imshow(data)

# Label both axes with the sentences themselves.
tick_positions = range(len(texts))
ax.set_xticks(tick_positions, labels=texts,
              rotation=45, ha="right", rotation_mode="anchor")
ax.set_yticks(tick_positions, labels=texts,
              rotation=0, ha="right", rotation_mode="anchor")

# Write each similarity, to two decimals, in the middle of its cell.
# imshow draws columns along x and rows along y, hence (col, row).
for (row, col), value in np.ndenumerate(data):
    ax.text(col, row, f'{value:.2f}',
            ha="center", va="center", color="w")

In [None]:
# Get pairwise dot product similarities.
# np.dot is used so the values match this section's heading and the
# variable name; cosine similarity has its own section.
dot_product_similarities = [
    [np.dot(row_vec, col_vec) for col_vec in embeddings]
    for row_vec in embeddings
]

# Plot as a len(texts) x len(texts) grid. Unlike cosine similarity, raw dot
# products are not confined to a fixed range, so no vmin/vmax is forced and
# seaborn derives the colour scale from the data.
ax = sns.heatmap(
    dot_product_similarities,
    linewidths=1,
    linecolor='grey',
    xticklabels=texts,
    yticklabels=texts,
)
ax.set_xticklabels(labels=texts, rotation=45)

# Cosine Similarity

In [None]:
# Cosine similarity for every distinct pair of sentences.
# cosine_similarity takes 2-D inputs and returns a matrix, so each vector is
# wrapped in a list and the single resulting entry is read back with [0][0].
for left, right, left_vec, right_vec in (
    (1, 2, sentence1, sentence2),
    (1, 3, sentence1, sentence3),
    (2, 3, sentence2, sentence3),
):
    score = cosine_similarity([left_vec], [right_vec])[0][0]
    print(f"Cosine similarity between sentences {left} and {right}:", score)

In [None]:
# Cosine similarity of each sentence embedding with itself.
# Each vector is wrapped in a list because cosine_similarity takes 2-D inputs;
# [0][0] reads the single entry of the matrix it returns.
for position, vector in enumerate((sentence1, sentence2, sentence3), start=1):
    score = cosine_similarity([vector], [vector])[0][0]
    print(f"Cosine similarity between sentences {position} and {position}:", score)