In [3]:
from llama_index.embeddings.openai import OpenAIEmbedding
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Embedding client: OpenAI's "text-embedding-ada-002" model, reached through
# the LlamaIndex OpenAIEmbedding wrapper imported above.
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
embedding_model = OpenAIEmbedding(model=EMBEDDING_MODEL_NAME)

# Trivia fixture. `phrases` keeps the layout the rest of the script relies on:
# index 0 is the question, indices 1..5 are the candidate answers.
trivia_question = (
    "What spacecraft was used in the mission to carry the first humans to the moon?"
)

candidate_answers = [
    # Correct answer
    "Apollo 11 was the spacecraft used to carry the first humans to the moon.",
    # Wrong answer: same wording as the correct one, different mission number
    "Apollo 12 was the spacecraft used to carry the first humans to the moon.",
    # Wrong answer: talks about the third successful landing instead
    "Apollo 14 was the spacecraft used to carry astronauts on the third successful moon landing mission.",
    # Wrong answer: same wording as the correct one, different mission number
    "Apollo 10 was the spacecraft used to carry the first humans to the moon.",
    # Wrong answer: talks about lunar highlands exploration instead
    "Apollo 16 was the spacecraft that carried astronauts to explore the lunar highlands.",
]

phrases = [trivia_question, *candidate_answers]

# Embed every phrase with one batched request. The rest of the script indexes
# the result by phrase position, i.e. it relies on row k belonging to phrases[k].
embeddings = embedding_model.get_text_embedding_batch(phrases)

# Stack the returned vectors into a numpy array for the similarity step.
embeddings_array = np.array(embeddings)

# Sanity peek: show the first phrase next to the head of its embedding vector.
first_phrase = phrases[0]
first_vector = embeddings_array[0]
print(f"Phrase: {first_phrase}")
print(f"First 5 elements of its embedding: {first_vector[:5]}\n")

# Pairwise cosine similarity of the embedding rows; entry [i, j] is read below
# as the score between phrases[i] and phrases[j].
similarity_matrix = cosine_similarity(embeddings_array)

# Overview first: the full matrix, rounded to two decimals to keep it compact.
rounded_matrix = np.round(similarity_matrix, 2)
print("Cosine Similarity Matrix:")
print(rounded_matrix)
print("\nDetailed Similarity Results:\n")

# Then one readable entry per unordered pair. Only j > i is visited, so the
# diagonal (a phrase against itself) and the repeated (j, i) pairs are skipped.
for i, first in enumerate(phrases):
    for j, second in enumerate(phrases[i + 1:], start=i + 1):
        score = similarity_matrix[i, j]
        print(
            "Cosine similarity between:\n"
            f"  '{first}'\n"
            "  and\n"
            f"  '{second}'\n"
            f"  => {score:.4f}\n"
        )

Phrase: What spacecraft was used in the mission to carry the first humans to the moon?
First 5 elements of its embedding: [ 0.02234936 -0.01276388  0.02098001 -0.01151388 -0.0080214 ]

Cosine Similarity Matrix:
[[1.   0.93 0.93 0.89 0.93 0.89]
 [0.93 1.   0.97 0.93 0.97 0.91]
 [0.93 0.97 1.   0.93 0.96 0.92]
 [0.89 0.93 0.93 1.   0.93 0.93]
 [0.93 0.97 0.96 0.93 1.   0.91]
 [0.89 0.91 0.92 0.93 0.91 1.  ]]

Detailed Similarity Results:

Cosine similarity between:
  'What spacecraft was used in the mission to carry the first humans to the moon?'
  and
  'Apollo 11 was the spacecraft used to carry the first humans to the moon.'
  => 0.9344

Cosine similarity between:
  'What spacecraft was used in the mission to carry the first humans to the moon?'
  and
  'Apollo 12 was the spacecraft used to carry the first humans to the moon.'
  => 0.9309

Cosine similarity between:
  'What spacecraft was used in the mission to carry the first humans to the moon?'
  and
  'Apollo 14 was the spacecraft