In [1]:
from llama_index.core.evaluation import SemanticSimilarityEvaluator

# Build the evaluator with its default settings: no embedding model or
# similarity threshold is passed here, so the library defaults apply.
evaluator = SemanticSimilarityEvaluator()

In [2]:
# This evaluator only uses `response` and `reference`, passing in query does not influence the evaluation
# query = 'What is the color of the sky'

response = "The sky is typically blue"
reference = """The color of the sky can vary depending on several factors, including time of day, weather conditions, and location.

During the day, when the sun is in the sky, the sky often appears blue. 
This is because of a phenomenon called Rayleigh scattering, where molecules and particles in the Earth's atmosphere scatter sunlight in all directions, and blue light is scattered more than other colors because it travels as shorter, smaller waves. 
This is why we perceive the sky as blue on a clear day.
"""

result = await evaluator.aevaluate(
    response=response,
    reference=reference,
)

In [3]:
# Report the similarity score from the latest evaluation and whether it
# cleared the evaluator's passing threshold.
print("Score: ", result.score)
print("Passing: ", result.passing)  # default similarity threshold is 0.8

Score:  0.8741614884630504
Passing:  True


In [4]:
response = "Sorry, I do not have sufficient context to answer this question."
reference = """The color of the sky can vary depending on several factors, including time of day, weather conditions, and location.

During the day, when the sun is in the sky, the sky often appears blue. 
This is because of a phenomenon called Rayleigh scattering, where molecules and particles in the Earth's atmosphere scatter sunlight in all directions, and blue light is scattered more than other colors because it travels as shorter, smaller waves. 
This is why we perceive the sky as blue on a clear day.
"""

result = await evaluator.aevaluate(
    response=response,
    reference=reference,
)

In [5]:
# Report the score and pass/fail verdict for the most recent evaluation
# (the `result` produced by the preceding cell).
print("Score: ", result.score)
print("Passing: ", result.passing)  # default similarity threshold is 0.8

Score:  0.7213441101430748
Passing:  False


In [23]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Sentence-embedding checkpoint, referenced by its Hugging Face identifier.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Three phrasings of the same idea: the original sentence, a paraphrase with
# different vocabulary, and a passive-voice rewording.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast, dark-colored fox leaps over a sleepy canine.",
    "The lazy dog was jumped over by a quick brown fox.",
]

# One embedding per sentence; `embeddings` is reused by the clustering cell.
embeddings = model.encode(sentences)

# Pairwise cosine similarity of every sentence against every other sentence.
cosine_sim_matrix = util.cos_sim(embeddings, embeddings)

# Header and matrix on separate lines, exactly as two print calls would emit.
print("Cosine Similarity Matrix:", cosine_sim_matrix, sep="\n")

Cosine Similarity Matrix:
tensor([[1.0000, 0.7915, 0.9290],
        [0.7915, 1.0000, 0.7434],
        [0.9290, 0.7434, 1.0000]])


In [24]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Clustering with K-Means
num_clusters = 2

# `n_init` is passed explicitly. Leaving it out makes scikit-learn emit a
# warning about its changing default (the captured output shows
# `default_n_init=10`), and lets the number of restarts -- and therefore
# potentially the labels -- depend on the installed scikit-learn version.
# Pinning it to 10 keeps the behaviour that produced the recorded score.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)

# `embeddings` comes from the sentence-embedding cell above; each row is
# assigned to one of `num_clusters` clusters.
cluster_labels = kmeans.fit_predict(embeddings)

# Evaluate clustering using silhouette score
silhouette_avg = silhouette_score(embeddings, cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")

  super()._check_params_vs_input(X, default_n_init=10)


Silhouette Score: 0.2969221770763397


In [28]:
from sentence_transformers import SentenceTransformer, util

# Embedding model loaded from a local directory instead of the hub.
model_name = './app/model/modules/multilingual-minilm'
model = SentenceTransformer(model_name)

# A single question and a passage that contains its answer.
question = "What is the capital of France?"
context = (
    "Paris, the capital of France, is known for its art, fashion, and culture. "
    "It is one of the most popular tourist destinations in the world."
)

# Encode each text separately; both are passed as plain strings.
question_embedding = model.encode(question)
context_embedding = model.encode(context)

# Cosine similarity between the question vector and the context vector.
cosine_similarity = util.cos_sim(question_embedding, context_embedding)

# Header and value on separate lines, exactly as two print calls would emit.
print("Cosine Similarity between Question and Context:", cosine_similarity, sep="\n")

Cosine Similarity between Question and Context:
tensor([[0.6925]])


In [5]:
from llama_index.core.evaluation import SemanticSimilarityEvaluator
from llama_index.core.embeddings import resolve_embed_model
from models import models, ModelName
model_name = models[ModelName.MULTILINGUAL_MINILM_FINETUNING_6.value]["local_dir"]

embed_model = resolve_embed_model(f"local:{model_name}")
evaluator = SemanticSimilarityEvaluator(
    embed_model=embed_model,
    similarity_threshold=0.6,
)
response = "The sky is yellow."
reference = "The sky is blue."

result = await evaluator.aevaluate(
    response=response,
    reference=reference,
)
print("Score: ", model_name, result.score)
print("Passing: ", result.passing)

Score:  ./app/model/modules/multilingual-minilm-finetuning-6 0.6160385790316049
Passing:  True


In [7]:
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
from models import models, ModelName

# Compare every registered embedding model on the same question/context pairs.
#
# For each model the question and context embeddings are multiplied
# element-wise into one "combined" vector per pair, and the cosine similarity
# between those combined vectors is reported together with the mean of the
# off-diagonal entries.

# Sample questions and contexts. They do not depend on the model, so they are
# built once here rather than on every loop iteration.
questions = [
    "What is the capital of France?",
    "How many continents are there?",
    "Who wrote '1984'?"
]

contexts = [
    "Paris, the capital of France, is known for its art, fashion, and culture. It is one of the most popular tourist destinations in the world.",
    "There are seven continents on Earth: Africa, Antarctica, Asia, Europe, North America, Australia, and South America.",
    "'1984' is a dystopian social science fiction novel and cautionary tale, written by the English writer George Orwell."
]

# Ensure the lengths of questions and contexts are the same: the element-wise
# product below pairs question k with context k.
assert len(questions) == len(contexts), "The number of questions must match the number of contexts"

# Release cached CUDA memory left over from earlier cells before starting.
# NOTE(review): the models below are loaded with device="cpu", so these calls
# presumably only affect allocations made elsewhere in the kernel -- confirm.
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

for i in models:
    # Load the embedding model
    model_name = models[i]["local_dir"]
    model = SentenceTransformer(model_name, device="cpu")

    # Compute embeddings for all questions and contexts without tracking
    # gradients (inference only).
    with torch.no_grad():
        question_embeddings = model.encode(questions, convert_to_tensor=True)
        context_embeddings = model.encode(contexts, convert_to_tensor=True)

        # Perform element-wise multiplication for each question-context pair
        combined_embeddings = torch.mul(question_embeddings, context_embeddings)

        # Calculate cosine similarity matrix for the combined embeddings
        similarity_matrix = util.cos_sim(combined_embeddings, combined_embeddings).cpu().numpy()

    # Extract the upper triangular part of the similarity matrix, excluding
    # the diagonal (k=1), so each distinct pair is counted once.
    triu_indices = np.triu_indices_from(similarity_matrix, k=1)
    triu_values = similarity_matrix[triu_indices]

    # Calculate the average similarity score
    average_similarity_score = np.mean(triu_values)

    print(f"Model: {model_name}")
    print("Cosine Similarity Matrix for Combined Embeddings:")
    print(similarity_matrix)
    print("\nAverage Similarity Score (excluding diagonal):")
    print(average_similarity_score)
    print("\n" + "="*50 + "\n")

    # Drop the model reference first, then clear the caches: memory that is
    # still referenced cannot be released, so the original order (clear, then
    # `del model`) left the clean-up ineffective for the current model.
    del model
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

Model: ./app/model/modules/indo-sentence-bert-base
Cosine Similarity Matrix for Combined Embeddings:
[[1.         0.27892458 0.13558199]
 [0.27892458 0.99999994 0.17862284]
 [0.13558197 0.17862283 0.99999994]]

Average Similarity Score (excluding diagonal):
0.1977098


Model: ./app/model/modules/all-minilm
Cosine Similarity Matrix for Combined Embeddings:
[[0.9999999  0.33218047 0.2400226 ]
 [0.33218047 1.         0.31461537]
 [0.24002257 0.31461537 1.        ]]

Average Similarity Score (excluding diagonal):
0.29560614


Model: ./app/model/modules/mpnet-base-v2
Cosine Similarity Matrix for Combined Embeddings:
[[1.0000001  0.33152586 0.2571634 ]
 [0.33152586 1.         0.26290733]
 [0.25716344 0.26290733 1.0000001 ]]

Average Similarity Score (excluding diagonal):
0.28386554


Model: ./app/model/modules/multilingual-minilm
Cosine Similarity Matrix for Combined Embeddings:
[[1.         0.28806776 0.21676947]
 [0.28806776 1.         0.29232207]
 [0.21676949 0.29232204 1.        ]]

Aver