# Testing LLM Query Outputs with Cosine Similarity

In [None]:
%pip install sentence-transformers scikit-learn

In [56]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# We use a very popular, small yet efficient Transformer model to compute embeddings for our texts.
# It is loaded once here and shared by get_embedding().
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text: str) -> np.ndarray:
    """
    Encodes ``text`` with the module-level sentence-transformer ``model``.

    The text is handed to ``model.encode`` wrapped in a one-item list, so the
    value returned is the encoder's batch output for that single sentence
    (presumably a 2-D array with one row -- confirm against the installed
    sentence-transformers version).
    """
    return model.encode([text])

def compute_cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Computes the cosine similarity between two embeddings.

    Parameters:
        vec1 (np.ndarray): First embedding, either 1-D ``(dim,)`` or 2-D ``(1, dim)``.
        vec2 (np.ndarray): Second embedding, same accepted shapes as ``vec1``.

    Returns:
        float: Cosine similarity between the first row of each input, a value
        between -1 and 1, as a built-in Python float.
    """
    # scikit-learn's cosine_similarity only accepts 2-D input, so promote a
    # bare 1-D vector to a single-row matrix (2-D input passes through as is).
    first = np.atleast_2d(vec1)
    second = np.atleast_2d(vec2)
    # The pairwise result matrix holds NumPy scalars; convert so the returned
    # value really is the `float` promised by the signature.
    return float(cosine_similarity(first, second)[0][0])

def simulate_llm_output(query: str) -> str:
    """
    Stand-in for a real LLM call: maps a query to a canned response.

    The query is checked (case-sensitively) for each known keyword in order;
    the response of the first keyword found is returned. When no keyword is
    present, a fixed apology message is returned instead. In a real-world
    scenario this function would call an LLM API.
    """
    # Ordered (keyword, response) pairs -- earlier entries take precedence
    canned_responses = (
        (
            "drawbacks",
            "Dining outside exposes you to unpredictable weather and increases the risk of foodborne illnesses",
        ),
        (
            "negative",
            "Eating outdoors subjects you to variable weather conditions and raises the potential for foodborne diseases",
        ),
        (
            "positive",
            "Dining outdoors can improve your mood and offer a refreshing change from routine indoor meals.",
        ),
    )
    for keyword, response in canned_responses:
        if keyword in query:
            return response
    return "I am sorry, I do not have information on that."

def interpret_similarity(cos_sim: float, threshold: float = 0.8) -> str:
    """
    Interprets the cosine similarity value (ranging from -1 to 1) and returns a description
    of the relationship between two texts.

    Parameters:
        cos_sim (float): Cosine similarity value between -1 and 1.
        threshold (float): Magnitude at or beyond which the texts are reported as
            highly similar (``cos_sim >= threshold``) or opposite
            (``cos_sim <= -threshold``). Must lie in [0, 1]. Defaults to 0.8.

    Returns:
        str: A string description indicating if the texts are highly similar, not related, or opposite.

    Raises:
        ValueError: If ``threshold`` is outside the range [0, 1] (or is NaN).
    """
    # A cut-off outside [0, 1] would make the two bands overlap or be unreachable.
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    if cos_sim >= threshold:
        return "The texts are highly similar."
    if cos_sim <= -threshold:
        return "The texts are opposite."
    # Everything in between (including NaN, which fails both comparisons)
    return "The texts are not closely related."



In [57]:
# Test 1: if we give 2 similar (paraphrased) queries, the LLM outputs should be similar.
query_similar_1 = "What are the drawbacks of eating outside?"
query_similar_2 = "What negative aspects come with outdoor dining?"
print(f"Query 1: {query_similar_1}")
print(f"Query 2: {query_similar_2}")
# Get the (simulated) LLM response for each query.
output_similar_1 = simulate_llm_output(query_similar_1)
output_similar_2 = simulate_llm_output(query_similar_2)
print(f"Output for query_similar_1: {output_similar_1}")
print(f"Output for query_similar_2: {output_similar_2}")
# Embed both responses and compare the embeddings with cosine similarity.
embedding_similar_1 = get_embedding(output_similar_1)
embedding_similar_2 = get_embedding(output_similar_2)
similarity_similar = compute_cosine_similarity(embedding_similar_1, embedding_similar_2)
print(f"Cosine similarity between the outputs: {similarity_similar}")
# Turn the raw score into a human-readable verdict.
print(interpret_similarity(similarity_similar))


Query 1: What are the drawbacks of eating outside?
Query 2: What negative aspects come with outdoor dining?
Output for query_similar_1: Dining outside exposes you to unpredictable weather and increases the risk of foodborne illnesses
Output for query_similar_2: Eating outdoors subjects you to variable weather conditions and raises the potential for foodborne diseases
Cosine similarity between the outputs: 0.8508991003036499
The texts are highly similar.


In [58]:
# Test 2: if we give 2 opposite / unrelated queries, the LLM outputs should not be similar.
query1 = "What are the drawbacks of eating outside?"
query2 = "What positive aspects come with outdoor dining?"
print(f"Query 1: {query1}")
print(f"Query 2: {query2}")
# Get the (simulated) LLM response for each query.
output1 = simulate_llm_output(query1)
output2 = simulate_llm_output(query2)
print(f"Output for query1: {output1}")
print(f"Output for query2: {output2}")
# Embed both responses and compare the embeddings with cosine similarity.
embedding1 = get_embedding(output1)
embedding2 = get_embedding(output2)
similarity = compute_cosine_similarity(embedding1, embedding2)
print(f"Cosine similarity between the outputs: {similarity}")
# Turn the raw score into a human-readable verdict.
print(interpret_similarity(similarity))

Query 1: What are the drawbacks of eating outside?
Query 2: What positive aspects come with outdoor dining?
Output for query1: Dining outside exposes you to unpredictable weather and increases the risk of foodborne illnesses
Output for query2: Dining outdoors can improve your mood and offer a refreshing change from routine indoor meals.
Cosine similarity between the outputs: 0.6589205265045166
The texts are not closely related.


In [None]:
# Test 3: negation. The two sentences below differ only by "not" and so have
# opposite meanings; ideally they should not be reported as highly similar.
# The texts are hard-coded here instead of coming from simulate_llm_output().
output1 = "She is really excited about her promotion"
output2 = "She is not really excited about her promotion"
print(f"Output for query1: {output1}")
print(f"Output for query2: {output2}")
# Embed both sentences and compare the embeddings with cosine similarity.
embedding1 = get_embedding(output1)
embedding2 = get_embedding(output2)
similarity = compute_cosine_similarity(embedding1, embedding2)
print(f"Cosine similarity between the outputs: {similarity}")
print(interpret_similarity(similarity))

Output for query1: She is really excited about her promotion
Output for query2: She is not really excited about her promotion
Cosine similarity between the outputs: 0.9077736139297485
The texts are highly similar.


#### The embedding-based comparison fails to capture the deeper meaning of the two sentences (here, the negation) and therefore reports a high similarity. This is one limitation of the approach.

In [None]:
# Test 4: nuance / emotion. Both sentences open with "Oh great", but the first
# is presumably sarcastic while the second reads as sincere; this checks how
# the similarity score treats that difference in tone.
# The texts are hard-coded here instead of coming from simulate_llm_output().
output1 = "Oh great, another rainy day!!"
output2 = "Oh great, another day at the beach, I am so lucky"
print(f"Output for query1: {output1}")
print(f"Output for query2: {output2}")
# Embed both sentences and compare the embeddings with cosine similarity.
embedding1 = get_embedding(output1)
embedding2 = get_embedding(output2)
similarity = compute_cosine_similarity(embedding1, embedding2)
print(f"Cosine similarity between the outputs: {similarity}")
print(interpret_similarity(similarity))

Output for query1: Oh great, another rainy day!!
Output for query2: Oh great, another day at the beach, I am so lucky
Cosine similarity between the outputs: 0.5919630527496338
The texts are not closely related.


#### The embedding-based comparison also fails to capture nuances and emotions such as sarcasm, which is another limitation of the approach.

## Conclusion

This notebook illustrates a metamorphic testing approach for LLM query outputs using cosine similarity. By simulating LLM responses for different queries and comparing their semantic similarity, we can verify whether the model's outputs adhere to the expected relationships, without relying on fixed or deterministic expected outputs.

Such an approach is especially useful when working with non-deterministic LLM outputs where traditional testing methods may fall short.

However, this approach has some limitations, as shown above:
- It does not capture the deeper meaning of the two texts (for example, negation).
- It does not capture emotions or nuances of tone.
