In [181]:
from typing import Any
import numpy as np
from docuverse.utils import detect_device
from sentence_transformers import SentenceTransformer

In [None]:
def create_st_model(model_name, attn="sdpa") -> SentenceTransformer:
    """Build a SentenceTransformer for ``model_name`` on the detected device.

    Args:
        model_name: Model identifier handed to ``SentenceTransformer``.
        attn: Attention implementation name. Only values containing the
            substring ``"flash"`` are forwarded to the model (together with a
            bfloat16 ``torch_dtype``); any other value, including the default
            ``"sdpa"``, results in empty ``model_kwargs``.

    Returns:
        The constructed ``SentenceTransformer``, created with
        ``trust_remote_code=True`` on the device returned by ``detect_device()``.
    """
    extra_kwargs: dict[str, Any] = {}
    if "flash" in attn:
        # torch is imported lazily; it is only needed here to name the dtype.
        # NOTE(review): presumably bfloat16 is set because flash-attention
        # kernels need a half-precision dtype -- confirm.
        import torch

        extra_kwargs.update(attn_implementation=attn, torch_dtype=torch.bfloat16)

    return SentenceTransformer(
        model_name,
        device=detect_device(),
        model_kwargs=extra_kwargs,
        trust_remote_code=True,
    )

In [162]:
# Instantiate every model compared in this notebook. The granite small r2
# checkpoint is loaded twice -- once with "flash_attention_2" and once with
# "sdpa" -- so both attention settings are available for the same model name.
granite_model_r2 = create_st_model("ibm-granite/granite-embedding-english-r2", "sdpa")
granite_model_r1 = create_st_model("ibm-granite/granite-embedding-125m-english", "sdpa")
granite_model_r2s = create_st_model("ibm-granite/granite-embedding-small-english-r2", "flash_attention_2")
granite_model_r2s_sdpa = create_st_model("ibm-granite/granite-embedding-small-english-r2", "sdpa")
gte_mbert = create_st_model("Alibaba-NLP/gte-modernbert-base", "flash_attention_2")

README.md: 0.00B [00:00, ?B/s]

In [156]:
# Four short English sentences used as the shared encoding input;
# test_granite_model reads this module-level list.
text = ["This is some trial for stuff. How are we doing?",
        "Today is Monday, it's going to be a long week.",
        "The weather outside is beautiful and sunny today.",
        "I need to finish this project before the deadline tomorrow."
        ]
def test_granite_model(model):
    res1 = model.encode(text, batch_size=1, show_progress_bar=True, normalize_embeddings=True)
    res2 = model.encode(text, batch_size=2, show_progress_bar=True, normalize_embeddings=True)
    res4 = model.encode(text, batch_size=4, show_progress_bar=True, normalize_embeddings=True)
    return res1, res2, res4

In [54]:
def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors using only numpy.

    Args:
        vec1: First vector (1-D array-like).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector has
        zero norm the ratio is undefined, so ``0.0`` is returned instead of
        letting the 0/0 division produce NaN and a RuntimeWarning.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # A zero vector has no direction; report "no similarity" rather than NaN.
    if norm_product == 0:
        return 0.0

    return dot_product / norm_product


In [163]:
# Encode the sample sentences with every model. Each *_res value is the tuple
# returned by test_granite_model: one encode result per batch size (1, 2, 4).
r1_res = test_granite_model(granite_model_r1)
r2_res = test_granite_model(granite_model_r2)
gte_res = test_granite_model(gte_mbert)
r2s_res = test_granite_model(granite_model_r2s)
r2s_res_sdpa = test_granite_model(granite_model_r2s_sdpa)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

  return torch._C._get_cublas_allow_tf32()


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The `compare_embeddings` function below compares two embedding arrays and, in addition to the difference statistics, reports the cosine similarity between corresponding embeddings.



In [91]:

def compare_embeddings(emb1, emb2):
    """Print summary statistics describing how two embedding arrays differ.

    Reports, in order: exact equality, closeness under ``np.allclose``
    (rtol=1e-05, atol=1e-07), the largest absolute element difference, the
    summed absolute difference divided by ``len(emb1)``, ``np.linalg.norm`` of
    the difference, and the cosine similarity of each same-index pair of rows
    together with their mean.

    Args:
        emb1: First embedding array.
        emb2: Second embedding array; must support ``emb1 - emb2``.

    Returns:
        None. All results are written to stdout.
    """
    print(f"Are embeddings identical? {np.array_equal(emb1, emb2)}")
    print(f"Are embeddings close (within tolerance)? {np.allclose(emb1, emb2, rtol=1e-05, atol=1e-07)}")

    # All difference-based statistics share a single subtraction.
    delta = emb1 - emb2
    abs_delta = np.abs(delta)
    print(f"Maximum absolute difference: {np.max(abs_delta)}")

    # Total absolute difference averaged over the first axis of emb1.
    per_row_l1 = np.sum(abs_delta) / len(emb1)
    print(f"L1 norm of the difference: {per_row_l1}")

    print(f"L2 norm of the difference: {np.linalg.norm(delta)}")

    # Cosine similarity is taken pairwise between same-index rows.
    pair_sims = [cosine_similarity(row_a, row_b) for row_a, row_b in zip(emb1, emb2)]
    avg_sim = np.sum(pair_sims) / len(pair_sims)
    sims_as_floats = list(map(float, pair_sims))
    print(f"Mean cosine similarity: {avg_sim}\nCosine similarities: {sims_as_floats}")


In [92]:
# gte-modernbert: batch-size-1 result (index 0) vs batch-size-2 result (index 1).
compare_embeddings(gte_res[0], gte_res[1])

Are embeddings identical? False
Are embeddings close (within tolerance)? False
Maximum absolute difference: 0.011104419827461243
L1 norm of the difference: 1.9403321743011475
L2 norm of the difference: 0.1759977638721466
Mean cosine similarity: 0.9999974370002747
Cosine similarities: [0.9999982118606567, 0.999997615814209, 0.9999970197677612, 0.9999968409538269]


In [93]:
# granite r2: batch-size-1 result (index 0) vs batch-size-2 result (index 1).
compare_embeddings(r2_res[0], r2_res[1])

Are embeddings identical? False
Are embeddings close (within tolerance)? False
Maximum absolute difference: 0.007903128862380981
L1 norm of the difference: 1.2724721431732178
L2 norm of the difference: 0.11545548588037491
Mean cosine similarity: 0.9999982118606567
Cosine similarities: [0.9999983310699463, 0.9999976754188538, 0.9999985098838806, 0.9999984502792358]


The function below compares the embeddings produced at different batch sizes (1 vs 2, 2 vs 4, and 1 vs 4) for a result tuple such as `r2_res`, and collects the statistics in a pandas DataFrame.



In [173]:
import pandas as pd


def compute_batch_statistics(results_tuple, model_name):
    """
    Compute statistics for different batch size comparisons and return as pandas DataFrame.

    Args:
        results_tuple: Tuple containing (batch_size_1_results, batch_size_2_results, batch_size_4_results)
        model_name: Label stored in the ``model_name`` column of every returned row.

    Returns:
        pandas.DataFrame: One row per comparison ("1_vs_2", "2_vs_4", "1_vs_4") with
        columns ``model_name``, ``batch size``, ``max_absolute_diff``, ``l1_diff``,
        ``l2_diff`` and ``mean_cosine_similarity``.
    """
    res1, res2, res4 = results_tuple

    # Pairs of result arrays to compare, labelled by the batch sizes involved.
    comparisons = [
        ("1_vs_2", res1, res2),
        ("2_vs_4", res2, res4),
        ("1_vs_4", res1, res4),
    ]

    stats_list = []

    for comparison_name, emb1, emb2 in comparisons:
        # Subtract once; every difference-based statistic reuses this array.
        diff = emb1 - emb2

        # Per-row norms of the difference, averaged over the rows.
        l1_diff_norm = np.mean([np.linalg.norm(row, ord=1) for row in diff])
        l2_diff_norm = np.mean([np.linalg.norm(row, ord=2) for row in diff])

        # Cosine similarity between same-index rows, averaged over the rows.
        cos_similarities = [cosine_similarity(e1, e2) for e1, e2 in zip(emb1, emb2)]

        stats_list.append({
            'model_name': model_name,
            'batch size': comparison_name,
            'max_absolute_diff': np.max(np.abs(diff)),
            'l1_diff': l1_diff_norm,
            'l2_diff': l2_diff_norm,
            'mean_cosine_similarity': np.mean(cos_similarities),
        })

    return pd.DataFrame(stats_list)


# Try the function on the granite r2 results; the bare name on the last line
# makes the notebook display the resulting DataFrame.
r2_stats = compute_batch_statistics(r2_res, "ge_r2")
r2_stats


Unnamed: 0,model_name,batch size,max_absolute_diff,l1_diff,l2_diff,mean_cosine_similarity
0,ge_r2,1_vs_2,0.000259,0.041605,0.001881,0.999998
1,ge_r2,2_vs_4,0.000283,0.038611,0.001745,0.999998
2,ge_r2,1_vs_4,0.000234,0.040601,0.001848,0.999998


In [175]:
# Results for the remaining models; dict order determines row order after the
# granite r2 rows.
results = {"ge_r1": r1_res,
           "ge_r2s": r2s_res,
           "ge_r2s_sdpa": r2s_res_sdpa,
           "gte": gte_res
           }

# Build every per-model frame first and concatenate once, rather than
# re-copying the accumulated frame on each loop iteration.
r2_stats = pd.concat(
    [compute_batch_statistics(r2_res, "ge_r2")]
    + [compute_batch_statistics(res, name) for name, res in results.items()],
    ignore_index=True,
)

r2_stats


Unnamed: 0,model_name,batch size,max_absolute_diff,l1_diff,l2_diff,mean_cosine_similarity
0,ge_r2,1_vs_2,0.0002593286,0.041605,0.001880517,0.999998
1,ge_r2,2_vs_4,0.0002826452,0.038611,0.001744506,0.999998
2,ge_r2,1_vs_4,0.0002336079,0.040601,0.001847949,0.999998
3,ge_r1,1_vs_2,8.940697e-08,6e-06,2.923165e-07,1.0
4,ge_r1,2_vs_4,5.960464e-08,7e-06,3.055415e-07,1.0
5,ge_r1,1_vs_4,5.960464e-08,7e-06,3.059082e-07,1.0
6,ge_r2s,1_vs_2,0.00390625,0.119915,0.008263242,0.999973
7,ge_r2s,2_vs_4,0.0,0.0,0.0,1.0
8,ge_r2s,1_vs_4,0.00390625,0.119915,0.008263242,0.999973
9,ge_r2s_sdpa,1_vs_2,0.000260381,0.023137,0.0014721,0.999999


In [180]:
# Batch-size comparison statistics for the gte-modernbert results on their own.
compute_batch_statistics(gte_res, "gte")

Unnamed: 0,model_name,batch size,max_absolute_diff,l1_diff,l2_diff,mean_cosine_similarity
0,gte,1_vs_2,0.001953,0.203721,0.009682,0.999953
1,gte,2_vs_4,0.001953,0.228249,0.010815,0.999942
2,gte,1_vs_4,0.003906,0.222909,0.010637,0.999944


In [147]:
# L1 norm of the difference between the first rows of the batch-size-1 and
# batch-size-2 granite r2 results.
np.linalg.norm(r2_res[0][0]-r2_res[1][0], ord=1)

np.float32(1.2654071)

In [148]:
# Element-wise square of the first ten entries of `aa`.
# NOTE(review): `aa` is not defined in any visible cell -- this fails on
# Restart & Run All unless it is defined elsewhere; TODO confirm its origin.
aa[:10]*aa[:10]

array([3.8146973e-06, 1.5258789e-05, 0.0000000e+00, 9.7656250e-04,
       3.4332275e-05, 2.2912025e-04, 6.1035156e-05, 0.0000000e+00,
       0.0000000e+00, 6.1035156e-05], dtype=float32)

In [149]:
# First ten entries of `aa`.
# NOTE(review): `aa` is not defined in any visible cell -- this fails on
# Restart & Run All unless it is defined elsewhere; TODO confirm its origin.
aa[:10]

array([0.00195312, 0.00390625, 0.        , 0.03125   , 0.00585938,
       0.01513672, 0.0078125 , 0.        , 0.        , 0.0078125 ],
      dtype=float32)

In [150]:
# Scratch arithmetic: squares the literal 1.5604496e-03.
# NOTE(review): the origin of this value is not shown -- TODO document or remove.
1.5604496e-03**2

2.43500295414016e-06

In [159]:
# L2 norm of the first row of the batch-size-1 gte-modernbert result.
np.linalg.norm(gte_res[0][0], ord=2)

np.float32(1.0012155)