In [1]:

import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
from xl_durel_utils.core import tokenize_truncate_decode, calculate_spearman, calculate_krippendorff, plot
from sentence_transformers import SentenceTransformer, models
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sentence encoder and its tokenizer from one shared checkpoint
# identifier, so the two can never drift apart.
MODEL_NAME = "sachinn1/xl-durel"

model = SentenceTransformer(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
# Load the development and test splits.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
DEV_PATH = "dev.pkl"
TEST_PATH = "test.pkl"

dev_df = pd.read_pickle(DEV_PATH)
test_df = pd.read_pickle(TEST_PATH)



In [4]:

# Compute cosine similarities
def compute_similarity(df: pd.DataFrame, max_seq_len: int = 128) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``similarity`` column of cosine scores.

    For every row, both sentences are passed through
    ``tokenize_truncate_decode`` together with their target positions and the
    module-level ``tokenizer``; the two resulting contexts are encoded with
    the module-level ``model`` and the cosine similarity of the embeddings is
    recorded.

    Args:
        df: DataFrame providing the columns ``sentence1``, ``sentence2``,
            ``position1`` and ``position2``.
        max_seq_len: Forwarded to ``tokenize_truncate_decode`` as its
            ``max_seq_len`` argument. Defaults to 128, the value that was
            previously hard-coded.

    Returns:
        A copy of ``df`` (the input is not modified) with an added
        ``similarity`` column. Rows whose processing raised an exception are
        reported on stdout and hold ``NaN``.
    """
    similarities = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Computing similarities"):
        sent1 = row["sentence1"]
        sent2 = row["sentence2"]
        pos1 = row["position1"]
        pos2 = row["position2"]

        # Start from NaN so a failing row can neither raise NameError (when it
        # is the first row) nor silently reuse the previous row's score.
        sim = float("nan")

        try:
            context1 = tokenize_truncate_decode(sent1, pos1, tokenizer, max_seq_len=max_seq_len)
            context2 = tokenize_truncate_decode(sent2, pos2, tokenizer, max_seq_len=max_seq_len)
            emb1 = model.encode(context1, convert_to_tensor=True)
            emb2 = model.encode(context2, convert_to_tensor=True)

            # cosine_similarity works on 2-D arrays, hence the added leading
            # axis; the single score is at [0][0] of the returned matrix.
            sim = cosine_similarity(
                emb1.unsqueeze(0).cpu().numpy(),
                emb2.unsqueeze(0).cpu().numpy()
            )[0][0]

        except Exception as e:
            # Best-effort: report the failure and keep the NaN placeholder so
            # the result stays aligned with the input rows.
            print(f"Failed : {e}")

        similarities.append(sim)

    result = df.copy()
    result["similarity"] = similarities
    return result


# Score the dev split first, then the test split; neither input frame is modified.
dev_df_with_sim, test_df_with_sim = map(compute_similarity, (dev_df, test_df))


Computing similarities:   0%|          | 0/13925 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Computing similarities:  40%|████      | 5634/13925 [01:44<02:29, 55.46it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (134 > 128). Running this sequence through the model will result in indexing errors
Computing similarities: 100%|██████████| 13925/13925 [04:20<00:00, 53.45it/s]
Computing similarities: 100%|██████████| 21732/21732 [06:47<00:00, 53.35it/s]


In [5]:
# Both metrics are reported per (dataset, language) group.
GROUP_COLS = ("dataset", "language")

# Correlation is computed on the test split only; the Krippendorff helper
# receives both the dev and the test split.
df1 = calculate_spearman(test_df_with_sim, list(GROUP_COLS))
df2 = calculate_krippendorff(dev_df_with_sim, test_df_with_sim, list(GROUP_COLS))

display(df1, df2)


('comedi', 'chinese') [-inf, 0.576806630812142, 0.6765000612679907, 0.7927114823752207, inf]
('comedi', 'english') [-inf, 0.32535891662107164, 0.48300447544899106, 0.6115888364510855, inf]
('comedi', 'german') [-inf, 0.3299749532136552, 0.46455515154726945, 0.5999736991576248, inf]
('comedi', 'norwegian') [-inf, 0.20956338471740604, 0.3385892552575941, 0.48845441548495633, inf]
('comedi', 'russian') [-inf, 0.25538998762788334, 0.490667532441681, 0.6147419051133813, inf]
('comedi', 'spanish') [-inf, 0.29708450228418737, 0.520818010559206, 0.6276640118867141, inf]
('comedi', 'swedish') [-inf, 0.2898122767288841, 0.45185368542721416, 0.5643789065263318, inf]
('mcl-wic', 'arabic') [-inf, 0.6343779718037692, inf]
('mcl-wic', 'chinese') [-inf, 0.7655562194762747, inf]
('mcl-wic', 'english') [-inf, 0.6679675322375264, inf]
('mcl-wic', 'french') [-inf, 0.6227152441162618, inf]
('mcl-wic', 'russian') [-inf, 0.5935577353578995, inf]
('wic', 'english') [-inf, 0.5514793525217101, inf]


Unnamed: 0,dataset,language,pearson,spearman
0,comedi,chinese,0.454538,0.392829
1,comedi,english,0.755052,0.749389
2,comedi,german,0.759791,0.755932
3,comedi,norwegian,0.665988,0.45214
4,comedi,russian,0.715791,0.606997
5,comedi,spanish,0.808046,0.741644
6,comedi,swedish,0.743735,0.609296
7,mcl-wic,arabic,0.721826,0.731314
8,mcl-wic,chinese,0.728834,0.732658
9,mcl-wic,english,0.82852,0.818242


Unnamed: 0,dataset,language,krippendorff_alpha
0,comedi,chinese,0.444489
1,comedi,english,0.733866
2,comedi,german,0.738941
3,comedi,norwegian,0.649443
4,comedi,russian,0.656424
5,comedi,spanish,0.759724
6,comedi,swedish,0.685712
7,mcl-wic,arabic,0.675928
8,mcl-wic,chinese,0.692094
9,mcl-wic,english,0.836066
