In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local source files (e.g. under src/) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [9]:
from sentence_transformers import (
    SentenceTransformer,
)
import pandas as pd
from src.evaluate_bert import get_embeddings
from functools import partial
from scipy.spatial.distance import cosine


In [4]:
# Load the English dev CSV; later cells read its 'Text' and 'Score' columns.
df_eng = pd.read_csv('semrel_baselines/data/eng/eng_dev.csv')

In [8]:
# Keep only the rows whose 'Score' column equals 0.19.
score_is_0_19 = df_eng['Score'].eq(0.19)
df_tmp = df_eng.loc[score_is_0_19]

In [10]:
# The 'Text' field of the first selected row is split on the newline into
# exactly two sentences.
first_pair_text = df_tmp['Text'].iloc[0]
sentence_1, sentence_2 = first_pair_text.split('\n')

In [13]:
# Show the two sentences of the selected pair, one per line.
print(sentence_1, sentence_2, sep='\n')

Two women standing by a table of vegetables.
Two dirt bike rides during a competition.


In [14]:
# Load the 'sentence-transformers/LaBSE' model; used below as the baseline encoder.
model = SentenceTransformer('sentence-transformers/LaBSE')

In [20]:
# Load the model saved at the local 'finetuned_labse' path
# (presumably LaBSE fine-tuned for this task - confirm against the training script).
model_finetuned = SentenceTransformer('semrel_baselines/models/finetuned_labse')

In [15]:
# Encode both sentences of the pair with the baseline LaBSE `model`.
embeddings = model.encode([sentence_1, sentence_2])

In [8]:
# calculate cosine similarity between the two baseline LaBSE embeddings
# (requires the cell that defines `embeddings` to have been run first)
embedding_1, embedding_2 = embeddings
1 - cosine(embedding_1, embedding_2)

NameError: name 'embeddings' is not defined

In [25]:
# Encode the same pair with `model_finetuned` and display the cosine
# similarity (1 - cosine distance) of the two embeddings.
sentence_pair = [sentence_1, sentence_2]
embeddings_finetuned = model_finetuned.encode(sentence_pair)
embeddings_1_finetuned, embeddings_2_finetuned = embeddings_finetuned
cosine_distance_finetuned = cosine(embeddings_1_finetuned, embeddings_2_finetuned)
1 - cosine_distance_finetuned

0.23601487278938293

In [22]:
# Embed the same sentence pair through the project's `get_embeddings` helper,
# using the 'bert-base-uncased' model name.
bert_embeddings = get_embeddings(model_name='bert-base-uncased', sentences=[sentence_1, sentence_2])

In [26]:
# Unpack the two BERT embeddings and display their cosine similarity
# (1 - cosine distance).
bert_embedding_1, bert_embedding_2 = bert_embeddings
bert_cosine_distance = cosine(bert_embedding_1, bert_embedding_2)
1 - bert_cosine_distance

0.6278668642044067

In [None]:
# Draw exactly 3 random rows from df_eng. No random_state is passed, so this
# cell does not pin which rows are drawn.
df_temp = df_eng.sample(n=3)


In [5]:
def dump_cosine_scores(model_name, model_type, df):
    """Score every sentence pair in ``df`` with one embedding model.

    Each ``Text`` value is split on the newline into two sentences, both are
    embedded, and the cosine similarity (``1 - cosine distance``) of the two
    embeddings is recorded next to the row's original ``Score``.

    Args:
        model_name: Passed to ``SentenceTransformer`` when ``model_type`` is
            ``"sbert"``, or to ``get_embeddings`` as ``model_name`` when it is
            ``"bert"``.
        model_type: ``"sbert"`` or ``"bert"``; selects the embedding backend.
        df: DataFrame with a ``Text`` column (two newline-separated sentences
            per row) and a ``Score`` column.

    Returns:
        A new DataFrame with columns ``Text``, ``Score`` and
        ``cosine_similarity``, one row per input row, indexed 0..n-1.
        An empty input yields an empty frame that still has these columns.

    Raises:
        ValueError: If ``model_type`` is neither ``"sbert"`` nor ``"bert"``,
            or if a ``Text`` value does not split into exactly two parts.
    """
    # Resolve the backend once into a single "list of sentences -> embeddings"
    # callable, so the loop does not have to branch on model_type per row.
    if model_type == "sbert":
        embed_pair = SentenceTransformer(model_name).encode
    elif model_type == "bert":
        bert_embed = partial(get_embeddings, model_name=model_name)

        def embed_pair(sentences):
            return bert_embed(sentences=sentences)

    else:
        raise ValueError("model_type should be either sbert or bert")

    # Collect plain records and build the frame once; concatenating a
    # DataFrame per row copies all previous rows on every iteration.
    records = []
    for _, row in df.iterrows():
        sentence_1, sentence_2 = row["Text"].split("\n")
        embedding_1, embedding_2 = embed_pair([sentence_1, sentence_2])
        records.append(
            {
                "Text": row["Text"],
                "Score": row["Score"],
                "cosine_similarity": 1 - cosine(embedding_1, embedding_2),
            }
        )

    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(records, columns=["Text", "Score", "cosine_similarity"])

In [34]:
# Score one shared random sample of 10 English pairs with each of the three
# models, so the resulting frames are directly comparable row by row.
df_eng_selected = df_eng.sample(n=10)
score_eng_sample = partial(dump_cosine_scores, df=df_eng_selected)
df_fin_labse = score_eng_sample(
    model_name='sentence-transformers/LaBSE',
    model_type='sbert',
)
df_fin_labse_finetuned = score_eng_sample(
    model_name='semrel_baselines/models/finetuned_labse',
    model_type='sbert',
)
df_fin_bert = score_eng_sample(
    model_name='bert-base-uncased',
    model_type='bert',
)

In [37]:
# Persist the three English score tables as CSVs without the index column.
for scores_frame, csv_path in (
    (df_fin_labse, 'semrel_baselines/analysis_files/eng_labse.csv'),
    (df_fin_labse_finetuned, 'semrel_baselines/analysis_files/eng_labse_finetuned.csv'),
    (df_fin_bert, 'semrel_baselines/analysis_files/eng_bert.csv'),
):
    scores_frame.to_csv(csv_path, index=False)

In [3]:
def create_analysis_files(
    df_path,
    bert_model_name,
    lang_code,
    n_samples=10,
    random_state=None,
    output_dir="semrel_baselines/analysis_files",
):
    """Sample sentence pairs from a CSV and dump per-model cosine scores.

    The same sample is scored with ``sentence-transformers/LaBSE``, with the
    model stored at ``semrel_baselines/models/finetuned_labse`` and with
    ``bert_model_name`` (via ``dump_cosine_scores``). Each result is written
    to ``<output_dir>/<lang_code>_{labse,labse_finetuned,bert}.csv``.

    Args:
        df_path: Path of the CSV to read; it must provide the ``Text`` and
            ``Score`` columns that ``dump_cosine_scores`` consumes.
        bert_model_name: Model name used for the ``"bert"`` run.
        lang_code: Prefix for the three output file names.
        n_samples: Number of rows to sample. Capped at the number of rows in
            the file, so small files no longer make ``DataFrame.sample`` raise.
        random_state: Forwarded to ``DataFrame.sample``; pass an int to get
            the same sample on every run.
        output_dir: Directory for the CSVs; created if it does not exist.

    Returns:
        None. The results are the three CSV files.
    """
    import os  # local import: keeps this cell runnable without editing the import cell

    df = pd.read_csv(df_path)
    df_selected = df.sample(n=min(n_samples, len(df)), random_state=random_state)

    # (output-file suffix, model name, backend type) for each run.
    model_runs = (
        ("labse", "sentence-transformers/LaBSE", "sbert"),
        ("labse_finetuned", "semrel_baselines/models/finetuned_labse", "sbert"),
        ("bert", bert_model_name, "bert"),
    )

    # Score with every model before writing anything, so a failing model
    # does not leave a partial set of files behind.
    scored = [
        (
            suffix,
            dump_cosine_scores(model_name=model_name, model_type=model_type, df=df_selected),
        )
        for suffix, model_name, model_type in model_runs
    ]

    os.makedirs(output_dir, exist_ok=True)
    for suffix, scores in scored:
        scores.to_csv(os.path.join(output_dir, f"{lang_code}_{suffix}.csv"), index=False)

In [10]:
# Write the three analysis CSVs (prefix 'pan') for the pan dev split, using
# 'bert-base-multilingual-cased' for the BERT run.
create_analysis_files('semrel_baselines/data/pan/pan_dev.csv', 'bert-base-multilingual-cased', 'pan')