## Hugging Face

Hugging Face embeddings. Each cell should be run only once to download the embeddings into its respective file.
We only collect the German items here.

In [1]:
# required packages
# !pip install pandas
# !pip install haystack-ai

# common imports
import os
import pandas as pd

# Load the scale items.
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in
# the file -- confirm this matches how scales.csv was written.
data = pd.read_csv('scales.csv', encoding='unicode_escape')

# additional check on the scales converted here; .copy() detaches the
# filtered rows from the frame returned by read_csv, so adding a column to
# `data` later cannot raise pandas' SettingWithCopyWarning
data = data[data['scaleID'].isin(['PID', 'NEO'])].copy()

# if the scale items are language specific, rename the German column to
# 'item' and drop the English column
if 'item_de' in data.columns:
    data = data.rename(columns={'item_de': 'item'})
    # errors='ignore': a file with only 'item_de' must not fail with KeyError
    data = data.drop(columns=['item_en'], errors='ignore')

## Roberta

In [None]:
# required imports
# !pip install sentence-transformers

# Multilingual RoBERTa embeddings of the items. This does not work under
# Windows; use Google Colab instead.
roberta_multilingual_file = 'embeddings_roberta_multilingual_de.csv'

# refuse to overwrite a file that has already been written
if os.path.exists(roberta_multilingual_file):
    raise FileExistsError(
        'The file embeddings_roberta_multilingual_de.csv already exists. '
        'Please remove it before running this script again.'
    )

# imported only after the check above, so an existing file is reported first
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer(
    "tomaarsen/xlm-roberta-base-multilingual-en-ar-fr-de-es-tr-it"
)

# Run inference on all items
sentences = data['item'].tolist()
embeddings = model.encode(sentences)

# store the embeddings as plain lists in a new column and write the file
data['embedding'] = embeddings.tolist()
data.to_csv(roberta_multilingual_file, index=False)


In [None]:
# This is the English version of roberta. This works fine under windows.
# NOTE(review): `data` holds the German items whenever scales.csv provides an
# 'item_de' column -- confirm that this model and the '_en' file name are
# what is intended here.

# required packages
# !pip install haystack-ai
# !pip install huggingface-hub

from haystack.components.embedders import HuggingFaceTEITextEmbedder
from haystack.utils import Secret

# refuse to overwrite 'embeddings_roberta_en.csv' if it already exists; the
# message names the same file that is checked
if os.path.exists('embeddings_roberta_en.csv'):
    raise FileExistsError(
        'The file embeddings_roberta_en.csv already exists. '
        'Please remove it before running this script again.'
    )

# retrieve embeddings from roberta from hugging face
# https://huggingface.co/sentence-transformers/all-roberta-large-v1
# (the API token is read from the HF_API_KEY environment variable)
text_embedder = HuggingFaceTEITextEmbedder(
    model="sentence-transformers/all-roberta-large-v1",
    token=Secret.from_token(os.environ['HF_API_KEY'])
)

# one embedder call per item; only the 'embedding' entry of each result is kept
embeddings = [text_embedder.run(item)['embedding'] for item in data['item']]

# save embeddings to the 'embeddings_roberta_en.csv' file
data['embedding'] = embeddings
data.to_csv('embeddings_roberta_en.csv', index=False)


## Other Hugging Face models

In [None]:
# A standard open source model

# required packages
# !pip install haystack-ai
# !pip install huggingface-hub

from haystack.components.embedders import HuggingFaceTEITextEmbedder
from haystack.utils import Secret

# API token, read from the HF_API_KEY environment variable
hf_token = Secret.from_token(os.environ['HF_API_KEY'])

# a model recommended for its performance (https://www.sbert.net/docs/pretrained_models.html)
text_embedder = HuggingFaceTEITextEmbedder(
    model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    token=hf_token,
)

# one embedder call per item; keep only the 'embedding' entry of each result
embeddings = [
    text_embedder.run(sentence)['embedding'] for sentence in data['item']
]

# attach the embeddings to the items and save them to file
data['embedding'] = embeddings
output_file = 'embeddings_mpnetbasev2_de.csv'
data.to_csv(output_file, index=False)


In [2]:
# Another standard open source model

# required packages
# !pip install haystack-ai
# !pip install huggingface-hub

from haystack.components.embedders import HuggingFaceTEITextEmbedder
from haystack.utils import Secret

# API token, read from the HF_API_KEY environment variable
hf_token = Secret.from_token(os.environ['HF_API_KEY'])

# a model recommended by OpenSearch
text_embedder = HuggingFaceTEITextEmbedder(
    model="sentence-transformers/distiluse-base-multilingual-cased-v1",
    token=hf_token,
)

# one embedder call per item; keep only the 'embedding' entry of each result
embeddings = [
    text_embedder.run(sentence)['embedding'] for sentence in data['item']
]

# attach the embeddings to the items and save them to file
data['embedding'] = embeddings
output_file = 'embeddings_distiluse_cased_v1_de.csv'
data.to_csv(output_file, index=False)


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Another open source model

# required packages
# !pip install haystack-ai
# !pip install huggingface-hub

from haystack.components.embedders import HuggingFaceTEITextEmbedder
from haystack.utils import Secret

# API token, read from the HF_API_KEY environment variable
hf_token = Secret.from_token(os.environ['HF_API_KEY'])

text_embedder = HuggingFaceTEITextEmbedder(
    model="BAAI/bge-m3",
    token=hf_token,
)

# one embedder call per item; keep only the 'embedding' entry of each result
embeddings = [
    text_embedder.run(sentence)['embedding'] for sentence in data['item']
]

# attach the embeddings to the items and save them to file
data['embedding'] = embeddings
output_file = 'embeddings_BGEM3_de.csv'
data.to_csv(output_file, index=False)


In [11]:
# required packages
# !pip install haystack-ai
# !pip install huggingface-hub

from haystack.components.embedders import HuggingFaceTEITextEmbedder
from haystack.utils import Secret

# API token, read from the HF_API_KEY environment variable
hf_token = Secret.from_token(os.environ['HF_API_KEY'])

text_embedder = HuggingFaceTEITextEmbedder(
    model="intfloat/multilingual-e5-large-instruct",
    token=hf_token,
)

# one embedder call per item; keep only the 'embedding' entry of each result
embeddings = [
    text_embedder.run(sentence)['embedding'] for sentence in data['item']
]

# attach the embeddings to the items and save them to file
data['embedding'] = embeddings
output_file = 'embeddings_multilingual_e5_large_instruct_de.csv'
data.to_csv(output_file, index=False)


In [10]:
# required packages
# !pip install haystack-ai
# !pip install huggingface-hub

from haystack.components.embedders import HuggingFaceTEITextEmbedder
from haystack.utils import Secret

# API token, read from the HF_API_KEY environment variable
hf_token = Secret.from_token(os.environ['HF_API_KEY'])

text_embedder = HuggingFaceTEITextEmbedder(
    model="jinaai/jina-embeddings-v2-base-de",
    token=hf_token,
)

# one embedder call per item; keep only the 'embedding' entry of each result
embeddings = [
    text_embedder.run(sentence)['embedding'] for sentence in data['item']
]

# attach the embeddings to the items and save them to file
data['embedding'] = embeddings
output_file = 'embeddings_jinav2base_de.csv'
data.to_csv(output_file, index=False)
