In [1]:
import os
import json
import gzip
import pandas as pd
from src.embeddings import ModelRegistry
from src.embeddings import EmbeddingPipeline

In [2]:
# Identifiers of every model exposed by the registry; each one is embedded below.
considered_models = [model_key for model_key in ModelRegistry.list_models().keys()]
considered_models

['multilingual-e5-large',
 'bge-m3',
 'gte-multilingual-base',
 'jina-embeddings-v3',
 'snowflake-arctic-embed-l-v2.0',
 'labse',
 'use-multilingual',
 'xlm-roberta-large']

In [3]:
# Project-relative locations of the input data and of the generated artefacts.
data_root = "../data/multipride_data/"
figures_root = "../figures/"
embeddings_root = "../embeddings/"

# Output directories are created up front so later cells can write into them.
os.makedirs(figures_root, exist_ok=True)
os.makedirs(embeddings_root, exist_ok=True)

# Training splits: CSV files in the data directory whose name contains "train".
# Note: os.listdir returns entries in arbitrary order, so the order of this list
# (and therefore of the concatenated rows) is not guaranteed across machines.
train_files = [
    file
    for file in os.listdir(data_root)
    if "train" in file and file.endswith(".csv")
]
train_files

['train_es.csv', 'train_it.csv', 'train_en.csv']

In [4]:
# Load every training split and stack them into one DataFrame.
#
# The per-file frames are collected in a list and concatenated once at the end.
# Growing the frame with pd.concat inside the loop copies all previously loaded
# rows on every iteration, and seeding the accumulator with an empty DataFrame
# relies on pandas' deprecated handling of empty entries in concat.
split_frames = []

for file in train_files:
    temp_df = pd.read_csv(os.path.join(data_root, file))
    if "en" in file:
        # Files whose name contains "en" get a `bio` column filled with None so
        # the concatenated frame has a `bio` value for every row.
        # NOTE(review): presumably the English split ships without bios, and the
        # substring test is only meant to match `train_en.csv` - confirm.
        temp_df["bio"] = [None] * temp_df.shape[0]
    split_frames.append(temp_df)

# pd.concat raises ValueError on an empty list; fall back to an empty frame so
# the cell still reports zero samples when no training file was found.
train_df = pd.concat(split_frames, ignore_index=True) if split_frames else pd.DataFrame()

print(f"Total training samples: {train_df.shape[0]}")

Total training samples: 2988


In [5]:
# Preview the first five rows of the combined training frame.
train_df.head(5)

Unnamed: 0,id,text,bio,label,lang
0,es_1850,28 de Junio - Día Internacional del Orgullo LG...,Doblajes Para Videojuegos que nunca tuvieron D...,0,es
1,es_773,"@USER no me gusta la Montero, por su apoyo a l...","Activista, sindicalista, madre y parte de la R...",0,es
2,es_1899,Es la semana del #GayPride y la dedicaré al #Q...,Pintor daltónico que habla de arte. Confundo e...,0,es
3,es_685,@USER @USER @USER A la carles vais los #TRANSF...,mujer Algemesí Valencia Telegram @USER\n+34 62...,0,es
4,es_1717,"Hoy a las 00:10 en TVE2, estreno del documenta...",Comunidad LGTBI+ sin ánimo de lucro. Reivindic...,0,es


In [6]:
# Parallel lists in the row order of train_df: raw texts, sample ids, and labels
# coerced to plain Python ints.
texts = train_df["text"].tolist()
ids = train_df["id"].tolist()
labels = [int(raw_label) for raw_label in train_df["label"]]

In [7]:
# Compute embeddings of `texts` with every model in `considered_models` and cache
# them as gzip-compressed JSON, one file per model, under `embeddings_root`.
# Each file maps a sample id to {"emb": <embedding as a list>, "label": <int>}.
#
# The pipeline is created lazily on the first model that actually needs encoding
# and is then reused via switch_model for the remaining ones.
pipeline = None
for considered_model in considered_models:
    gzip_path = os.path.join(embeddings_root, considered_model + ".json.gz")

    # Embeddings for this model are already cached on disk: skip it.
    if os.path.exists(gzip_path):
        continue

    if pipeline is None:
        pipeline = EmbeddingPipeline(model_key=considered_model)
    else:
        pipeline.switch_model(considered_model)

    text_embeddings = pipeline.encode(texts, batch_size=32, show_progress_bar=True)
    # Converted to nested Python lists so the embeddings are JSON-serialisable.
    text_embeddings = text_embeddings.tolist()

    embeddings_dict = {}
    for text_id, emb, label in zip(ids, text_embeddings, labels):
        embeddings_dict[text_id] = {"emb": emb, "label": label}

    # Write to a temporary file and move it into place only once the dump has
    # completed. The existence check above is the sole cache-validity test, so
    # writing straight to `gzip_path` would let an interrupted run leave a
    # truncated file that every later run treats as a finished cache.
    partial_path = gzip_path + ".tmp"
    with gzip.open(partial_path, 'wt', encoding='utf-8') as f:
        json.dump(embeddings_dict, f)
    os.replace(partial_path, gzip_path)
    

Loading Multilingual E5 Large (intfloat/multilingual-e5-large)...
Model: Multilingual E5 Large
Device: cuda
Embedding Dimension: 1024
Max Sequence Length: 512


Batches:   0%|          | 0/94 [00:00<?, ?it/s]


Switching model from 'multilingual-e5-large' to 'bge-m3'...
Model: BGE-M3
Device: cuda
Embedding Dimension: 1024
Max Sequence Length: 8192


Batches:   0%|          | 0/94 [00:00<?, ?it/s]


Switching model from 'bge-m3' to 'gte-multilingual-base'...


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model: GTE Multilingual Base
Device: cuda
Embedding Dimension: 768
Max Sequence Length: 8192


Batches:   0%|          | 0/94 [00:00<?, ?it/s]


Switching model from 'gte-multilingual-base' to 'jina-embeddings-v3'...


`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention impl

Model: Jina Embeddings v3
Device: cuda
Embedding Dimension: 1024
Max Sequence Length: 8192


Batches:   0%|          | 0/94 [00:00<?, ?it/s]


Switching model from 'jina-embeddings-v3' to 'snowflake-arctic-embed-l-v2.0'...
Model: Arctic Embed 2.0 Large
Device: cuda
Embedding Dimension: 1024
Max Sequence Length: 2048


Batches:   0%|          | 0/94 [00:00<?, ?it/s]


Switching model from 'snowflake-arctic-embed-l-v2.0' to 'labse'...
Model: LaBSE
Device: cuda
Embedding Dimension: 768
Max Sequence Length: 512


Batches:   0%|          | 0/94 [00:00<?, ?it/s]


Switching model from 'labse' to 'use-multilingual'...


Some weights of the model checkpoint at sentence-transformers/use-cmlm-multilingual were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model: Universal Sentence Encoder Multilingual
Device: cuda
Embedding Dimension: 768
Max Sequence Length: 512


Batches:   0%|          | 0/94 [00:00<?, ?it/s]

No sentence-transformers model found with name FacebookAI/xlm-roberta-large. Creating a new one with mean pooling.



Switching model from 'use-multilingual' to 'xlm-roberta-large'...
Model: XLM-RoBERTa model
Device: cuda
Embedding Dimension: 1024
Max Sequence Length: 512


Batches:   0%|          | 0/94 [00:00<?, ?it/s]