In [1]:
from pathlib import Path

# import pandas as pd
# import polars as pl

import polars as pl
import torch
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset
from tqdm.notebook import tqdm


In [2]:
# Load the cleaned train / test / validation splits from CSV.
df_train = pl.read_csv("../data/1_clean/training.csv")
df_test = pl.read_csv("../data/1_clean/testing.csv")
df_validation = pl.read_csv("../data/1_clean/val.csv")

# For each split, pull the "text" and "score" columns out as plain Python lists
# (texts feed the embedder, scores become the regression/classification targets).
train_texts, train_labels = (
    df_train[column].to_list() for column in ("text", "score")
)
test_texts, test_labels = (
    df_test[column].to_list() for column in ("text", "score")
)
val_texts, val_labels = (
    df_validation[column].to_list() for column in ("text", "score")
)


In [3]:
# Load the sentence-embedding model that is used to encode every split.
print("loading embedder")
model_name = "intfloat/multilingual-e5-large-instruct"
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (more slowly) on machines without CUDA instead of failing at load time.
device = "cuda" if torch.cuda.is_available() else "cpu"
embedder = SentenceTransformer(model_name, device=device)


loading embedder


In [4]:
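# Disabled alternative: load the smaller all-MiniLM-L6-v2 embedder instead of the
# e5-large model. If this is enabled, save the datasets under the "*_small.pth" names.
# print("loading embedder")
# model_name = "sentence-transformers/all-MiniLM-L6-v2"
# embedder = SentenceTransformer(model_name, device="cuda")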


In [5]:
class CustomDataset(Dataset):
    """Map-style dataset pairing one embedding vector with one label.

    Args:
        embeddings: A 2-D array-like of embeddings (NumPy array, nested list,
            or ``torch.Tensor``), or a list/tuple of equally shaped
            ``torch.Tensor`` rows.
        labels: One scalar label per embedding (list, array, or tensor).

    Raises:
        ValueError: If the number of embeddings and labels differ.

    Both inputs are copied into ``float32`` tensors stored as ``self.X`` and
    ``self.y``; later changes to the inputs do not affect the dataset.
    """

    def __init__(self, embeddings, labels):
        self.X = self._to_float_tensor(embeddings)
        self.y = self._to_float_tensor(labels)  # or long, depending on your task
        # A silent length mismatch would pair embeddings with the wrong labels
        # or fail much later inside a DataLoader, so reject it up front.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"embeddings and labels must have the same length, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    @staticmethod
    def _to_float_tensor(values):
        """Return an independent float32 tensor built from ``values``."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(tensor) warns; make an explicit detached copy instead.
            return values.detach().to(dtype=torch.float32, copy=True)
        if (
            isinstance(values, (list, tuple))
            and values
            and all(isinstance(item, torch.Tensor) for item in values)
        ):
            # torch.tensor() cannot convert a list of multi-element tensors,
            # so stack the rows into one tensor.
            return torch.stack([item.detach() for item in values]).to(dtype=torch.float32)
        return torch.tensor(values, dtype=torch.float32)

    def __len__(self):
        """Number of (embedding, label) pairs."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

In [6]:
# Encode the training texts into a NumPy array of embeddings, 16 texts per batch.
print("embedding train dataset")
train_embeddings = embedder.encode(
    train_texts,
    batch_size=16,
    show_progress_bar=True,
    convert_to_numpy=True,
)

embedding train dataset


Batches:   0%|          | 0/651 [00:00<?, ?it/s]

In [7]:
# Encode the test texts into a NumPy array of embeddings, 16 texts per batch.
print("embedding test dataset")
test_embeddings = embedder.encode(
    test_texts,
    batch_size=16,
    show_progress_bar=True,
    convert_to_numpy=True,
)

embedding test dataset


Batches:   0%|          | 0/122 [00:00<?, ?it/s]

In [8]:
# Encode the validation texts into a NumPy array of embeddings, 16 texts per batch.
print("embedding val dataset")
val_embeddings = embedder.encode(
    val_texts,
    batch_size=16,
    show_progress_bar=True,
    convert_to_numpy=True,
)

embedding val dataset


Batches:   0%|          | 0/39 [00:00<?, ?it/s]

In [9]:
# Wrap each split's embeddings and scores in a CustomDataset.
train_dataset = CustomDataset(
    embeddings=train_embeddings,
    labels=train_labels,
)
test_dataset = CustomDataset(
    embeddings=test_embeddings,
    labels=test_labels,
)
val_dataset = CustomDataset(
    embeddings=val_embeddings,
    labels=val_labels,
)


In [10]:
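# Disabled alternative: output paths to use when the datasets were embedded with the
# smaller all-MiniLM-L6-v2 model instead of the e5-large model.
# torch.save(train_dataset, '../data/2_ready_for_training/embedded/train_dataset_small.pth')
# torch.save(test_dataset, '../data/2_ready_for_training/embedded/test_dataset_small.pth')
# torch.save(val_dataset, '../data/2_ready_for_training/embedded/val_dataset_small.pth')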


In [11]:
# Persist the embedded splits so training notebooks can skip the expensive encoding step.
# NOTE: each file is a pickled CustomDataset object, so loading it requires the
# CustomDataset class to be defined in the loading process; on PyTorch versions where
# torch.load defaults to weights_only=True, pass weights_only=False (trusted files only).
output_dir = Path("../data/2_ready_for_training/embedded")
# torch.save does not create missing directories; make sure the target exists first.
output_dir.mkdir(parents=True, exist_ok=True)

torch.save(train_dataset, output_dir / "train_dataset_large.pth")
torch.save(test_dataset, output_dir / "test_dataset_large.pth")
torch.save(val_dataset, output_dir / "val_dataset_large.pth")

