In [1]:
import torch

In [2]:

import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from tqdm import tqdm

# Seed NumPy's global RNG so the random subset drawn in the next cell is reproducible.
np.random.seed(seed=100)

# Load the training split of the IMDB dataset.
dataset = load_dataset(path="imdb", split="train")



Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:

# Re-seed inside this cell: the RoBERTa and DistilBERT cells seed right before
# sampling, so doing the same here keeps the drawn indices identical across all
# three cells even when this cell is re-run on its own.
np.random.seed(100)

# Draw 200 random example indices and select those rows from the dataset.
# NOTE(review): np.random.randint samples with replacement, so the subset may
# contain duplicate reviews. Changing the sampling scheme would have to be done
# in every cell at once to keep embeddings and labels aligned.
idx = np.random.randint(len(dataset), size=200)
subset = dataset.select(idx)

# Instantiate the pretrained tokenizer and model, and move the model to the GPU
# when one is available (CPU otherwise).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Tokenise the texts and build the DataLoader.
def tokenize_function(examples):
    """Tokenise ``examples['text']`` with the notebook-level ``tokenizer``.

    Every sequence is padded to, and truncated at, 512 tokens.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

tokenized_dataset = subset.map(tokenize_function, batched=True)
# Expose only the columns the embedding loop reads, as torch tensors.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

dataloader = DataLoader(tokenized_dataset, batch_size=8)

@torch.inference_mode()
def get_embeddings_labels(model, loader):
    model.eval()
    
    total_embeddings = []
    labels = []
    
    for batch in tqdm(loader):
        labels.append(batch['label'].unsqueeze(1))
        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}
        embeddings = model(**batch)['last_hidden_state'][:, 0, :]
        total_embeddings.append(embeddings.cpu())
    
    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

# Compute the embeddings together with their labels.
embeddings, labels = get_embeddings_labels(model=model, loader=dataloader)

# Sanity-check the shape of the embedding tensor.
print(embeddings.shape)  # expected: torch.Size([200, 768])

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
import torch

# Persist the embeddings and the labels to disk as .pt files.
torch.save(obj=embeddings, f='embeddings.pt')
torch.save(obj=labels, f='labels.pt')

# To load them back:
# embeddings = torch.load('embeddings.pt')
# labels = torch.load('labels.pt')


In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaModel
from datasets import load_dataset
from tqdm import tqdm

# Seed NumPy's global RNG so the sampled subset is reproducible.
np.random.seed(seed=100)

# Load the training split of the IMDB dataset.
dataset = load_dataset(path="imdb", split="train")

# Draw 200 random example indices and select those rows.
idx = np.random.randint(0, len(dataset), size=200)
subset = dataset.select(idx)

# Instantiate the pretrained tokenizer and model; prefer the GPU when available.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Tokenise the texts and build the DataLoader.
def tokenize_function(examples):
    """Tokenise the ``'text'`` entries, padded to and truncated at 512 tokens."""
    texts = examples['text']
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

tokenized_dataset = subset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

dataloader = DataLoader(dataset=tokenized_dataset, batch_size=8)

@torch.inference_mode()
def get_embeddings_labels(model, loader):
    model.eval()
    
    total_embeddings = []
    labels = []
    
    for batch in tqdm(loader):
        labels.append(batch['label'].unsqueeze(1))
        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}
        embeddings = model(**batch)['last_hidden_state'][:, 0, :]
        total_embeddings.append(embeddings.cpu())
    
    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

# Compute the RoBERTa embeddings together with their labels.
embeddings2, labels = get_embeddings_labels(model, dataloader)

# Sanity-check the shape of the embedding tensor.
print(embeddings2.size())  # expected: torch.Size([200, 768])

# Save under a dedicated file name: writing to 'embeddings.pt' would overwrite
# the embeddings that the earlier save cell stores in that file.
torch.save(embeddings2, 'embeddings2.pt')

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from datasets import load_dataset
from tqdm import tqdm

# Seed NumPy's global RNG so the sampled subset is reproducible.
np.random.seed(seed=100)

# Load the training split of the IMDB dataset.
dataset = load_dataset(path="imdb", split="train")

# Draw 200 random example indices and select those rows.
idx = np.random.randint(0, len(dataset), size=200)
subset = dataset.select(idx)

# Instantiate the pretrained tokenizer and model; prefer the GPU when available.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertModel.from_pretrained('distilbert-base-cased')
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Tokenise the texts and build the DataLoader.
def tokenize_function(examples):
    """Tokenise the ``'text'`` entries, padded to and truncated at 512 tokens."""
    texts = examples['text']
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

tokenized_dataset = subset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

dataloader = DataLoader(dataset=tokenized_dataset, batch_size=8)

@torch.inference_mode()
def get_embeddings_labels(model, loader):
    model.eval()

    total_embeddings = []
    labels = []

    for batch in tqdm(loader):
        labels.append(batch['label'].unsqueeze(1))
        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}
        with torch.no_grad():
            embeddings = model(**batch)[0][:, 0, :]  # DistilBERT возвращает только последний скрытый слой
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

# Compute the DistilBERT embeddings together with their labels.
embeddings3, labels = get_embeddings_labels(model=model, loader=dataloader)

# Sanity-check the shape of the embedding tensor.
print(embeddings3.shape)  # expected: torch.Size([200, 768])

# NOTE(review): the file name is spelled "embendings3.pt"; it is left unchanged
# here because code outside this notebook may load it under that name.
torch.save(obj=embeddings3, f="embendings3.pt")