In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertConfig
from datasets import load_dataset, concatenate_datasets
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data preprocessing helpers
def preprocess_function1(examples, tokenizer):
    """Tokenize a batch, reading the 'sentence' column when present, else 'text'.

    Sequences are padded/truncated to a fixed length of 128 tokens. The value
    returned by `tokenizer` is passed back unchanged.
    """
    if 'sentence' in examples:
        text_column = 'sentence'
    else:
        text_column = 'text'
    batch_texts = examples[text_column]
    return tokenizer(batch_texts, padding='max_length', truncation=True, max_length=128)

def preprocess_function2(examples, tokenizer):
    """Tokenize the 'text' column, also requesting the special-tokens mask.

    Sequences are padded/truncated to a fixed length of 128 tokens.
    """
    tokenizer_options = dict(
        return_special_tokens_mask=True,
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    return tokenizer(examples['text'], **tokenizer_options)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load and preprocess the datasets
glue_dataset = load_dataset('glue', 'sst2')
# NOTE(review): fractional percent slices such as 'train[:0.1%]' may not be accepted by
# every `datasets` release -- confirm against the split-slicing documentation.
wikipedia_dataset = load_dataset('wikipedia', '20220301.en', split='train[:0.1%]')
bookcorpus_dataset = load_dataset('bookcorpus', split='train[:0.1%]')
combined_dataset = concatenate_datasets([wikipedia_dataset, bookcorpus_dataset])

glue_encoded_dataset = glue_dataset.map(lambda examples: preprocess_function1(examples, tokenizer), batched=True)
combined_encoded_dataset = combined_dataset.map(lambda examples: preprocess_function2(examples, tokenizer), batched=True)

# Configuration of the reduced model: same as bert-base-uncased but with fewer encoder layers
small_config = BertConfig.from_pretrained('bert-base-uncased', num_hidden_layers=6)

# Full-size pretrained BERT; its first layers are the ones inherited by the small model
large_model = BertModel.from_pretrained('bert-base-uncased')

# Build the small model straight from the checkpoint. `from_pretrained` is a classmethod,
# so it is called on the class; the previous code first constructed a randomly initialised
# BertModel only to throw it away on the next line.
model = BertModel.from_pretrained('bert-base-uncased', config=small_config)

# Explicitly copy the first encoder layers of the large model into the small one.
# The layer count comes from the config instead of being hard-coded a second time.
for i in range(small_config.num_hidden_layers):
    model.encoder.layer[i].load_state_dict(large_model.encoder.layer[i].state_dict())
model.to(device)

# Helper for counting model parameters
def count_parameters(model):
    """Return the total, trainable and non-trainable parameter counts of `model` as a dict."""
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size

    return {
        'total_params': total_params,
        'trainable_params': trainable_params,
        'non_trainable_params': total_params - trainable_params,
    }

# Count and print the parameters of the reduced BERT model
bert_params = count_parameters(model)
print(f"BERT Model Parameters: Total: {bert_params['total_params']}, Trainable: {bert_params['trainable_params']}, Non-Trainable: {bert_params['non_trainable_params']}")

vocab_size = tokenizer.vocab_size
# Token-level head used by the language-modelling stage
predictor = nn.Linear(small_config.hidden_size, vocab_size).to(device)
# Sentence-level head for SST-2 (two classes). It was referenced below but never defined.
classifier = nn.Linear(small_config.hidden_size, 2).to(device)

NUM_EPOCHS = 3


def collate_columns(examples, columns):
    """Stack the selected per-example columns into batch tensors.

    Works whether the dataset returns Python lists or tensors, and drops every
    other column (raw text, ids, ...) so the batch only holds model inputs.
    """
    return {
        name: torch.stack([torch.as_tensor(example[name]) for example in examples])
        for name in columns
    }


# DataLoaders. Without a collate function the default collation of list-valued
# columns yields lists of tensors, which cannot be moved to a device.
lm_columns = ('input_ids', 'attention_mask')
glue_columns = ('input_ids', 'attention_mask', 'label')
train_loader = DataLoader(combined_encoded_dataset, batch_size=8, shuffle=True,
                          collate_fn=lambda examples: collate_columns(examples, lm_columns))
glue_train_loader = DataLoader(glue_encoded_dataset['train'], batch_size=8, shuffle=True,
                               collate_fn=lambda examples: collate_columns(examples, glue_columns))
glue_val_loader = DataLoader(glue_encoded_dataset['validation'], batch_size=8,
                             collate_fn=lambda examples: collate_columns(examples, glue_columns))

# Optimizer and scheduler for the language-modelling stage. The optimizer must own the
# parameters of `predictor` (the head trained here), and the schedule length must match
# the loader that is actually iterated.
optimizer = AdamW(list(model.parameters()) + list(predictor.parameters()), lr=2e-5)
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Stage 1: training on the combined corpus
# NOTE(review): the objective below predicts token t+1 from position t, but the encoder
# attends in both directions, so the target is visible in the input. The 15% [MASK]
# corruption mentioned in the original TODO is still not implemented -- confirm the
# intended objective.
lm_loss_fn = nn.CrossEntropyLoss()  # targets equal to -100 are ignored by default
model.train()
predictor.train()
for epoch in range(NUM_EPOCHS):
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        batch_inputs = batch['input_ids'].to(device)
        batch_mask = batch['attention_mask'].to(device)
        batch_labels = batch_inputs.clone()  # the targets are the input ids themselves

        outputs = model(input_ids=batch_inputs, attention_mask=batch_mask)
        sequence_output = outputs.last_hidden_state
        logits = predictor(sequence_output)

        # Cross-entropy between the logits and the labels shifted by one position;
        # padded target positions are excluded from the loss.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = batch_labels[..., 1:].contiguous()
        shift_labels = shift_labels.masked_fill(batch_mask[..., 1:] == 0, -100)
        loss = lm_loss_fn(shift_logits.view(-1, vocab_size), shift_labels.view(-1))

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

# Stage 2: fine-tune the encoder together with the classification head on SST-2.
# Without this stage the head evaluated below would still have its random initial weights.
glue_optimizer = AdamW(list(model.parameters()) + list(classifier.parameters()), lr=2e-5)
glue_total_steps = len(glue_train_loader) * NUM_EPOCHS
glue_scheduler = get_linear_schedule_with_warmup(glue_optimizer, num_warmup_steps=0, num_training_steps=glue_total_steps)
cls_loss_fn = nn.CrossEntropyLoss()
model.train()
classifier.train()
for epoch in range(NUM_EPOCHS):
    for batch in tqdm(glue_train_loader, desc=f"Fine-tuning Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        logits = classifier(outputs.pooler_output)
        loss = cls_loss_fn(logits, batch['label'])

        loss.backward()
        glue_optimizer.step()
        glue_scheduler.step()
        glue_optimizer.zero_grad()

# Validation on GLUE SST-2
model.eval()
classifier.eval()
predictions, true_labels = [], []
for batch in tqdm(glue_val_loader, desc="Validation"):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        # Only the encoder inputs are forwarded; the label column is not a model argument.
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        logits = classifier(outputs.pooler_output)
    preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
    label_ids = batch['label'].to('cpu').numpy()
    predictions.extend(preds)
    true_labels.extend(label_ids)

# Metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

In [None]:
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AdamW

# Load and preprocess the dataset
dataset = load_dataset('glue', 'sst2')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize the 'sentence' column with the module-level `tokenizer`.

    Sequences are padded/truncated to 128 tokens and returned as PyTorch tensors.
    """
    return tokenizer(examples['sentence'], padding="max_length", truncation=True, max_length=128, return_tensors='pt')

encoded_dataset = dataset.map(preprocess_function, batched=True)
# Add a 'labels' column holding the values of the existing 'label' column
encoded_dataset = encoded_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
# Return torch tensors and expose only the three listed columns
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Custom classifier: pretrained BERT encoder followed by a linear head
class BertClassifier(torch.nn.Module):
    """BERT encoder with a linear classification head on the pooled output.

    Args:
        num_labels: number of output classes of the linear head.
    """

    def __init__(self, num_labels=2):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and the classification head.

        Returns:
            The logits tensor when `labels` is None (unchanged behaviour), otherwise
            a `(loss, logits)` tuple where `loss` is the cross-entropy against `labels`.

        The tuple form is what `Trainer` expects during evaluation: it takes element 0
        of a non-dict output as the loss and the remaining elements as the predictions.
        Returning a bare tensor there made it slice the logits along the batch axis.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.pooler_output)
        if labels is None:
            return logits
        loss = torch.nn.functional.cross_entropy(logits, labels)
        return loss, logits

# Distillation loss
def distillation_loss(logits_student, logits_teacher, labels, alpha=0.5, temperature=2.0):
    """Blend the hard-label cross-entropy with a soft-target cross-entropy.

    Args:
        logits_student: student logits, indexed (example, class).
        logits_teacher: teacher logits with the same layout.
        labels: integer class targets for the hard loss.
        alpha: weight of the hard loss; the soft loss gets `1 - alpha`.
        temperature: divisor applied to both logit sets before softmax.

    Returns:
        Scalar tensor `alpha * hard + (1 - alpha) * soft`.
    """
    # Hard part: ordinary cross-entropy on the unscaled student logits
    hard_loss = F.cross_entropy(logits_student, labels)

    # Soft part: cross-entropy between temperature-softened teacher and student distributions
    teacher_probs = F.softmax(logits_teacher / temperature, dim=-1)
    student_log_probs = F.log_softmax(logits_student / temperature, dim=-1)
    per_example = (teacher_probs * student_log_probs).sum(dim=1)
    soft_loss = -per_example.mean()

    return alpha * hard_loss + (1 - alpha) * soft_loss

# Metric computation passed to the Trainer
def compute_metrics(p):
    """Turn a (logits, labels) pair into accuracy / precision / recall / F1.

    Predictions are the argmax over the last logits axis; precision, recall and
    F1 use binary averaging.
    """
    raw_scores, gold = p
    predicted = np.argmax(raw_scores, axis=-1)
    acc = accuracy_score(gold, predicted)
    prec, rec, f_score, _ = precision_recall_fscore_support(gold, predicted, average="binary")
    metrics = {"accuracy": acc}
    metrics["precision"] = prec
    metrics["recall"] = rec
    metrics["f1"] = f_score
    return metrics

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_total_limit=1,
)

# Teacher model
# NOTE(review): the teacher is created from the pretrained encoder with a freshly
# initialised classification head and nothing in this cell fine-tunes it before it
# is used as the distillation target -- confirm this is intended.
teacher_model = BertClassifier(num_labels=2)
teacher_model.eval()  # keep the teacher in eval mode

# Student model
student_model = BertClassifier(num_labels=2)

# Helper for counting model parameters
def count_parameters(model):
    """Summarise parameter counts of `model`: total, trainable and non-trainable."""
    sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
    total_params = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)

    summary = {'total_params': total_params, 'trainable_params': trainable_params}
    summary['non_trainable_params'] = total_params - trainable_params
    return summary

# Count and print the parameters of the student model
bert_params = count_parameters(student_model)
print(f"BERT Model Parameters: Total: {bert_params['total_params']}, Trainable: {bert_params['trainable_params']}, Non-Trainable: {bert_params['non_trainable_params']}")

# Build the Trainer around the student model
trainer = Trainer(
    model=student_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Custom training step (knowledge distillation)
def training_step(model, batch, num_items_in_batch=None):
    """Run one distillation step: forward both models, back-propagate, return the loss.

    Args:
        model: the student being trained.
        batch: mapping with 'input_ids', 'attention_mask' and 'labels' tensors.
        num_items_in_batch: accepted and ignored, so the function can also be
            called by Trainer versions that pass this extra argument.

    Returns:
        The detached scalar loss.

    A replacement for `Trainer.training_step` has to perform the backward pass
    itself; the Trainer only steps the optimizer afterwards. The previous version
    returned the loss without calling `backward()`, so no gradients ever reached
    the student.
    """
    model.train()
    # A plain nn.Module has no `.device` attribute; read it from the parameters.
    device = next(model.parameters()).device
    # The teacher must sit on the same device as the inputs (no-op once moved).
    teacher_model.to(device)
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        teacher_logits = teacher_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    student_logits = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    loss = distillation_loss(student_logits, teacher_logits, batch['labels'])
    loss.backward()
    return loss.detach()

# Replace the default training step with the custom one. The function is assigned on
# the instance, so it is stored as a plain function and receives no `self` argument.
trainer.training_step = training_step

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(f"Accuracy: {results['eval_accuracy']}")
print(f"Precision: {results['eval_precision']}")
print(f"Recall: {results['eval_recall']}")
print(f"F1-score: {results['eval_f1']}")

In [None]:
import numpy as np
import torch
from torch.nn import Linear, CrossEntropyLoss
from transformers import BertTokenizer, BertModel, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Load the GLUE SST-2 dataset
dataset = load_dataset('glue', 'sst2')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Text preprocessing
def preprocess_function(examples):
    """Tokenize the 'sentence' column with the module-level `tokenizer` (pad/truncate to 128)."""
    return tokenizer(examples['sentence'], padding="max_length", truncation=True, max_length=128)

# Preprocess the dataset
encoded_dataset = dataset.map(preprocess_function, batched=True)
# The training and evaluation loops read batch['labels'], but the dataset only has a
# 'label' column: add the expected one.
encoded_dataset = encoded_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
# Return torch tensors and hide the raw-text columns, so DataLoader batches contain
# only tensors that can be moved to a device and passed to the model.
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Classification wrapper around a BertModel
class BertClassifier(torch.nn.Module):
    """Linear classification head on top of the pooled output of a given BERT encoder.

    Args:
        bert_model: encoder exposing `config.hidden_size` and returning an object
            with a `pooler_output` attribute.
        num_labels: number of target classes.
    """

    def __init__(self, bert_model, num_labels=2):
        super(BertClassifier, self).__init__()
        self.bert_model = bert_model
        # Kept on the instance: forward() needs it to reshape the logits. It used to
        # read an undefined bare name `num_labels`, which raised NameError whenever
        # labels were supplied.
        self.num_labels = num_labels
        self.classifier = Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return `(loss, logits)`; `loss` is None when `labels` is not given."""
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.pooler_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return loss, logits

# Load the pretrained encoder and wrap it with the classification head
bert_model = BertModel.from_pretrained("bert-base-uncased")
model = BertClassifier(bert_model, num_labels=2)

# Move the model to the GPU when one is available; otherwise it stays on the CPU.
# This is done before the optimizer is created further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Helper for counting model parameters
def count_parameters(model):
    """Count the parameters of `model`, split into trainable and non-trainable."""
    trainable_params = 0
    non_trainable_params = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_params += tensor.numel()
        else:
            non_trainable_params += tensor.numel()
    total_params = trainable_params + non_trainable_params

    return {'total_params': total_params, 'trainable_params': trainable_params, 'non_trainable_params': non_trainable_params}

# Count and print the parameters of the classifier model
bert_params = count_parameters(model)
print(f"BERT Model Parameters: Total: {bert_params['total_params']}, Trainable: {bert_params['trainable_params']}, Non-Trainable: {bert_params['non_trainable_params']}")

# DataLoaders for the training and validation splits (only the training split is shuffled)
train_loader = DataLoader(encoded_dataset["train"], batch_size=8, shuffle=True)
val_loader = DataLoader(encoded_dataset["validation"], batch_size=8)

# Optimizer over all parameters of the classifier model
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop for a single epoch
def train_epoch(model, train_loader, optimizer):
    """Train `model` for one pass over `train_loader` and return the mean batch loss.

    Args:
        model: module whose forward accepts `input_ids`, `attention_mask` and
            `labels` and returns `(loss, logits)`.
        train_loader: iterable of dict batches holding tensors under 'input_ids',
            optionally 'attention_mask', and either 'labels' or 'label'.
        optimizer: optimizer over the model's parameters.

    Returns:
        Average loss per batch as a float; 0.0 when the loader yields no batches.
    """
    model.train()
    # A plain nn.Module has no `.device` attribute; read it from the parameters.
    device = next(model.parameters()).device
    model_input_keys = ('input_ids', 'attention_mask')
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        optimizer.zero_grad()
        # Forward only the tensors the model's forward() accepts; other columns
        # (raw text, token_type_ids, ...) would raise a TypeError as keyword arguments.
        inputs = {k: batch[k].to(device) for k in model_input_keys if k in batch}
        label_key = 'labels' if 'labels' in batch else 'label'
        labels = batch[label_key].to(device)
        loss, _ = model(**inputs, labels=labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        num_batches += 1
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Metric computation
def compute_metrics(logits, labels):
    """Return `(accuracy, precision, recall, f1)` for argmax predictions.

    Predictions are taken over the last axis of `logits`; precision, recall
    and F1 use binary averaging.
    """
    predicted = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predicted)
    binary_scores = precision_recall_fscore_support(labels, predicted, average="binary")
    prec, rec, f_score = binary_scores[0], binary_scores[1], binary_scores[2]
    return acc, prec, rec, f_score

# Train the model
epochs = 3
for epoch in range(epochs):
    avg_loss = train_epoch(model, train_loader, optimizer)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss}")

# Switch to evaluation mode
model.eval()
# A plain nn.Module has no `.device` attribute; read it from the parameters.
eval_device = next(model.parameters()).device

# Collect logits and labels over the validation set
all_logits = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        # Forward only the tensors accepted by the model's forward()
        inputs = {k: batch[k].to(eval_device) for k in ('input_ids', 'attention_mask') if k in batch}
        labels = batch['labels' if 'labels' in batch else 'label']
        _, logits = model(**inputs)
        logits = logits.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()

        all_logits.extend(logits)
        all_labels.extend(labels)

# Compute the metrics
accuracy, precision, recall, f1 = compute_metrics(np.array(all_logits), np.array(all_labels))

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

# Bert evaluation

In [None]:
# Helper for reporting the model size
def count_parameters(model):
    """Print the total number of parameters of `model` in millions (three decimals)."""
    n_params = 0
    for tensor in model.parameters():
        n_params += tensor.numel()
    total_params = n_params / 10**6
    print(f'total_params: {total_params:.3f}M')

In [None]:
# Fetch the lm-evaluation-harness sources into ./lm-evaluation-harness
!git clone https://github.com/EleutherAI/lm-evaluation-harness

In [None]:
# Explicit %cd magic: does not depend on automagic being enabled for a bare `cd`
%cd lm-evaluation-harness

In [None]:
# Pin the harness to a fixed commit
!git checkout 115206dc89dad67b8beaa90051fb52db77f0a529

In [None]:
# %pip installs into the environment of the running kernel; `!pip` may pick up a
# different interpreter found on PATH.
%pip install -e .

In [None]:
# Explicit %pwd magic: does not depend on automagic being enabled for a bare `pwd`
%pwd

In [None]:
# !pip install lm_eval

In [None]:
# !pip install evaluate

In [78]:
import json
import sys
from pathlib import Path
from typing import Dict, List, Literal, Optional

from lm_eval import evaluator, tasks

import numpy as np
import torch
from torch.nn import Linear, CrossEntropyLoss
from transformers import BertTokenizer, BertModel, default_data_collator, pipeline, AutoModelForSequenceClassification
from transformers import LlamaForCausalLM, LlamaTokenizer
# from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# from evaluate import evaluator

from lm_eval.base import BaseLM

In [None]:
# dataset = lm_eval.load_dataset("classification", "imdb")

# results = lm_eval.evaluate("bert-base-uncased", dataset, "zero-shot")

In [None]:
# llama = LlamaForCausalLM.from_pretrained(
#     "openlm-research/open_llama_3b",
#     torch_dtype=torch.float16, 
#     device_map='auto',
# )
# bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# Подсчет и вывод параметров основной модели BERT
# count_parameters(llama)

In [4]:
bert_model = BertModel.from_pretrained("bert-base-uncased")
# bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# Count and print the parameters of the base BERT model
count_parameters(bert_model)

NameError: name 'count_parameters' is not defined

In [None]:
# class ModelWrapper:
#     def __init__(self, model):
#         self.model = model
#         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#         self.rank = 0  # Один процесс, его ранг будет 0
#         self.world_size = 1  # Всего один процесс
    
#     def __getattr__(self, name):
#         return getattr(self.model, name)
    
#     def loglikelihood(self, requests):
#         results = []
#         for request in requests:
#             text = request["text"]
#             inputs = self.tokenizer(text, return_tensors="pt")
#             with torch.no_grad():
#                 outputs = self.model(**inputs)
#                 log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)
#                 token_ids = inputs.input_ids
#                 log_likelihood = torch.sum(log_probs[0, range(log_probs.size(1)), token_ids[0]])
#                 results.append(log_likelihood.item())
#         return results

In [84]:
class ModelWrapper(BaseLM):
    """Adapter exposing a Hugging Face model/tokenizer pair through lm-eval's `BaseLM` API.

    Args:
        model: Hugging Face model whose forward output has `logits` or `last_hidden_state`.
        batch_size: batch size reported to the harness.
        tokenizer: Hugging Face tokenizer matching `model`.
        device: torch device on which the harness should place input tensors.
    """

    def __init__(
        self,
        model,
        batch_size,
        tokenizer,
        device
    ):
        super().__init__()
        self.model = model
        # A model loaded with `device_map=...` has already been placed by accelerate
        # and must not be moved again; only plain models are sent to `device`.
        if getattr(self.model, "hf_device_map", None) is None:
            self.model.to(device)
        self.tokenizer = tokenizer
        self.batch_size_per_gpu = batch_size
        self.device_ = device

    @torch.inference_mode()
    def _model_call(self, inps):
        """Run the model on a batch of token ids and return its per-position output.

        NOTE(review): for models without an LM head the fallback returns hidden
        states, which are not vocabulary logits -- log-likelihoods computed from
        them are presumably meaningless; confirm before evaluating such models.
        """
        outputs = self.model(inps)
        if hasattr(outputs, 'logits'):
            return outputs.logits
        elif hasattr(outputs, 'last_hidden_state'):
            return outputs.last_hidden_state
        else:
            raise ValueError("Model output does not contain 'logits' or 'last_hidden_state'")

    @torch.inference_mode()
    def _model_generate(self, context, max_length, eos_token_id) -> torch.Tensor:
        """Greedy-decode a continuation of `context` with the model's own `generate`.

        The previous body called an undefined `generate` helper and reset a
        `transformer.h[*].attn.kv_cache` structure that Hugging Face models do not
        have, so any generation task failed.
        """
        generation_kwargs = {"do_sample": False, "max_length": max_length}
        if eos_token_id is not None:
            generation_kwargs["eos_token_id"] = eos_token_id
            # Reuse EOS for padding so generate() has a pad token to work with
            generation_kwargs["pad_token_id"] = eos_token_id
        return self.model.generate(context, **generation_kwargs)

    @property
    def batch_size(self):
        # Report exactly the configured size; it used to be doubled silently.
        return self.batch_size_per_gpu

    @property
    def device(self):
        return self.device_

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        # Hugging Face tokenizers expose the id as `eos_token_id` (there is no `eos_id`).
        return self.tokenizer.eos_token_id

    @property
    def max_gen_toks(self):
        return 256

    @property
    def max_length(self):
        return self.model.config.max_position_embeddings

    def tok_encode(self, string: str) -> List[int]:
        # No special tokens: the harness encodes context and continuation separately
        # and concatenates the ids, so an automatically added BOS/CLS/SEP would end
        # up inside the scored sequence.
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens: List[int]) -> str:
        t = torch.tensor(tokens)
        return self.tokenizer.decode(t)

In [None]:
# from datasets import load_dataset
# from evaluate import evaluator
# from transformers import AutoModelForSequenceClassification

# # Load PiQA dataset
# piqa_data = load_dataset("piqa")

# # Initialize the TextClassificationEvaluator
# task_evaluator = evaluator("text-classification")

# # Define the model
# model_name = "bert-base-uncased"

# # Load the model
# model = ModelWrapper(bert_model)

# # Evaluate the model on the PiQA dataset
# eval_results = task_evaluator.compute(
#     model_or_pipeline=model,
#     data=piqa_data,
#     input_column="question",  # Assuming "question" is the column containing the input text
#     label_column="label",     # Assuming "label" is the column containing the labels
# )

# # Print the evaluation results
# print(eval_results)


In [80]:
# Load OpenLLaMA 3B with float16 weights; `device_map='auto'` delegates device placement
llama = LlamaForCausalLM.from_pretrained(
    "openlm-research/open_llama_3b",
    torch_dtype=torch.float16, 
    device_map='auto',
)

In [87]:
# Evaluation configuration
eval_tasks: List[str] = ["piqa"]

BATCH_SIZE = 2
num_fewshot = 0
# NOTE(review): `limit` is defined here, but the only place that would use it
# (the `evaluator.evaluate` call) has the argument commented out.
limit = 256*4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap the LLaMA model and its tokenizer for the evaluation harness
wrapped_model = ModelWrapper(
    model=llama,
    batch_size=BATCH_SIZE,
    tokenizer=LlamaTokenizer.from_pretrained("openlm-research/open_llama_3b"),
    device=device
)
    


In [88]:
# Run the harness on the selected tasks with the wrapped model.
# The `limit` argument is currently not passed (its line is commented out).
results = evaluator.evaluate(
    lm=wrapped_model,
    task_dict=tasks.get_task_dict(eval_tasks),
    num_fewshot=num_fewshot,
#     limit=limit
)

Task: piqa; number of docs: 1838
Task: piqa; document 0; context prompt (starting on next line):
Question: Remove seeds from  strawberries
Answer:
(end of prompt on previous line)
Requests: [Req_loglikelihood('Question: Remove seeds from  strawberries\nAnswer:', ' Blend the strawberries, pour the mixture through a fine-mesh strainer with a bowl underneath to catch the pulps and strain out the seeds')[0]
, Req_loglikelihood('Question: Remove seeds from  strawberries\nAnswer:', ' Chop up the strawberries, pour the mixture through a fine-mesh strainer with a bowl underneath to catch the pulps and strain out the seeds')[0]
]
Running loglikelihood requests


  0%|          | 3/3676 [00:00<01:49, 33.49it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 7.06 MiB is free. Process 26624 has 14.74 GiB memory in use. Of the allocated memory 14.40 GiB is allocated by PyTorch, and 219.29 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [36]:
# Show the evaluation results; `results` exists only if the evaluation cell above completed
print(results)

NameError: name 'results' is not defined

Выводы

Метод Inheritune:

Преимущества: Быстрое обучение, меньше данных для обучения.

Недостатки: Ограниченная гибкость архитектуры, зависимость от качества и объема данных.

Метод дистилляции знаний:

Преимущества: Высокая точность, особенно при наличии большого объема данных.

Недостатки: Высокие вычислительные затраты, сложность настройки.

Когда использовать:

Inheritune: Подходит для случаев с ограниченными вычислительными ресурсами и данными. Идеально для быстрого прототипирования и ситуаций, когда необходима производительность на устройствах с ограниченными ресурсами.
Дистилляция знаний: Подходит для случаев, когда требуется высокая точность и есть доступ к большому объему данных и вычислительных ресурсов. Идеально для конечных моделей в производственных условиях.