In [1]:
import torch

In [2]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)


2024-11-24 17:54:10.002390: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-24 17:54:10.019863: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732470850.041523   60710 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732470850.048062   60710 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-24 17:54:10.070777: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
import evaluate

In [4]:
import numpy as np

In [4]:
# Load MobileBERT with a 2-label sequence-classification head.
# This instance is deleted again in the next cell.
model = AutoModelForSequenceClassification.from_pretrained(
    "google/mobilebert-uncased",
    num_labels=2,
)

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# Drop the MobileBERT instance; a compact BERT model is bound to `model` below.
del model

In [5]:
from transformers import BertConfig

In [6]:
# Architecture of the compact student; the same three values also select the
# checkpoint name/path passed to `from_pretrained`.
num_hidden_layers = 8
hidden_size = 256
num_attention_heads = 4

# An explicit BertConfig is supplied so the 2-label classification head is
# created with the matching dimensions.
model = AutoModelForSequenceClassification.from_pretrained(
    f"bert-uncased_L-{num_hidden_layers}_H-{hidden_size}_A-{num_attention_heads}",
    config=BertConfig(
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=1024,
        num_labels=2,
    )
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-uncased_L-8_H-256_A-4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Move the student to the GPU (requires a CUDA device to be available).
model = model.to("cuda")

In [8]:
def create_model(
    num_hidden_layers=8,
    hidden_size=256,
    num_attention_heads=4,
    intermediate_size=1024,
    num_labels=2,
    device=None,
):
    """Build a fresh compact BERT student for sequence classification.

    Called with no arguments it loads the same 8-layer / 256-hidden / 4-head
    checkpoint as before and returns it with a 2-label head.

    Args:
        num_hidden_layers: Number of transformer layers; also part of the
            checkpoint name.
        hidden_size: Hidden dimension; also part of the checkpoint name.
        num_attention_heads: Attention heads per layer; also part of the
            checkpoint name.
        intermediate_size: Width of the feed-forward layer.
        num_labels: Number of output classes of the classification head.
        device: Target device for the model. When ``None`` the model goes to
            ``"cuda"`` if a GPU is available and to ``"cpu"`` otherwise, so
            the helper no longer crashes on CPU-only machines.

    Returns:
        The loaded model, moved to ``device``.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    config = BertConfig(
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        num_labels=num_labels,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        f"bert-uncased_L-{num_hidden_layers}_H-{hidden_size}_A-{num_attention_heads}",
        config=config,
    )
    return model.to(device)

In [9]:
# The single tokenizer used to encode both datasets and passed to every trainer below.
# NOTE(review): it comes from the MobileBERT checkpoint while the students and
# teachers are BERT checkpoints — presumably the vocabularies match; confirm.
tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")

In [10]:
from datasets import load_dataset

In [11]:
# Load the unlabeled review texts from a local Parquet file (path is relative
# to the notebook's working directory).
reviews = load_dataset(
    "parquet",
    data_files="reviews-text/reviews-unlabeled.parquet.snappy",
)

In [12]:
# Show the splits, columns and row counts of the loaded dataset.
reviews

DatasetDict({
    train: Dataset({
        features: ['reviewText'],
        num_rows: 1697533
    })
})

In [13]:
def tokenize_reviews(batch):
    """Tokenize the ``reviewText`` column of a batch, truncating to 512 tokens."""
    review_texts = batch["reviewText"]
    encoded = tokenizer(review_texts, truncation=True, max_length=512)
    return encoded

In [14]:
# Tokenize the review corpus; batched=True hands the function many rows per call.
tokenized_reviews = reviews.map(tokenize_reviews, batched=True)

In [15]:
# Load the SST-2 dataset; its "train" and "validation" splits are used below.
sst2 = load_dataset("stanfordnlp/sst2")

In [16]:
def tokenize_sst2(batch):
    """Tokenize the ``sentence`` column of a batch, truncating to 512 tokens."""
    sentences = batch["sentence"]
    encoded = tokenizer(sentences, truncation=True, max_length=512)
    return encoded

In [17]:
# Tokenize every SST-2 split; batched=True hands the function many rows per call.
tokenized_sst2 = sst2.map(tokenize_sst2, batched=True)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [18]:
# Pads each batch at collation time; the tokenize functions above only truncate.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
from torch import nn

In [20]:
import torch.nn.functional as F

In [21]:
from dataclasses import dataclass

In [22]:
@dataclass
class PretrainedDistillationTrainingArguments(TrainingArguments):
    """``TrainingArguments`` extended with the distillation temperature.

    Attributes:
        temperature: Value the student and teacher logits are divided by
            before the (log-)softmax in the distillation loss. Must be
            strictly positive.

    Raises:
        ValueError: If ``temperature`` is zero or negative.
    """

    temperature: float = 1.0

    def __post_init__(self):
        # Fail fast: a zero temperature divides the logits by zero and a
        # negative one flips their ordering, both of which would otherwise
        # only show up as a silently broken loss.
        if self.temperature <= 0:
            raise ValueError(
                f"temperature must be strictly positive, got {self.temperature!r}"
            )
        super().__post_init__()

In [23]:
class PretrainedDistillationTrainer(Trainer):
    """Trainer that fits a student to a fixed teacher's softened predictions.

    The loss is the KL divergence between the temperature-scaled student and
    teacher distributions; the labels in the batch do not enter the loss.
    The temperature is read from ``self.args.temperature``.

    Args:
        teacher_model: Keyword-only. Model queried for target distributions.
            It must already be on the same device as the batches; this class
            does not move it.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, teacher_model, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        # The teacher only produces targets: keep dropout and similar layers
        # switched off so its predictions are deterministic.
        self.teacher_model.eval()
        # "batchmean" sums the divergence over classes and averages over the batch.
        self.kl_loss = nn.KLDivLoss(reduction="batchmean")

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the distillation loss for one batch.

        Args:
            model: The student being trained.
            inputs: Batch passed as keyword arguments to both student and teacher.
            return_outputs: When true, also return the student's outputs.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it to ``compute_loss``; not used here.

        Returns:
            ``loss`` or ``(loss, student_outputs)`` depending on ``return_outputs``.
        """
        student_outputs = model(**inputs)
        student_logits = student_outputs.logits
        # No gradients are needed through the teacher.
        with torch.no_grad():
            teacher_outputs = self.teacher_model(**inputs)
            teacher_logits = teacher_outputs.logits
        temperature = self.args.temperature
        # KLDivLoss expects log-probabilities as input and probabilities as target.
        # NOTE(review): the loss is not multiplied by temperature**2, so its
        # magnitude shrinks as the temperature grows — confirm this is intended.
        loss = self.kl_loss(
            F.log_softmax(student_logits / temperature, dim=-1),
            F.softmax(teacher_logits / temperature, dim=-1),
        )
        return (loss, student_outputs) if return_outputs else loss

In [24]:
# Accuracy metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_prediction):
    """Score an evaluation pass: argmax over axis 1 of the predictions vs. the labels."""
    logits, references = eval_prediction
    predicted_classes = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [51]:
# Teacher skeleton: BERT-base with a 2-label head. The head is newly
# initialized here; the next cell overwrites all weights from a local file.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Load the teacher weights from a local file (presumably a BERT-base model
# fine-tuned on SST-2 — confirm provenance). weights_only=True is passed to torch.load.
teacher_model.load_state_dict(torch.load("sst2-base.pt", weights_only=True))

<All keys matched successfully>

In [53]:
# The distillation trainer does not move the teacher, so place it on the GPU here.
teacher_model = teacher_model.to("cuda")

In [54]:
# Distillation run with temperature 10.
# NOTE(review): every distillation run in this notebook writes checkpoints to
# the same output_dir — confirm earlier runs' checkpoints are not needed.
distillation_training_args = PretrainedDistillationTrainingArguments(
    output_dir="tmp/sst2-distillation",
    learning_rate=2e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch (by validation
    # accuracy) can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    push_to_hub=False,
    temperature=10.0,
)

In [55]:
# Wire the current student, the teacher and the SST-2 splits into the
# distillation trainer; accuracy is computed on the validation split.
distillation_trainer = PretrainedDistillationTrainer(
    model=model,
    teacher_model=teacher_model,
    args=distillation_training_args,
    train_dataset=tokenized_sst2["train"],
    eval_dataset=tokenized_sst2["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [56]:
# Run the distillation configured above (temperature 10).
distillation_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0108,0.023131,0.813073
2,0.0088,0.023418,0.818807
3,0.0073,0.023603,0.822248
4,0.0074,0.023367,0.818807
5,0.0064,0.023607,0.817661


TrainOutput(global_step=21050, training_loss=0.008899580046286775, metrics={'train_runtime': 1046.4749, 'train_samples_per_second': 321.79, 'train_steps_per_second': 20.115, 'total_flos': 454224414765732.0, 'train_loss': 0.008899580046286775, 'epoch': 5.0})

In [None]:
# Replace the trained student with a freshly loaded one so the next run does
# not start from the previous run's weights.
del model
model = create_model()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-uncased_L-8_H-256_A-4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Distillation run with temperature 2; all other settings match the previous run.
# NOTE(review): every distillation run in this notebook writes checkpoints to
# the same output_dir — confirm earlier runs' checkpoints are not needed.
distillation_training_args = PretrainedDistillationTrainingArguments(
    output_dir="tmp/sst2-distillation",
    learning_rate=2e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch (by validation
    # accuracy) can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    push_to_hub=False,
    temperature=2.0,
)

In [60]:
# Rebuild the trainer so it picks up the current `model` and the current
# `distillation_training_args`.
distillation_trainer = PretrainedDistillationTrainer(
    model=model,
    teacher_model=teacher_model,
    args=distillation_training_args,
    train_dataset=tokenized_sst2["train"],
    eval_dataset=tokenized_sst2["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [61]:
# Run the distillation configured above.
# NOTE(review): the saved output of this cell has no per-epoch rows — re-run
# or remove the cell if its results are meant to be compared.
distillation_trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=21050, training_loss=0.040827941124342966, metrics={'train_runtime': 1046.3804, 'train_samples_per_second': 321.819, 'train_steps_per_second': 20.117, 'total_flos': 454224414765732.0, 'train_loss': 0.040827941124342966, 'epoch': 5.0})

In [66]:
# Replace the trained student with a freshly loaded one so the next run does
# not start from the previous run's weights.
del model
model = create_model()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-uncased_L-8_H-256_A-4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
# Distillation run with temperature 2 (same settings as the preceding cell of
# this kind).
# NOTE(review): every distillation run in this notebook writes checkpoints to
# the same output_dir — confirm earlier runs' checkpoints are not needed.
distillation_training_args = PretrainedDistillationTrainingArguments(
    output_dir="tmp/sst2-distillation",
    learning_rate=2e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch (by validation
    # accuracy) can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    push_to_hub=False,
    temperature=2.0,
)

In [68]:
# Rebuild the trainer so it picks up the current `model` and the current
# `distillation_training_args`.
distillation_trainer = PretrainedDistillationTrainer(
    model=model,
    teacher_model=teacher_model,
    args=distillation_training_args,
    train_dataset=tokenized_sst2["train"],
    eval_dataset=tokenized_sst2["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [69]:
# Run the distillation configured above (temperature 2, 5 epochs).
distillation_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2016,0.185791,0.858945
2,0.1362,0.18907,0.862385
3,0.0869,0.164493,0.880734
4,0.079,0.178829,0.87844
5,0.0601,0.172882,0.888761


TrainOutput(global_step=21050, training_loss=0.13676211759200288, metrics={'train_runtime': 1048.7449, 'train_samples_per_second': 321.093, 'train_steps_per_second': 20.072, 'total_flos': 454224414765732.0, 'train_loss': 0.13676211759200288, 'epoch': 5.0})

In [33]:
# Alternative teacher: MobileBERT with a 2-label head, wrapped with LoRA and
# loaded from a local file in the following cells.
# NOTE(review): a later cell rebinds `teacher_model` to BERT-base before any
# training cell runs in notebook order — confirm which teacher is intended.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    "google/mobilebert-uncased",
    num_labels=2,
)

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from lora import wrap_bert_model_with_lora

In [35]:
# Wrap the teacher using the project-local `lora` helper (rank 8, alpha 8) so
# its parameter names match the state dict loaded in the next cell
# (presumably saved from a model wrapped the same way — confirm).
teacher_model = wrap_bert_model_with_lora(teacher_model, rank=8, alpha=8)

In [36]:
# Load the LoRA teacher weights from a local file; weights_only=True is passed to torch.load.
teacher_model.load_state_dict(torch.load("sst2-lora.pt", weights_only=True))

<All keys matched successfully>

In [25]:
# Rebind the teacher to BERT-base with a 2-label head.
# NOTE(review): in notebook order this replaces the LoRA-wrapped MobileBERT
# teacher built in the preceding cells — confirm which teacher the runs below
# are meant to use.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Load the BERT-base teacher weights from a local file; weights_only=True is passed to torch.load.
teacher_model.load_state_dict(torch.load("sst2-base.pt", weights_only=True))

<All keys matched successfully>

In [37]:
# The distillation trainer does not move the teacher, so place it on the GPU here.
teacher_model = teacher_model.to("cuda")

In [None]:
# Replace the trained student with a freshly loaded one so the next run does
# not start from the previous run's weights.
del model
model = create_model()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-uncased_L-8_H-256_A-4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# Distillation run with temperature 2 for 5 epochs.
# NOTE(review): every distillation run in this notebook writes checkpoints to
# the same output_dir — confirm earlier runs' checkpoints are not needed.
distillation_training_args = PretrainedDistillationTrainingArguments(
    output_dir="tmp/sst2-distillation",
    learning_rate=2e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch (by validation
    # accuracy) can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    push_to_hub=False,
    temperature=2.0,
)

In [39]:
# Rebuild the trainer so it picks up the current `model`, `teacher_model` and
# `distillation_training_args`.
distillation_trainer = PretrainedDistillationTrainer(
    model=model,
    teacher_model=teacher_model,
    args=distillation_training_args,
    train_dataset=tokenized_sst2["train"],
    eval_dataset=tokenized_sst2["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [40]:
# Run the distillation configured above (temperature 2, 5 epochs).
distillation_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0819,0.088678,0.880734
2,0.0454,0.067668,0.885321
3,0.034,0.060648,0.896789
4,0.0283,0.059582,0.892202
5,0.0238,0.056708,0.895642


TrainOutput(global_step=21050, training_loss=0.06234071564504483, metrics={'train_runtime': 1516.0025, 'train_samples_per_second': 222.127, 'train_steps_per_second': 13.885, 'total_flos': 454224414765732.0, 'train_loss': 0.06234071564504483, 'epoch': 5.0})

In [41]:
# Replace the trained student with a freshly loaded one so the next run does
# not start from the previous run's weights.
del model
model = create_model()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-uncased_L-8_H-256_A-4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Distillation run with temperature 2, extended to 10 epochs.
# NOTE(review): every distillation run in this notebook writes checkpoints to
# the same output_dir — confirm earlier runs' checkpoints are not needed.
distillation_training_args = PretrainedDistillationTrainingArguments(
    output_dir="tmp/sst2-distillation",
    learning_rate=2e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch (by validation
    # accuracy) can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    push_to_hub=False,
    temperature=2.0,
)

In [43]:
# Rebuild the trainer so it picks up the current `model` and the current
# `distillation_training_args`.
distillation_trainer = PretrainedDistillationTrainer(
    model=model,
    teacher_model=teacher_model,
    args=distillation_training_args,
    train_dataset=tokenized_sst2["train"],
    eval_dataset=tokenized_sst2["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [44]:
# Run the distillation configured above (temperature 2, 10 epochs).
distillation_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1006,0.100616,0.860092
2,0.0505,0.071937,0.884174
3,0.0345,0.068676,0.894495
4,0.0284,0.062725,0.885321
5,0.0215,0.058561,0.899083
6,0.0183,0.055581,0.900229
7,0.0154,0.053376,0.901376
8,0.0149,0.052535,0.90711
9,0.0133,0.053416,0.905963
10,0.0119,0.053279,0.900229


TrainOutput(global_step=42100, training_loss=0.04210776954535351, metrics={'train_runtime': 3043.128, 'train_samples_per_second': 221.315, 'train_steps_per_second': 13.834, 'total_flos': 907846567642716.0, 'train_loss': 0.04210776954535351, 'epoch': 10.0})

In [45]:
# Number of trainable parameters in the student model.
sum(weight.numel() for weight in model.parameters() if weight.requires_grad)

14330114

In [23]:
# Supervised baseline: plain fine-tuning on the SST-2 labels with the same
# optimizer settings as the 5-epoch distillation runs.
training_args = TrainingArguments(
    output_dir="tmp/sst2-compact",
    learning_rate=2e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch (by validation
    # accuracy) can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    push_to_hub=False,
)

In [24]:
# Start the supervised baseline from a freshly loaded compact model. Without
# this, running the notebook top to bottom would fine-tune the student that
# the preceding distillation run already trained, making the comparison
# with distillation meaningless.
model = create_model()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_sst2["train"],
    eval_dataset=tokenized_sst2["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [25]:
# Fine-tune on the SST-2 labels with the arguments configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3041,0.353995,0.864679
2,0.2346,0.372156,0.875
3,0.1751,0.414538,0.884174
4,0.1631,0.455341,0.877294
5,0.1487,0.491209,0.875


TrainOutput(global_step=21050, training_loss=0.2303163706265266, metrics={'train_runtime': 543.0132, 'train_samples_per_second': 620.141, 'train_steps_per_second': 38.765, 'total_flos': 454224414765732.0, 'train_loss': 0.2303163706265266, 'epoch': 5.0})

In [42]:
# Supervised baseline variant: 10x higher learning rate (2e-4) and 10x higher
# weight decay (0.1) than the previous baseline.
# NOTE(review): shares output_dir with the previous baseline run — confirm its
# checkpoints are not needed.
training_args = TrainingArguments(
    output_dir="tmp/sst2-compact",
    learning_rate=2e-4,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.1,
    # Evaluate and checkpoint once per epoch so the best epoch (by validation
    # accuracy) can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    push_to_hub=False,
)

In [43]:
# Start this hyperparameter variant from a freshly loaded compact model.
# Without this, running the notebook top to bottom would continue from the
# weights already fine-tuned by the previous baseline run.
model = create_model()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_sst2["train"],
    eval_dataset=tokenized_sst2["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [44]:
# Fine-tune with the higher learning rate configured above.
# NOTE(review): the saved output ends in a KeyboardInterrupt after epoch 2 —
# re-run to completion or remove this experiment before sharing.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3461,0.47628,0.808486
2,0.3096,0.567677,0.784404


KeyboardInterrupt: 