In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

  _torch_pytree._register_pytree_node(
2024-08-16 00:32:00.291809: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-16 00:32:00.408065: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-16 00:32:00.408995: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  _torch_pytree._register_pytree_node(


In [4]:
model_checkpoint = 'distilbert-base-uncased'

id2label = {0: 'Negative', 1: 'Positive'}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=len(id2label))




model.safetensors:  74%|#######4  | 199M/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
dataset = load_dataset('shawhin/imdb-truncated')
dataset

Downloading readme:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

def tokenize_function(examples):
    text = examples['text']
    tokenizer.truncation_side = 'left'

    return tokenizer(
        text,
        return_tensors='np',
        truncation=True,
        max_length=512,
    )

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load('accuracy')

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy': accuracy.compute(predictions=predictions, references=labels)}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [12]:
sample_text = [
    "It was terrible",
    "This film is great",
    "Not worth watching even once",
    "Best film ever",
    "Better than the first one",
    "Great choice if you want to waste your time",
    "This is a pass",
    "Intricate and beautiful",
]

print('Untrained predictions:')
print('-'*20)

for text in sample_text:
    inputs = tokenizer.encode(text, return_tensors='pt')

    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + ' - ' + id2label[predictions.tolist()])

Untrained predictions:
--------------------
It was terrible - Positive
This film is great - Positive
Not worth watching even once - Positive
Best film ever - Positive
Better than the first one - Positive
Great choice if you want to waste your time - Positive
This is a pass - Positive
Intricate and beautiful - Positive


In [17]:
peft_config = LoraConfig(task_type='SEQ_CLS',
                        r=4,
                        lora_alpha=0.5,
                        lora_dropout=0.01,
                        target_modules=['q_lin'],
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

lr = 0.001
batch_size = 4
num_epochs = 3

training_args = TrainingArguments(
    output_dir=model_checkpoint + '-lora-text-classification',
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


  0%|          | 0/750 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.7293, 'learning_rate': 0.0009866666666666667, 'epoch': 0.04}
{'loss': 0.8629, 'learning_rate': 0.0009733333333333334, 'epoch': 0.08}
{'loss': 0.7215, 'learning_rate': 0.00096, 'epoch': 0.12}
{'loss': 0.6931, 'learning_rate': 0.0009466666666666667, 'epoch': 0.16}
{'loss': 0.6209, 'learning_rate': 0.0009333333333333333, 'epoch': 0.2}
{'loss': 0.5877, 'learning_rate': 0.00092, 'epoch': 0.24}
{'loss': 0.5334, 'learning_rate': 0.0009066666666666666, 'epoch': 0.28}
{'loss': 0.5183, 'learning_rate': 0.0008933333333333333, 'epoch': 0.32}
{'loss': 0.5172, 'learning_rate': 0.00088, 'epoch': 0.36}
{'loss': 0.3398, 'learning_rate': 0.0008666666666666667, 'epoch': 0.4}
{'loss': 0.4424, 'learning_rate': 0.0008533333333333334, 'epoch': 0.44}
{'loss': 0.7029, 'learning_rate': 0.00084, 'epoch': 0.48}
{'loss': 0.351, 'learning_rate': 0.0008266666666666666, 'epoch': 0.52}
{'loss': 0.4129, 'learning_rate': 0.0008133333333333333, 'epoch': 0.56}
{'loss': 0.5478, 'learning_rate': 0.0008, 'epoch': 

  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.868}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.4001295268535614, 'eval_accuracy': {'accuracy': 0.868}, 'eval_runtime': 11.4126, 'eval_samples_per_second': 87.623, 'eval_steps_per_second': 21.906, 'epoch': 1.0}
{'loss': 0.1628, 'learning_rate': 0.0006533333333333333, 'epoch': 1.04}
{'loss': 0.418, 'learning_rate': 0.00064, 'epoch': 1.08}
{'loss': 0.3456, 'learning_rate': 0.0006266666666666668, 'epoch': 1.12}
{'loss': 0.1987, 'learning_rate': 0.0006133333333333334, 'epoch': 1.16}
{'loss': 0.2395, 'learning_rate': 0.0006, 'epoch': 1.2}
{'loss': 0.1368, 'learning_rate': 0.0005866666666666667, 'epoch': 1.24}
{'loss': 0.2601, 'learning_rate': 0.0005733333333333334, 'epoch': 1.28}
{'loss': 0.3229, 'learning_rate': 0.0005600000000000001, 'epoch': 1.32}
{'loss': 0.2135, 'learning_rate': 0.0005466666666666667, 'epoch': 1.36}
{'loss': 0.3367, 'learning_rate': 0.0005333333333333334, 'epoch': 1.4}
{'loss': 0.473, 'learning_rate': 0.0005200000000000001, 'epoch': 1.44}
{'loss': 0.4849, 'learning_rate': 0.0005066666666666668, 'epoc

  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.889}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.33785662055015564, 'eval_accuracy': {'accuracy': 0.889}, 'eval_runtime': 11.9569, 'eval_samples_per_second': 83.634, 'eval_steps_per_second': 20.908, 'epoch': 2.0}
{'loss': 0.3313, 'learning_rate': 0.00032, 'epoch': 2.04}
{'loss': 0.1662, 'learning_rate': 0.0003066666666666667, 'epoch': 2.08}
{'loss': 0.1909, 'learning_rate': 0.0002933333333333333, 'epoch': 2.12}
{'loss': 0.4892, 'learning_rate': 0.00028000000000000003, 'epoch': 2.16}
{'loss': 0.2353, 'learning_rate': 0.0002666666666666667, 'epoch': 2.2}
{'loss': 0.1457, 'learning_rate': 0.0002533333333333334, 'epoch': 2.24}
{'loss': 0.4721, 'learning_rate': 0.00024, 'epoch': 2.28}
{'loss': 0.0937, 'learning_rate': 0.00022666666666666666, 'epoch': 2.32}
{'loss': 0.1637, 'learning_rate': 0.00021333333333333336, 'epoch': 2.36}
{'loss': 0.2804, 'learning_rate': 0.0002, 'epoch': 2.4}
{'loss': 0.456, 'learning_rate': 0.0001866666666666667, 'epoch': 2.44}
{'loss': 0.4029, 'learning_rate': 0.00017333333333333334, 'epoch': 2.48

  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.9}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.29631802439689636, 'eval_accuracy': {'accuracy': 0.9}, 'eval_runtime': 12.1839, 'eval_samples_per_second': 82.075, 'eval_steps_per_second': 20.519, 'epoch': 3.0}
{'train_runtime': 112.4103, 'train_samples_per_second': 26.688, 'train_steps_per_second': 6.672, 'train_loss': 0.3764014965693156, 'epoch': 3.0}


TrainOutput(global_step=750, training_loss=0.3764014965693156, metrics={'train_runtime': 112.4103, 'train_samples_per_second': 26.688, 'train_steps_per_second': 6.672, 'train_loss': 0.3764014965693156, 'epoch': 3.0})

In [19]:
model.to('cpu')

print('Trained predictions:')
print('-'*20)
for text in sample_text:
    inputs = tokenizer.encode(text, return_tensors='pt').to('cpu')
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + ' - ' + id2label[predictions.tolist()])

Trained predictions:
--------------------
It was terrible - Negative
This film is great - Positive
Not worth watching even once - Positive
Best film ever - Positive
Better than the first one - Positive
Great choice if you want to waste your time - Positive
This is a pass - Positive
Intricate and beautiful - Positive
