In [1]:
pip install datasets transformers peft evaluate torch numpy

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

**Step:1 - Base model**

In [3]:
# Base checkpoint to fine-tune.
model_checkpoint = 'distilbert-base-uncased'

# Class index -> human-readable label, and the inverse mapping derived from it
# so the two dictionaries cannot drift apart.
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: index for index, name in id2label.items()}
# Load the base checkpoint with a two-class sequence-classification head and
# attach the label mappings to the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Step:2 - Load data**


In [4]:
# Load the "shawhin/imdb-truncated" dataset (train and validation splits).
dataset = load_dataset("shawhin/imdb-truncated")
# Bare expression so the notebook displays the loaded splits.
dataset

README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

data/train-00000-of-00001-5a744bf76a1d84(…):   0%|          | 0.00/836k [00:00<?, ?B/s]

data/validation-00000-of-00001-a3a52fabb(…):   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

**Step:3 - Preprocess data**


In [5]:
# Load the tokenizer that belongs to the base checkpoint.
# NOTE(review): add_prefix_space is presumably a no-op for this vocab.txt-based
# tokenizer — confirm whether it can be dropped.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw review
            strings of one batch (as passed by ``dataset.map(batched=True)``).

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.
        No padding is applied and no tensor type is requested: sequences stay
        as variable-length lists so that padding can happen per batch at
        collation time instead of forcing ragged NumPy object arrays here.
    """
    text = examples["text"]
    # Truncate from the left: over-long inputs lose their leading tokens.
    # Set on every call (idempotent) so the function does not depend on the
    # tokenizer having been configured elsewhere.
    tokenizer.truncation_side = "left"
    return tokenizer(
        text,
        truncation=True,
        max_length=512,
    )
# Make sure a padding token exists; if one has to be added, grow the model's
# embedding matrix to match the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Tokenize every split in batches; the result gains 'input_ids' and
# 'attention_mask' columns next to the original 'label' and 'text'.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Bare expression so the notebook displays the tokenized splits.
tokenized_dataset

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [6]:
# Collator built around the tokenizer; it is handed to the Trainer so batches
# are padded at collation time (tokenization above applies no padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


**Step:4 - Evaluation metrics**


In [7]:
# Load the "accuracy" metric once; compute_metrics below reuses it.
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)`` where ``predictions`` holds the
            per-class scores for each example (classes along axis 1) and
            ``labels`` holds the reference class indices.

    Returns:
        The flat metric dictionary produced by the accuracy metric, i.e.
        ``{"accuracy": <float>}``. The metric's own result is already a dict,
        so it is returned as-is instead of being nested under another
        ``"accuracy"`` key.
    """
    predictions, labels = p
    # Collapse per-class scores to the index of the highest-scoring class.
    predictions = np.argmax(predictions, axis=1)

    return accuracy.compute(predictions=predictions, references=labels)


Downloading builder script: 0.00B [00:00, ?B/s]

**Step:5 - Untrained model performance**


In [8]:
# Sample reviews used to eyeball the model's predictions; the same list is
# reused after fine-tuning for comparison.
text_list = ["It was good.", "Not a fan, don't recommed.",
"Better than the first one.", "This is not worth watching even once.",
"This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")

# Inference only: switch off dropout so predictions are deterministic, and
# skip gradient tracking since nothing is back-propagated here.
model.eval()
with torch.no_grad():
    for text in text_list:
        inputs = tokenizer.encode(text, return_tensors="pt")
        logits = model(inputs).logits
        # Pick the best class along the class axis (not over the flattened
        # tensor, which only coincides for a batch of one).
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + id2label[predictions.item()])

Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


**Step:6 - Fine-tuning with LoRA**


In [9]:
# LoRA settings for sequence classification, collected in one place and then
# unpacked into the config object.
lora_kwargs = {
    "task_type": "SEQ_CLS",
    "r": 4,
    "lora_alpha": 32,
    "lora_dropout": 0.01,
    "target_modules": ['q_lin'],
}
peft_config = LoraConfig(**lora_kwargs)


In [10]:
# Wrap the base model with the LoRA configuration; `model` now refers to the
# PEFT-wrapped model used for training below.
model = get_peft_model(model, peft_config)
# Report how many parameters are trainable versus the total.
model.print_trainable_parameters()


trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [11]:
# NOTE(review): transformers was already imported at the top of the notebook,
# so an upgrade here presumably only takes effect after a kernel restart —
# confirm, and consider moving this into the first install cell.
%pip install --upgrade transformers




In [12]:
# NOTE: the In[14] cell further down rebuilds `training_args` with the same
# values plus report_to="none"; this cell is kept for the notebook's history.
from transformers import TrainingArguments
import torch

print("TrainingArguments imported successfully!")

# Hyperparameters
lr, batch_size, num_epochs = 1e-3, 4, 10
model_checkpoint = "distilbert-base-uncased"

# Evaluate and checkpoint every 100 optimizer steps.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
)

print("TrainingArguments created successfully!")

TrainingArguments imported successfully!
TrainingArguments created successfully!


In [14]:
from transformers import TrainingArguments, Trainer
import torch
import os

# Disable Weights & Biases logging (also enforced via report_to="none" below).
os.environ["WANDB_DISABLED"] = "true"

print("TrainingArguments imported successfully!")

# Hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10
model_checkpoint = "distilbert-base-uncased"

# Evaluate and checkpoint every 100 optimizer steps; no external reporting.
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    report_to="none",
)

print("TrainingArguments created successfully!")

print("Trainer imported successfully!")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated on Trainer; `processing_class` is the
    # replacement argument and avoids the deprecation warning.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

TrainingArguments imported successfully!
TrainingArguments created successfully!
Trainer imported successfully!


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
100,No log,0.328517,{'accuracy': 0.875}
200,No log,0.345759,{'accuracy': 0.874}
300,No log,0.46656,{'accuracy': 0.877}
400,No log,0.343968,{'accuracy': 0.901}
500,0.433400,0.398207,{'accuracy': 0.875}
600,0.433400,0.561355,{'accuracy': 0.877}
700,0.433400,0.602739,{'accuracy': 0.866}
800,0.433400,0.512463,{'accuracy': 0.89}
900,0.433400,0.652549,{'accuracy': 0.872}
1000,0.198900,0.585687,{'accuracy': 0.89}


TrainOutput(global_step=2500, training_loss=0.14263318557739257, metrics={'train_runtime': 636.278, 'train_samples_per_second': 15.716, 'train_steps_per_second': 3.929, 'total_flos': 1112883852759936.0, 'train_loss': 0.14263318557739257, 'epoch': 10.0})

**Step:7 - Trained model performance**


In [16]:
# Run inference on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

model.to(device)
# Inference only: disable dropout so predictions are deterministic.
model.eval()

print("Trained model predictions:")
print("--------------------------")
# No gradients are needed for prediction; skip building autograd graphs.
with torch.no_grad():
    for text in text_list:
        # Inputs must live on the same device as the model.
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)

        logits = model(inputs).logits
        # Index of the highest-scoring class along the class axis.
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + id2label[predictions.item()])

Using device: cuda
Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Positive
