# Text Classification (LLM FINE-TUNING)

In [None]:
!pip install datasets transformers evaluate torch peft wandb #installing needed packages

### IMPORT STATMENTS & DATA LOADING

In [5]:
import os
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

About the dataset : The **dair-ai/emotion** is an incredibly useful resource for the field of Natural Language Processing. It has been specifically designed to aid in emotion recognition tasks. The dataset is comprised of a large number of text entries, each of which has been tagged with one of six primary emotion labels: joy, sadness, anger, fear, love, and surprise. This labelling system is incredibly valuable, as it enables the development and training of models that can accurately identify and understand the emotional nuances present in written text.


In [6]:
# load dataset
dataset = load_dataset("dair-ai/emotion", trust_remote_code=True)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

### MODEL TRAINING

In [10]:
model_checkpoint = 'distilbert-base-uncased' # Model size: 67M params.

# defining label maps.
id2label = {0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"}
label2id = {"sadness":0, "joy":1, "love": 2, "anger":3, "fear":4, "surprise": 5}

# generating classification model from the model checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=6, id2label=id2label, label2id=label2id, force_download=True)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
print("MODEL ARCHITECTURE: ", model)

MODEL ARCHITECTURE:  DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=

In [12]:
# Initialize the tokenizer from a pre-trained model checkpoint
# `add_prefix_space=True` is often used with LLM to handle spaces before tokens properly
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True, force_download=True)

# Check if the tokenizer has a padding token defined; if not, add one
if tokenizer.pad_token is None:
    # Define the pad token; this is necessary for models that require explicit padding tokens
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # If the tokenizer's vocabulary is extended (e.g., by adding a pad token), resize the model's token embeddings
    # This step ensures the model's embeddings match the tokenizer's vocabulary size
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [13]:
# Define a tokenization function to process text data
def tokenize_function(examples):
    """
    Tokenizes text from the 'text' key in the input examples.

    Args:
        examples (dict): A dictionary containing text samples.

    Returns:
        dict: Tokenized inputs as NumPy arrays.
    """
    # Extract text data
    text = examples["text"]

    # Set truncation side and tokenize text
    tokenizer.truncation_side = "left"

    # Tokenize the text. This operation converts text into tokens or indices
    # that are understood by the model.
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512  # Maximum length of tokens to be generated. Adjust based on model and GPU memory.
    )
    return tokenized_inputs


In [14]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [15]:
# created a data collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# define prefered metric, in this case we use accuracy.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
# Define an evaluation function for use with a trainer
def compute_metrics(eval_data):
    """
    Calculates accuracy of model predictions against true labels.

    Args:
        eval_data (tuple): A tuple containing the model's predictions and the true labels.

    Returns:
        dict: A dictionary with the computed accuracy.
    """
    # Extract predictions and labels from the tuple
    predictions, labels = eval_data
    # Determine the most likely class from logits
    predictions = np.argmax(predictions, axis=1)

    # Calculate and return accuracy
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [19]:
# Define a list of emotion-laden text samples for evaluation
emotion_texts = [
    "I can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake", #sadness
    "I do not feel reassured anxiety is on each side", #joy
    "I now feel compromised and skeptical of the value of every unit of work I put in", #fear
    "i was ready to meet mom in the airport and feel her ever supportive arms around me", #love
    "I keep feeling pleasantly surprised at his supportiveness and also his ease in new situations", #surprise
    "im feeling bitter today my mood has been strange the entire day so i guess its that" #anger
]

print("Untrained model predictions:")
print("----------------------------")
for text in emotion_texts:
    # Tokenize the text to format suitable for the model
    inputs = tokenizer.encode(text, return_tensors="pt")

    # Generate logits (raw output from last layer before activation function)
    logits = model(inputs).logits

    # Determine the predicted class from logits
    predicted_label_index = torch.argmax(logits)

    # Convert the predicted label index to the corresponding label name
    label_name = id2label[predicted_label_index.tolist()]

    print(f"{text} -> {label_name}")


Untrained model predictions:
----------------------------
I can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake -> surprise
I do not feel reassured anxiety is on each side -> surprise
I now feel compromised and skeptical of the value of every unit of work I put in -> surprise
i was ready to meet mom in the airport and feel her ever supportive arms around me -> surprise
I keep feeling pleasantly surprised at his supportiveness and also his ease in new situations -> surprise
im feeling bitter today my mood has been strange the entire day so i guess its that -> surprise


### PRETRAINED EFFICIENT FINE-TUNING (PEFT): *A LoRA (Low-Rank Adaptation)*

In [20]:
# Configuration for PEFT using Low-Rank Adapters (LoRA)
peft_config = LoraConfig(
    task_type="SEQ_CLS",  # Specify the task type as Sequence Classification (multi-class)
    r=4,                  # Rank of the low-rank matrices in LoRA
    lora_alpha=32,        # Scaling factor for the low-rank adaptation
    lora_dropout=0.01,    # Dropout rate in the LoRA layers to prevent overfitting
    target_modules=['q_lin']  # Target model modules to apply LoRA, e.g., the query linear transformation
)

In [21]:
# Print the configuration to verify settings
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [22]:
model = get_peft_model(model, peft_config)

# Print the parameters of the model that are trainable after the PEFT configuration has been applied.
# This is useful for verifying which parts of the model can be updated during the training process
# and to ensure that the PEFT modifications were applied correctly.
model.print_trainable_parameters()

trainable params: 632,070 || all params: 67,590,156 || trainable%: 0.9351509708011326


In [23]:
# Hyperparameters for training the model

# Learning rate: Controls the step size during model updates
lr = 1e-3

# Batch size: Number of training samples processed before the model is updated
batch_size = 4

# Number of epochs: Total number of complete passes through the training dataset
num_epochs = 10


In [24]:
# Set up Weights & Bias for Model Training & Eval Monitoring
import wandb
wandb.login()
os.environ["WANDB_PROJECT"] = "emotion-text-classification"
wandb.init(project="emotion-text-classification", entity="stephanieekekwe")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[34m[1mwandb[0m: Currently logged in as: [33mstephanieekekwe[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [25]:
# Define training arguments for the Trainer
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",  # Directory to save output files
    learning_rate=lr,  # Learning rate for training
    per_device_train_batch_size=batch_size,  # Training batch size per device
    per_device_eval_batch_size=batch_size,  # Evaluation batch size per device
    num_train_epochs=num_epochs,  # Total number of training epochs
    weight_decay=0.01,  # L2 regularization to prevent overfitting
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
    report_to="wandb",  # Log training progress to Weights & Biases
    run_name="distilbert-base-uncased-high-lr",  # Name for the Weights & Biases run
    logging_steps=1,  # Log metrics to Weights & Biases after every step
)


In [26]:
# Initialize the Trainer object with configuration and datasets
trainer = Trainer(
    model=model,  # Pre-trained model to be fine-tuned
    args=training_args,  # Training configurations from the cell above.
    train_dataset=tokenized_dataset["train"],  # Training dataset
    eval_dataset=tokenized_dataset["validation"],  # Validation dataset for evaluation
    tokenizer=tokenizer,  # Tokenizer for preprocessing text data
    data_collator=data_collator,  # Function to dynamically pad batch data to equal length
    compute_metrics=compute_metrics,  # Function to compute metrics during training/validation
)

# Start training the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2,0.55623,{'accuracy': 0.871}
2,0.0254,0.680678,{'accuracy': 0.867}
3,0.4343,0.480164,{'accuracy': 0.883}
4,0.0002,0.51396,{'accuracy': 0.887}
5,1.0829,0.617638,{'accuracy': 0.89}
6,0.0024,0.496179,{'accuracy': 0.8965}
7,0.3345,0.465523,{'accuracy': 0.906}
8,0.9446,0.373996,{'accuracy': 0.9165}
9,0.0006,0.333437,{'accuracy': 0.9175}
10,0.0357,0.362341,{'accuracy': 0.9155}




TrainOutput(global_step=40000, training_loss=0.4869317439482604, metrics={'train_runtime': 12308.2623, 'train_samples_per_second': 12.999, 'train_steps_per_second': 3.25, 'total_flos': 1471518910155168.0, 'train_loss': 0.4869317439482604, 'epoch': 10.0})

In [27]:
model.to('cpu')

print("Trained model predictions:")
print("--------------------------")
for text in emotion_texts:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    # Generate logits (raw output from last layer before activation function)
    logits = model(inputs).logits

    # Determine the predicted class from logits
    predicted_label_index = torch.argmax(logits)

    # Convert the predicted label index to the corresponding label name
    label_name = id2label[predicted_label_index.tolist()]

    print(f"{text} -> {label_name}")

Trained model predictions:
--------------------------
I can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake -> sadness
I do not feel reassured anxiety is on each side -> joy
I now feel compromised and skeptical of the value of every unit of work I put in -> fear
i was ready to meet mom in the airport and feel her ever supportive arms around me -> love
I keep feeling pleasantly surprised at his supportiveness and also his ease in new situations -> surprise
im feeling bitter today my mood has been strange the entire day so i guess its that -> anger


In [28]:
# Your model evaluation code
test_results = trainer.evaluate(tokenized_dataset["test"])
print(test_results)

#log the results to wandb
wandb.log(test_results)

# Finish the wandb run
wandb.finish()


{'eval_loss': 0.32150763273239136, 'eval_accuracy': {'accuracy': 0.9125}, 'eval_runtime': 81.6411, 'eval_samples_per_second': 24.497, 'eval_steps_per_second': 6.124, 'epoch': 10.0}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁
eval/loss,▆█▄▅▇▄▄▂▁▂▁
eval/runtime,▂▁▃▄▃▆▅▂▁▆█
eval/samples_per_second,▇█▆▄▆▃▄▇█▂▁
eval/steps_per_second,▇█▆▄▆▃▄▇█▂▁
eval_loss,▁
eval_runtime,▁
eval_samples_per_second,▁
eval_steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
epoch,10.0
eval/loss,0.32151
eval/runtime,81.6411
eval/samples_per_second,24.497
eval/steps_per_second,6.124
eval_loss,0.32151
eval_runtime,81.6411
eval_samples_per_second,24.497
eval_steps_per_second,6.124
total_flos,1471518910155168.0
