# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

## Loading Dataset

In [1]:
!pip install -q "datasets==2.15.0"

In [2]:
from datasets import load_dataset

# Load the train and test split
splits = ["train", "test"]
ds = {split: ds for split, ds in zip(splits, load_dataset("ag_news", split=splits))}

for split in splits:
    ds[split] = ds[split].shuffle(seed=42).select(range(500))

ds

{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 500
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 500
 })}

## Loading Tokenizer

In [3]:
# Preprocessing the dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """preprocess the dataset by returning the tokenized examples
    """
    
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True)
    return tokens

In [4]:
# testing the tokenizer function

tokenized_ds = {}
for split in splits:
    tokenized_ds[split] = ds[split].map(preprocess_function, batched=True)
    
tokenized_ds["train"][0]["input_ids"][:10]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

[101, 7269, 11498, 2135, 6924, 2011, 9326, 4559, 10134, 2031]

## Load and Evaluating the foundational model

In [5]:
# Loading a pre-trained model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=4,
    id2label={0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"},
    label2id={"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3}
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Freeze all the parameters in the base model
for param in model.base_model.parameters():
    param.requires_grad = False
    
model.classifier

Linear(in_features=768, out_features=4, bias=True)

In [7]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [8]:
# Defining the evaluation metric

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

In [9]:
# Training the foundational model
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/news_classification",
        learning_rate=2e-3,
        
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        
        num_train_epochs=1,
        weight_decay=0.01,
        
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.472183,0.842


Checkpoint destination directory ./data/news_classification/checkpoint-32 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=32, training_loss=0.6697850227355957, metrics={'train_runtime': 19.647, 'train_samples_per_second': 25.449, 'train_steps_per_second': 1.629, 'total_flos': 66236061696000.0, 'train_loss': 0.6697850227355957, 'epoch': 1.0})

In [10]:
# Evaluating the model
trainer.evaluate()

{'eval_loss': 0.47218263149261475,
 'eval_accuracy': 0.842,
 'eval_runtime': 8.5705,
 'eval_samples_per_second': 58.34,
 'eval_steps_per_second': 3.734,
 'epoch': 1.0}

In [11]:
# View some results
import pandas as pd

df = pd.DataFrame(tokenized_ds["test"])
df = df[["text", "label"]]

# Replace <br /> tags in the text with spaces
df["text"] = df["text"].str.replace("<br />", " ")

# Add the model predictions to the dataframe
predictions = trainer.predict(tokenized_ds["test"])
df["predicted_label"] = np.argmax(predictions[0], axis=1)

df.head(2)

Unnamed: 0,text,label,predicted_label
0,Indian board plans own telecast of Australia s...,1,0
1,Stocks Higher on Drop in Jobless Claims A shar...,2,2


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [12]:
model.classifier

Linear(in_features=768, out_features=4, bias=True)

In [13]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [14]:
# Instantiating a LoRA configuration, a subclass used for the low rank adaptation
from peft import LoraConfig
peft_config = LoraConfig(target_modules=[
        "q_lin",
        "k_lin",
        "v_lin",
        "out_lin",
])

In [15]:
# Converting the transformer model into PEFT model
from peft import get_peft_model
lora_model = get_peft_model(model, peft_config)

In [16]:
lora_model.print_trainable_parameters()

trainable params: 294,912 || all params: 67,251,460 || trainable%: 0.43852133470410903


In [17]:
# unfreeze all the model parameter
for param in lora_model.parameters():
    param.requires_grad = True

In [18]:
print(lora_model)

PeftModel(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(
                  in_features=768, out_features=768, bias=True
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=False)
                  )
                  (lo

In [19]:
# Fine tuning the model

import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir=".data/news_classification",
        
        learning_rate = 2e-5,
        
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        
        num_train_epochs=1,
        weight_decay=0.01,
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        
        load_best_model_at_end = True,
    ),
    
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']

In [None]:
# Evaluating the fine-tuned model
trainer.evaluate()

In [None]:
# View some results
import pandas as pd

df = pd.DataFrame(tokenized_ds["test"])
df = df[["text", "label"]]

# Replace <br /> tags in the text with spaces
df["text"] = df["text"].str.replace("<br />", " ")

# Add the model predictions to the dataframe
predictions = trainer.predict(tokenized_ds["test"])
df["predicted_label"] = np.argmax(predictions[0], axis=1)

df.head(2)

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.