# Lightweight Fine-Tuning Project

## TODO: In this cell, describe your choices for each of the following

* PEFT technique: LoRA
* Model: gpt2
* Evaluation approach:Transformer trainer 
* Fine-tuning dataset: SST-2

In [5]:
!pip install transformers
!pip install peft
!pip install datasets
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install tqdm



In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from transformers import DataCollatorWithPadding
from peft import LoraConfig, PeftModelForSequenceClassification, TaskType, AutoPeftModelForSequenceClassification
from datasets import Dataset,load_dataset
import numpy as np
import pandas as pd
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

In [9]:
from IPython.display import display

In [8]:
df = pd.read_csv("twitter.csv")

display(df.head())

Unnamed: 0.1,Unnamed: 0,tweet,emoji
0,0,bet you'll get hungry,heart_eyes
1,1,starbucks employee confuses boyfriend by sayin...,yum
2,2,when your starbucks store makes you an iced mo...,sob
3,3,"being told ""girl your romper looks fierce!"" at...",blush
4,4,"i got a starbucks drink at school today, shit ...",sob


In [12]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='twitter.csv')

Generating train split: 0 examples [00:00, ? examples/s]

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [21]:
# Define a function to preprocess the dataset and convert emojis to IDs
def preprocess_data(example):
    unique_emojis = list(set(example['emoji']))
    emoji2id = {emoji: i for i, emoji in enumerate(unique_emojis)}
    example['label'] = [emoji2id[emoji] for emoji in example['emoji']]
    return example

# Apply preprocessing to convert emojis to labels
dataset = dataset.map(preprocess_data)

# Split the dataset into training and validation sets
train_test_split_ratio = 0.9
train_dataset, val_dataset = dataset['train'].train_test_split(test_size=1-train_test_split_ratio).values()

# Load a tokenizer from the transformers library
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Ensure the tokenizer has a pad token; if not, set it
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = '[PAD]'

# Define a function to tokenize and encode the dataset
def tokenize_and_encode(examples):
    tokenized_inputs = tokenizer(
        examples['tweet'],
        padding="max_length",
        truncation=True,
        max_length=128  # Adjusted max_length for typical tweet length
    )
    tokenized_inputs['labels'] = examples['label']
    return tokenized_inputs

# Tokenize and encode the training and validation datasets
train_dataset = train_dataset.map(tokenize_and_encode, batched=True)
val_dataset = val_dataset.map(tokenize_and_encode, batched=True)

# Output to verify the first example in each set
print("First training example:", train_dataset[0])
print("First validation example:", val_dataset[0])

Map:   0%|          | 0/225331 [00:00<?, ? examples/s]

Map:   0%|          | 0/202797 [00:00<?, ? examples/s]

Map:   0%|          | 0/22534 [00:00<?, ? examples/s]

First training example: {'Unnamed: 0': 155452, 'tweet': "i bet they'll love it", 'emoji': 'blush', 'label': [0, 1, 4, 2, 3], 'input_ids': [101, 1045, 6655, 2027, 1005, 2222, 2293, 2009, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0

In [7]:
num_labels = len(emoji2id)

# Load the pre-trained model with the specified number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2", 
    num_labels=num_labels
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model.config.pad_token_id = tokenizer.pad_token_id

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the compute metrics function
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [10]:
from transformers import TrainingArguments, Trainer

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
    warmup_ratio=0.1,
)

# Initialize the Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [11]:
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Evaluation Results: {'eval_loss': 4.871279716491699, 'eval_accuracy': 0.17373746338865714, 'eval_f1': 0.0516624114305804, 'eval_precision': 0.14234237501388408, 'eval_recall': 0.17373746338865714, 'eval_runtime': 77516.0005, 'eval_samples_per_second': 0.291, 'eval_steps_per_second': 0.005}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
print(model)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=10, bias=False)
)


In [13]:
model.score

Linear(in_features=768, out_features=10, bias=False)

## Load Tokenizer and tokenize the dataset

In [6]:


# Load the pre-trained GPT-2 model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=len(emoji2id))
model.config.pad_token_id = model.config.eos_token_id  # Set padding token ID to EOS token ID

# Configure LoRA for the model
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=4,              # Rank of the approximation
    lora_alpha=16,    # LoRA alpha parameter, scaling the learning rate of LoRA parameters
    lora_dropout=0.1  # Dropout rate for LoRA layers
)

# Wrap the GPT-2 model with the PEFT configuration
peft_model = PeftModelForSequenceClassification(model, peft_config)

# Print trainable parameters (adjust based on actual implementation details)
try:
    peft_model.print_trainable_parameters()
except AttributeError:
    # If the PEFT package does not have 'print_trainable_parameters', manually iterate through parameters
    for name, param in peft_model.named_parameters():
        if param.requires_grad:
            print(name, param.data.size())


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 155,136 || all params: 124,602,624 || trainable%: 0.1245




In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    return {"accuracy": accuracy_score(p.label_ids, preds), "f1": f1, "precision": precision, "recall": recall}

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results/peft_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs/peft_model',
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
    warmup_ratio=0.1,
)

# Initialize the Trainer with compute_metrics
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [11]:
peft_model.save_pretrained("./peft_model")

In [24]:
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

[6338/6338 2:46:28]
 Epoch 1/1 Epoch: 1, Training Loss: 1.9241, Validation Loss: 1.9065, Accuracy: 33.87%, F1: 28.86%, Precision: 40.97%, Recall: 33.87% [353/353 03:39]
 Evaluation Results: {'eval_loss': 1.9065, 'eval_accuracy': 33.87%, 'eval_f1': 28.86%, 'eval_precision': 40.97%, 'eval_recall': 33.87%, 'eval_runtime': 219.9013 s, 'eval_samples_per_second': 102.473, 'eval_steps_per_second': 1.605, 'epoch': 1}


## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [16]:
inference_model = AutoPeftModelForSequenceClassification.from_pretrained(
    "model/peft_model",  
    num_labels=len(emoji2id)
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
inference_model.config.pad_token_id = inference_model.config.eos_token_id


In [18]:
trainer = Trainer(
    model=inference_model,
    args=training_args,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [25]:
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

[353/353 08:15]
 Evaluation Results: {'eval_loss': 1.9065, 'eval_accuracy': 33.87%, 'eval_f1': 28.86%, 'eval_precision': 40.97%, 'eval_recall': 33.87%, 'eval_runtime': 219.28 s, 'eval_samples_per_second': 102.764, 'eval_steps_per_second': 1.61}


Result - This project showed that gpt-2 shows more accurate and effective results. 