<a href="https://www.kaggle.com/code/sharooqfarzeenak/fine-tuning-gpt-2-to-recognize-tweet-sentiments?scriptVersionId=211178032" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [61]:
# !pip install datasets pandas transformers evaluate numpy

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


## Loading and analyzing the dataset

In [36]:
from datasets import load_dataset

In [37]:
dataset = load_dataset("mteb/tweet_sentiment_extraction")

In [38]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 3534
    })
})

In [39]:
dataset['train']

Dataset({
    features: ['id', 'text', 'label', 'label_text'],
    num_rows: 27481
})

In [40]:
import pandas as pd

df = pd.DataFrame(dataset['train'])

In [41]:
df.head()

Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative


In [42]:
df.columns

Index(['id', 'text', 'label', 'label_text'], dtype='object')

## Tokenizing

In [43]:
from transformers import GPT2Tokenizer

In [47]:
# Loading the dataset to train our model
dataset = load_dataset("mteb/tweet_sentiment_extraction")

In [44]:
# Initializing the gpt-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [48]:
def tokenize(element):
   return tokenizer(element["text"], padding="max_length", truncation=True)

tokenized_data = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/27481 [00:00<?, ? examples/s]

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [49]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 3534
    })
})

In [51]:
# Extracting train and eval sets from tokenized_data, which is a Dataset dictionary
# and trimming the dataset to 1000 because of performance limitations
tokenized_train_dataset = tokenized_data["train"].shuffle(seed=42).select(range(1000))
tokenized_eval_dataset = tokenized_data["test"].shuffle(seed=42).select(range(1000))

## Initializing our model3)

In [55]:
# Number of labels in our classification
num_labels = len(df['label'].unique())

In [52]:
from transformers import GPT2ForSequenceClassification

# Loading GPT-2 model for fine-tuning
# number of labels = number of emotions in our dataset
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=num_labels)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

## Creating evaluation method to pass on to trainer

In [62]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## Fine-Tuning

In [63]:
from transformers import TrainingArguments, Trainer

#### TrainingArguments

##### Parameters

output_dir (str) – The output directory where the model predictions and checkpoints will be written.

overwrite_output_dir (bool, optional, defaults to False) – If True, overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint directory.

do_train (bool, optional, defaults to False) – Whether to run training or not.

do_eval (bool, optional, defaults to False) – Whether to run evaluation on the dev set or not.

do_predict (bool, optional, defaults to False) – Whether to run predictions on the test set or not.

evaluate_during_training (bool, optional, defaults to False) – Whether to run evaluation during training at each logging step or not.

per_device_train_batch_size (int, optional, defaults to 8) – The batch size per GPU/TPU core/CPU for training.

per_device_eval_batch_size (int, optional, defaults to 8) – The batch size per GPU/TPU core/CPU for evaluation.

gradient_accumulation_steps – (int, optional, defaults to 1): Number of updates steps to accumulate the gradients for, before performing a backward/update pass.

learning_rate (float, optional, defaults to 5e-5) – The initial learning rate for Adam.

weight_decay (float, optional, defaults to 0) – The weight decay to apply (if not zero).

adam_epsilon (float, optional, defaults to 1e-8) – Epsilon for the Adam optimizer.

max_grad_norm (float, optional, defaults to 1.0) – Maximum gradient norm (for gradient clipping).

num_train_epochs (float, optional, defaults to 3.0) – Total number of training epochs to perform.

max_steps (int, optional, defaults to -1) – If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs.

warmup_steps (int, optional, defaults to 0) – Number of steps used for a linear warmup from 0 to learning_rate.

logging_dir (str, optional) – Tensorboard log directory. Will default to runs/**CURRENT_DATETIME_HOSTNAME**.

logging_first_step (bool, optional, defaults to False) – Wheter to log and evalulate the first global_step or not.

logging_steps (int, optional, defaults to 500) – Number of update steps between two logs.

save_steps (int, optional, defaults to 500) – Number of updates steps before two checkpoint saves.

save_total_limit (int, optional) – If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir.

no_cuda (bool, optional, defaults to False) – Wherher to not use CUDA even when it is available or not.

seed (int, optional, defaults to 42) – Random seed for initialization.

fp16 (bool, optional, defaults to False) – Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.

fp16_opt_level (str, optional, defaults to ‘O1’) – For fp16 training, apex AMP optimization level selected in [‘O0’, ‘O1’, ‘O2’, and ‘O3’]. See details on the apex documentation.

local_rank (int, optional, defaults to -1) – During distributed training, the rank of the process.

tpu_num_cores (int, optional) – When training on TPU, the mumber of TPU cores (automatically passed by launcher script).

debug (bool, optional, defaults to False) – When training on TPU, whether to print debug metrics or not.

dataloader_drop_last (bool, optional, defaults to False) – Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not.

eval_steps (int, optional, defaults to 1000) – Number of update steps between two evaluations.

past_index (int, optional, defaults to -1) – Some models like TransformerXL or :doc`XLNet <../model_doc/xlnet>` can make use of the past hidden states for their predictions. If this argument is set to a positive int, the Trainer will use the corresponding output (usually index 2) as the past state and feed it to the model at the next training step under the keyword argument mems.del at the next training step under the keyword argument mems.

In [64]:
# Configuring the trainer
training_args = TrainingArguments(
   output_dir="test_trainer",
   #evaluation_strategy="epoch",
   per_device_train_batch_size=1,  # Reduce batch size here
   per_device_eval_batch_size=1,    # Optionally, reduce for evaluation as well
   gradient_accumulation_steps=4
   )


trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train_dataset,
   eval_dataset=tokenized_eval_dataset,
   compute_metrics=compute_metrics

)

In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss


## Model Evaluation

In [None]:
import evaluate

trainer.evaluate()