<a href="https://colab.research.google.com/github/preetamjumech/LLM/blob/main/NLP_with_LLMs_Fine_tuning_Models_for_Language_Translation%2C_%26_Summarization_01_10_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample example:

In [None]:
#!pip install datasets
import pandas as pd
from datasets import load_dataset
dataset = load_dataset("mteb/tweet_sentiment_extraction")
df = pd.DataFrame(dataset["train"])
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative


In [None]:
df.shape

(27481, 4)

In [None]:
df.isna().sum()

Unnamed: 0,0
id,0
text,0
label,0
label_text,0


In [None]:
# !pip install transformers
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
  return tokenizer(examples["text"], padding = "max_length", truncation = True)

tokenized_dataset = dataset.map(tokenize_function, batched = True)

tokenized_dataset



DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 3534
    })
})

In [None]:
tokenized_df = pd.DataFrame(tokenized_dataset["train"])
tokenized_df.head()

Unnamed: 0,id,text,label,label_text,input_ids,attention_mask
0,cb774db0d1,"I`d have responded, if I were going",1,neutral,"[314, 63, 67, 423, 7082, 11, 611, 314, 547, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative,"[1406, 2238, 311, 2885, 314, 481, 2051, 345, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
2,088c60f138,my boss is bullying me...,0,negative,"[1820, 6478, 318, 20714, 502, 986, 50256, 5025...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,9642c003ef,what interview! leave me alone,0,negative,"[644, 2720, 0, 2666, 502, 3436, 50256, 50256, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative,"[27989, 286, 25998, 11, 1521, 3521, 63, 83, 48...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(10))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(10))

In [None]:
small_train_dataset

Dataset({
    features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [None]:
from transformers import GPT2ForSequenceClassification
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels = 3)
model

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

In [None]:
# !pip install evaluate
import evaluate
import numpy as np
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions  = np.argmax(logits, axis = -1)
  return metric.compute(predictions = predictions, references = labels)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
   output_dir="test_trainer",
   #evaluation_strategy="epoch",
   per_device_train_batch_size=1,  # Reduce batch size here
   per_device_eval_batch_size=1,    # Optionally, reduce for evaluation as well
   gradient_accumulation_steps=4
   )


trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=small_train_dataset,
   eval_dataset=small_eval_dataset,
   compute_metrics=compute_metrics,

)

trainer.train()


Step,Training Loss


TrainOutput(global_step=6, training_loss=3.0331859588623047, metrics={'train_runtime': 478.0487, 'train_samples_per_second': 0.063, 'train_steps_per_second': 0.013, 'total_flos': 12542357274624.0, 'train_loss': 3.0331859588623047, 'epoch': 2.4})

In [None]:
import evaluate

trainer.evaluate()
# metric = evaluate.load("accuracy")
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)
# compute_metrics(trainer.predict(small_eval_dataset))

{'eval_loss': 2.5287108421325684,
 'eval_accuracy': 0.1,
 'eval_runtime': 57.3922,
 'eval_samples_per_second': 0.174,
 'eval_steps_per_second': 0.174,
 'epoch': 2.4}

In [None]:
output = trainer.predict(small_eval_dataset)

# Unpack the outputs
logits = output.predictions
labels = output.label_ids

# Now you can compute predictions
predictions = np.argmax(logits, axis=-1)

# Call compute_metrics function
metrics = compute_metrics((logits, labels))
metrics

{'accuracy': 0.1}

In [None]:
logits

array([[ 1.8999815 , -1.4035559 ,  1.943276  ],
       [ 3.1610389 , -2.614882  ,  1.3763287 ],
       [ 1.489253  , -0.51882946,  0.23901868],
       [ 2.2315078 , -2.2309866 ,  3.0570273 ],
       [ 1.999481  , -1.347069  ,  1.431882  ],
       [ 1.5112588 , -1.5235393 ,  1.5932722 ],
       [ 1.7315803 , -1.3584174 ,  1.5358338 ],
       [ 1.9536246 , -1.3123722 ,  1.4064628 ],
       [ 1.5912011 , -2.4191923 ,  2.6964874 ],
       [ 2.7110806 , -1.9383863 ,  1.5845839 ]], dtype=float32)

In [None]:
labels

array([0, 1, 2, 0, 0, 1, 1, 2, 1, 2])

In [None]:
small_train_dataset["label"]

[1, 2, 1, 2, 1, 1, 0, 2, 2, 1]