In [None]:
!pip install datasets
!pip install peft
!pip install evaluate
!pip install -U "huggingface_hub[cli]"
! pip install -U accelerate
! pip install -U transformers
! pip install -U bitsandbytes
! pip install accelerate

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

In [None]:
import bitsandbytes as bnb
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, DatasetDict, Dataset
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    BitsAndBytesConfig,
    Trainer)

In [None]:
# Authenticate against the Hugging Face Hub via the CLI (asks for an access token interactively).
!huggingface-cli login

# dataset

In [None]:
# Load the "sudan94/SemEvalEmoji2018" dataset from the Hugging Face Hub.
# NOTE(review): the previous comment here described SST-2 (Stanford Sentiment Treebank),
# which is not what this cell loads. Later cells read the "tweet" and "label" columns and
# the "train" / "validation" / "test" splits — confirm these against the dataset card.
# Disabled local-CSV alternative:
# dataset = load_dataset('csv', data_dir='/sem.csv', split='train')
dataset = load_dataset("sudan94/SemEvalEmoji2018")
dataset

In [None]:
# display the fraction of training data with label == 1
# The task is multi-class (the model below is built with num_labels=4), so summing the raw
# label ids and dividing by the row count would yield the mean label id rather than the
# share of class 1. Comparing against 1 first gives the intended proportion (and is
# identical to the old result when labels are strictly 0/1).
train_labels = np.asarray(dataset['train']['label'])
(train_labels == 1).mean()

# model

In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the from_pretrained method.

AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:

Instantiating one of `AutoModel`, `AutoConfig` and `AutoTokenizer` will directly create a class of the relevant architecture (e.g. `model = AutoModel.from_pretrained('bert-base-cased')` will create an instance of `BertModel`).

In [None]:
model_checkpoint = 'distilbert-base-uncased'

# 4-bit NF4 quantization settings handed to bitsandbytes.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # The field is `bnb_4bit_use_double_quant`. The earlier spelling
    # (`load_4bit_use_double_quant`) is not a BitsAndBytesConfig argument, so it was
    # swallowed as an unused kwarg and double quantization was never switched on.
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# define label maps; label2id is derived from id2label so the two can never drift apart
id2label = {0: "non_ironic", 1: "verbal_irony_polarity_contrast", 2: "situational_irony", 3: "verbal_irony"}
label2id = {label_name: label_id for label_id, label_name in id2label.items()}

# generate classification model from model_checkpoint
# - `load_in_8bit=False` was dropped: quantization is fully described by `quantization_config`.
# - `trust_remote_code=True` was dropped: this checkpoint uses an architecture that ships
#   with transformers, so there is no reason to allow execution of repository code.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.float32,
    quantization_config=bnb_config,
)

In [None]:
# display architecture: the bare expression renders the model object's repr as the cell output
model

# preprocess data

Tokenization is a critical first step in preparing data for Large Language Models (LLMs) because these models don't understand raw text; they process numerical data. The tokenizer's role is to convert text into numbers that the model can understand.

In [None]:
# create tokenizer
# Pad on the right. DistilBertForSequenceClassification classifies from the hidden state
# at sequence position 0; with left padding, every padded example in a batch would have
# a [PAD] token at that position instead of [CLS], so training batches would not match
# the unpadded single-example inference done elsewhere in this notebook.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding_side="right")

# add pad token if none exists
# Pad Token (pad_token): In NLP, padding is used to ensure that all sequences (like sentences or paragraphs) are of the same length when feeding them into a model.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # the embedding matrix must grow to cover the newly added token id
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# create tokenize function
def tokenize_function(examples):
    """Tokenize the ``"tweet"`` entry of a batch of examples.

    Switches the shared module-level ``tokenizer`` to left-side truncation and then
    encodes the tweets, truncating to at most 512 tokens and asking for NumPy output.

    Args:
        examples: Mapping whose ``"tweet"`` entry holds the text to encode.

    Returns:
        The object returned by calling ``tokenizer`` on the tweets.
    """
    tweets = examples["tweet"]

    # Side effect: this changes the truncation side of the shared tokenizer object.
    tokenizer.truncation_side = "left"

    encode_options = {
        "return_tensors": "np",  # NumPy arrays rather than Python lists
        "truncation": True,      # cut sequences that exceed max_length
        "max_length": 512,       # upper bound on the tokenized length
    }
    return tokenizer(tweets, **encode_options)

In [None]:
# Run tokenize_function over the dataset in batches and show the result.
# NOTE(review): `dataset` is presumably a DatasetDict, in which case `.map` processes every
# split (not only training and validation) — confirm.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

In [None]:
# create data collator: batches are padded at collation time using this tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# evaluation

In [None]:
# load the accuracy and F1 metrics from the `evaluate` library (both are used by compute_metrics)
accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

In [None]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class scores with
            the classes along axis 1; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": <float>, "f1": <float>}`` where F1 uses weighted averaging.
    """
    predictions, labels = p
    # pick the highest-scoring class for every example
    predicted_ids = np.argmax(predictions, axis=1)

    accuracy_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1_metric.compute(predictions=predicted_ids, references=labels, average='weighted')

    # The key used to be misspelled "accuray", which surfaced in the training logs as
    # "eval_accuray"; report it under the correct name.
    return {"accuracy": accuracy_result['accuracy'], "f1": f1_result['f1']}

# Apply untrained model to text

In [None]:
# define list of examples
text_list = dataset["test"]["tweet"]
ground_truth = dataset["test"]["label"]

# Send inputs to wherever the model's weights live; a quantized model is typically placed
# on the GPU, and feeding it CPU tensors would fail with a device mismatch.
device = next(model.parameters()).device
model.eval()  # make sure dropout is off for inference

# Initialize a list to store the table data
table_data = []
with torch.no_grad():  # inference only: skip autograd bookkeeping for every example
    for text, actual_label in zip(text_list, ground_truth):
        # tokenize text (truncate so over-long inputs cannot exceed the model's 512 positions)
        inputs = tokenizer.encode(
            text, return_tensors="pt", truncation=True, max_length=512
        ).to(device)
        # compute logits
        logits = model(inputs).logits
        # convert logits to a class id (batch of one, so a single scalar comes back)
        predicted_id = logits.argmax(dim=-1).item()
        # Append the text, predicted label string, predicted id and ground truth
        table_data.append([text, id2label[predicted_id], predicted_id, actual_label])


In [None]:
# Tabulate the untrained model's predictions next to the ground-truth labels.
# pandas (`pd`) is imported with the other dependencies in the import cell at the top of
# the notebook instead of in the middle of the analysis.
df = pd.DataFrame(table_data, columns=['tweet', 'predicted_class', 'predicted_label', 'actual_label'])
df.head(5)

In [None]:
# from tabulate import tabulate
# print("Untrained model predictions:")
# print("----------------------------")
# print(tabulate(table_data, headers=["tweet", "predicted_class", "predicted_label" ,"actual_label"], tablefmt="grid"))

# Train model

In [None]:
# LoRA adapter configuration for a sequence-classification task.
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=64,  # rank of the low-rank update matrices
                        lora_alpha=16,  # LoRA scaling factor
                        lora_dropout=0.1, # dropout rate on the LoRA layers, to reduce overfitting
                        target_modules = ['q_lin'])  # adapt only modules named "q_lin" (presumably DistilBERT's query projections — confirm)

In [None]:
# Display the LoRA configuration object as the cell output.
peft_config

In [None]:
# Wrap the model with the LoRA adapters described by peft_config and report the trainable parameter count.
# NOTE: this rebinds `model`, so re-running the cell passes the already-wrapped model to get_peft_model again.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# hyperparameters (passed to TrainingArguments)
lr = 1e-3  # learning rate
# lr = 1e-4 #default learning rate
batch_size = 16  # used as both the per-device train and eval batch size
num_epochs = 10  # number of training epochs


In [None]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",  # directory name derived from the checkpoint name
    logging_steps = 231,  # NOTE(review): presumably the number of steps in one epoch at batch_size=16 — confirm
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # NOTE(review): newer transformers releases name this `eval_strategy` — check the installed version
    save_strategy="epoch",  # same cadence as evaluation
    load_best_model_at_end=True,
    metric_for_best_model = 'eval_f1'  # the "f1" value returned by compute_metrics, with the "eval_" prefix
)

In [None]:
# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],  # the "test" split is not passed to the trainer
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # called with (predictions, labels) at each evaluation
)

# train model (training and construction share a cell, so re-running it rebuilds the Trainer)
trainer.train()

# Generate prediction

In [None]:
# Read the best checkpoint recorded in the trainer state (ranked by metric_for_best_model) and display it.
best_checkpoint = trainer.state.best_model_checkpoint
best_checkpoint


In [None]:
# Display the full trainer state object for inspection.
trainer.state

In [None]:
# Reload the best checkpoint from disk as a separate model object and display it.
# NOTE(review): none of the later cells visible here use `best_model`; the prediction loop
# calls `model`. Confirm whether this reload is still needed.
best_model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint, num_labels=4, id2label=id2label, label2id=label2id)
best_model
# Disabled variant that loads a hard-coded checkpoint directory:
# test = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-lora-text-classification/checkpoint-2079', num_labels=4, id2label=id2label, label2id=label2id)
# test

In [None]:
# Display the LoRA-wrapped model that was passed to the Trainer.
model


In [None]:
# Score the test tweets with the fine-tuned model.
# The model is left on its current device and the inputs are moved to it, rather than
# forcing the model onto the CPU: the base model was loaded with 4-bit bitsandbytes
# quantization, and relocating such a model with `.to(...)` is not supported by
# transformers. NOTE(review): confirm CPU inference is not a hard requirement here.
device = next(model.parameters()).device
model.eval()  # make sure dropout is off for inference

# Initialize a list to store the table data
table_data = []
print("Trained model predictions:")
print("--------------------------")
with torch.no_grad():  # inference only: skip autograd bookkeeping for every example
    for text, actual_label in zip(text_list, ground_truth):
        # truncate so over-long inputs cannot exceed the model's 512 positions
        inputs = tokenizer.encode(
            text, return_tensors="pt", truncation=True, max_length=512
        ).to(device)

        logits = model(inputs).logits
        # batch of one, so the arg-max over the class axis is a single scalar
        predicted_id = logits.argmax(dim=-1).item()

        # Append the text, predicted label string, predicted id and ground truth
        table_data.append((text, id2label[predicted_id], predicted_id, actual_label))

tuned_df = pd.DataFrame(table_data, columns=['tweet', 'predicted_class', 'predicted_label', 'actual_label'])
tuned_df.head(5)


In [None]:
# Write both prediction tables (before and after fine-tuning) to UTF-8 CSV files.
nottuned = "nottuned_result_distillbert_set1_lora.csv"
finetuned = "finetuned_result_distillbert_set1_lora.csv"

for frame, csv_path in ((df, nottuned), (tuned_df, finetuned)):
    frame.to_csv(csv_path, encoding='utf-8')


In [None]:
# model.save_pretrained('fine_tuned_model')