In [25]:
!pip install datasets
!pip install peft
!pip install evaluate
!pip install -U "huggingface_hub[cli]"
! pip install -U accelerate
! pip install -U transformers
! pip install -U bitsandbytes



In [26]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    BitsAndBytesConfig,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import bitsandbytes as bnb

In [27]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) 
Token is valid (permission: read).


# dataset

In [28]:
# sst2
# The Stanford Sentiment Treebank consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence. It uses the two-way (positive/negative) class split, with only sentence-level labels.
# dataset = load_dataset('csv', data_dir='/sem.csv', split='train')
dataset = load_dataset("sudan94/SemEvalEmoji2018Binary")
dataset

DatasetDict({
    train: Dataset({
        features: ['index', 'label', 'tweet'],
        num_rows: 3555
    })
    validation: Dataset({
        features: ['index', 'label', 'tweet'],
        num_rows: 444
    })
    test: Dataset({
        features: ['index', 'label', 'tweet'],
        num_rows: 445
    })
})

In [29]:
# display % of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.5037974683544304

# model

In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the from_pretrained method.

AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:

Instantiating one of AutoModel, AutoConfig and AutoTokenizer will directly create a class of the relevant architecture (ex: model = AutoModel.from_pretrained('bert-base-cased') will create a instance of BertModel).

In [30]:

model_checkpoint = 'distilbert-base-uncased'

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_use_double_quant=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_compute_dtype='int8',
)

# define label maps
id2label = {0: "non_ironic", 1: "irony"}
label2id = {"non_ironic":0, "irony":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id,
    config=bnb_config,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# display architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# preprocess data

Tokenization is a critical first step in preparing data for Large Language Models (LLMs) because these models don't understand raw text; they process numerical data. The tokenizer's role is to convert text into numbers that the model can understand.

In [32]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding_side="left")

# add pad token if none exists
# Pad Token (pad_token): In NLP, padding is used to ensure that all sequences (like sentences or paragraphs) are of the same length when feeding them into a model.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [33]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["tweet"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text, #actual text
        return_tensors="np", # return type numpy array
        truncation=True, #Indicates that truncation should be applied based on the specified parameters
        max_length=512 #Specifies the maximum length of the tokenized sequence
    )

    return tokenized_inputs

In [34]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'label', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 3555
    })
    validation: Dataset({
        features: ['index', 'label', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 444
    })
    test: Dataset({
        features: ['index', 'label', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 445
    })
})

In [35]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# evaluation

In [36]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

In [37]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    accuray = accuracy.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    # print(accuracy)

    return {"accuray": accuray['accuracy'], "f1": f1['f1']}

    # return {"accuracy":accuracy.compute(predictions=predictions, references=labels), "f1":f1_metric.compute(predictions=predictions, references=labels,average="macro")}

# Apply untrained model to text

In [38]:
# define list of examples
text_list = dataset["test"]["tweet"]
ground_truth = dataset["test"]["label"]
# Initialize a list to store the table data
table_data = []
i = 0
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    # print(predictions.tolist())

     # Get the label string
    predicted_label = id2label[predictions.tolist()]
    # Append the text and predicted label to the table data
    table_data.append([text, predicted_label,predictions.tolist(), ground_truth[i]])
    i+=1

    # print(text + " - " + id2label[predictions.tolist()])


In [39]:
# print(table_data)
import pandas as pd
df = pd.DataFrame(table_data,columns =['tweet','predicted_class','predicted_label','actual_label'])
df.head(5)

Unnamed: 0,tweet,predicted_class,predicted_label,actual_label
0,@restlessduncan @SR_Duncan I accept him the wa...,non_ironic,0,0
1,Happy stfx day! Lost my #Xring in '12. It was ...,non_ironic,0,1
2,K. Michelle said Lil Kim is Plastic but 65% of...,irony,1,1
3,doesn't convey what I want though. #twitterpr...,irony,1,0
4,It's hard to take people seriously who can't s...,irony,1,0


In [40]:
# from tabulate import tabulate
# print("Untrained model predictions:")
# print("----------------------------")
# print(tabulate(table_data, headers=["tweet", "predicted_class", "predicted_label" ,"actual_label"], tablefmt="grid"))

# Train model

In [41]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=64,
                        lora_alpha=16,
                        lora_dropout=0.5, # dropot rate for avoiding overfitting
                        target_modules = ['q_lin'])

In [42]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=64, target_modules={'q_lin'}, lora_alpha=16, lora_dropout=0.5, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [43]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,181,954 || all params: 68,136,964 || trainable%: 1.7346737080918369


In [44]:
# hyperparameters
lr = 4e-5
# lr = 1e-4 #default learning rate
batch_size = 32
num_epochs = 10


In [47]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    logging_steps = 112,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model = 'eval_f1'
)

In [48]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuray,F1
1,0.6571,0.648264,0.599099,0.596022
2,0.649,0.650178,0.599099,0.581903
3,0.6349,0.642617,0.601351,0.59211
4,0.6278,0.630283,0.623874,0.623353
5,0.6216,0.626756,0.637387,0.637444
6,0.6127,0.630869,0.628378,0.624312
7,0.6077,0.625574,0.646396,0.645547
8,0.6061,0.625194,0.644144,0.64319
9,0.6003,0.623082,0.646396,0.645907
10,0.6017,0.624045,0.644144,0.64319




TrainOutput(global_step=1120, training_loss=0.6218933922903879, metrics={'train_runtime': 132.8089, 'train_samples_per_second': 267.678, 'train_steps_per_second': 8.433, 'total_flos': 514736496007272.0, 'train_loss': 0.6218933922903879, 'epoch': 10.0})

# Generate prediction

In [49]:
f = open("trainer-output.txt", "w")
f.write(str(trainer.state))
f.close()
trainer.state

TrainerState(epoch=10.0, global_step=1120, max_steps=1120, logging_steps=112, eval_steps=500, save_steps=500, train_batch_size=32, num_train_epochs=10, num_input_tokens_seen=0, total_flos=514736496007272.0, log_history=[{'loss': 0.6571, 'grad_norm': 2.676650047302246, 'learning_rate': 3.6e-05, 'epoch': 1.0, 'step': 112}, {'eval_loss': 0.6482644081115723, 'eval_accuray': 0.5990990990990991, 'eval_f1': 0.5960218387989945, 'eval_runtime': 0.8312, 'eval_samples_per_second': 534.162, 'eval_steps_per_second': 16.843, 'epoch': 1.0, 'step': 112}, {'loss': 0.649, 'grad_norm': 2.6213972568511963, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.0, 'step': 224}, {'eval_loss': 0.650177538394928, 'eval_accuray': 0.5990990990990991, 'eval_f1': 0.5819032661423965, 'eval_runtime': 0.8463, 'eval_samples_per_second': 524.647, 'eval_steps_per_second': 16.543, 'epoch': 2.0, 'step': 224}, {'loss': 0.6349, 'grad_norm': 3.2090821266174316, 'learning_rate': 2.8e-05, 'epoch': 3.0, 'step': 336}, {'eval_loss'

In [50]:
model.to('cpu')
# Initialize a list to store the table data
table_data = []
i=0
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    # Get the label string
    predicted_label = id2label[predictions.tolist()[0]]
    # Append the text and predicted label to the table data
    table_data.append((text, predicted_label,predictions.tolist()[0], ground_truth[i]))
    i+=1

    # print(text + " - " + id2label[predictions.tolist()[0]])
tuned_df = pd.DataFrame(table_data,columns =['tweet','predicted_class','predicted_label','actual_label'])
tuned_df.head(5)


Trained model predictions:
--------------------------


Unnamed: 0,tweet,predicted_class,predicted_label,actual_label
0,@restlessduncan @SR_Duncan I accept him the wa...,non_ironic,0,0
1,Happy stfx day! Lost my #Xring in '12. It was ...,non_ironic,0,1
2,K. Michelle said Lil Kim is Plastic but 65% of...,irony,1,1
3,doesn't convey what I want though. #twitterpr...,irony,1,0
4,It's hard to take people seriously who can't s...,irony,1,0


In [51]:
nottuned = "nottuned_result_distillbert_set2_qlora.csv"
finetuned = "finetuned_result_distillbert_set2_qlora.csv"

df.to_csv(nottuned,  encoding='utf-8')
tuned_df.to_csv(finetuned, encoding='utf-8')


In [None]:
# model.save_pretrained('fine_tuned_model')