In [1]:
!pip install datasets
!pip install peft
!pip install evaluate
!pip install -U "huggingface_hub[cli]"
! pip install -U accelerate
! pip install -U transformers
! pip install -U bitsandbytes

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any

In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    BitsAndBytesConfig,
    Trainer,
    HfArgumentParser)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import bitsandbytes as bnb

In [3]:

!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) 
Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your termi

# dataset

In [4]:
# sst2
# The Stanford Sentiment Treebank consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence. It uses the two-way (positive/negative) class split, with only sentence-level labels.
# dataset = load_dataset('csv', data_dir='/sem.csv', split='train')
dataset = load_dataset("sudan94/SemEvalEmoji2018Binary")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/348k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.5k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'label', 'tweet'],
        num_rows: 3555
    })
    validation: Dataset({
        features: ['index', 'label', 'tweet'],
        num_rows: 444
    })
    test: Dataset({
        features: ['index', 'label', 'tweet'],
        num_rows: 445
    })
})

In [5]:
# display % of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.5037974683544304

# model

In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the from_pretrained method.

AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:

Instantiating one of AutoModel, AutoConfig and AutoTokenizer will directly create a class of the relevant architecture (ex: model = AutoModel.from_pretrained('bert-base-cased') will create a instance of BertModel).

In [6]:

model_checkpoint = 'distilbert-base-uncased'

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_use_double_quant=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_compute_dtype='int8',
)

# define label maps
id2label = {0: "non_ironic", 1: "irony"}
label2id = {"non_ironic":0, "irony":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id,
    config=bnb_config,
)



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# from peft import prepare_model_for_kbit_training

# #gradient checkpointing to save memory
# model.gradient_checkpointing_enable()

# model=prepare_model_for_kbit_training(
#     model, use_gradient_checkpointing=False
# )
# model.get_memory_footprint()
# print(model)

In [8]:
# display architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# preprocess data

Tokenization is a critical first step in preparing data for Large Language Models (LLMs) because these models don't understand raw text; they process numerical data. The tokenizer's role is to convert text into numbers that the model can understand.

In [9]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding_side="left")

# add pad token if none exists
# Pad Token (pad_token): In NLP, padding is used to ensure that all sequences (like sentences or paragraphs) are of the same length when feeding them into a model.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["tweet"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text, #actual text
        return_tensors="np", # return type numpy array
        truncation=True, #Indicates that truncation should be applied based on the specified parameters
        max_length=512 #Specifies the maximum length of the tokenized sequence
    )

    return tokenized_inputs

In [11]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/3555 [00:00<?, ? examples/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'label', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 3555
    })
    validation: Dataset({
        features: ['index', 'label', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 444
    })
    test: Dataset({
        features: ['index', 'label', 'tweet', 'input_ids', 'attention_mask'],
        num_rows: 445
    })
})

In [12]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# evaluation

In [13]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [14]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    accuray = accuracy.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    # print(accuracy)

    return {"accuray": accuray['accuracy'], "f1": f1['f1']}

    # return {"accuracy":accuracy.compute(predictions=predictions, references=labels), "f1":f1_metric.compute(predictions=predictions, references=labels,average="macro")}

# Apply untrained model to text

In [15]:
# define list of examples
text_list = dataset["test"]["tweet"]
ground_truth = dataset["test"]["label"]
# Initialize a list to store the table data
table_data = []
i = 0
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    # print(predictions.tolist())

     # Get the label string
    predicted_label = id2label[predictions.tolist()]
    # Append the text and predicted label to the table data
    table_data.append([text, predicted_label,predictions.tolist(), ground_truth[i]])
    i+=1

    # print(text + " - " + id2label[predictions.tolist()])


In [16]:
# print(table_data)
import pandas as pd
df = pd.DataFrame(table_data,columns =['tweet','predicted_class','predicted_label','actual_label'])
df.head(5)

Unnamed: 0,tweet,predicted_class,predicted_label,actual_label
0,@restlessduncan @SR_Duncan I accept him the wa...,non_ironic,0,0
1,Happy stfx day! Lost my #Xring in '12. It was ...,non_ironic,0,1
2,K. Michelle said Lil Kim is Plastic but 65% of...,non_ironic,0,1
3,doesn't convey what I want though. #twitterpr...,non_ironic,0,0
4,It's hard to take people seriously who can't s...,non_ironic,0,0


In [17]:
# from tabulate import tabulate
# print("Untrained model predictions:")
# print("----------------------------")
# print(tabulate(table_data, headers=["tweet", "predicted_class", "predicted_label" ,"actual_label"], tablefmt="grid"))

# Train model

In [18]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=64,
                        lora_alpha=4,
                        lora_dropout=0.1, # dropot rate for avoiding overfitting
                        target_modules = ['q_lin'])

In [19]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=64, target_modules={'q_lin'}, lora_alpha=4, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [20]:
# model.float()

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,181,954 || all params: 68,136,964 || trainable%: 1.7346737080918369


In [21]:
# hyperparameters
lr = 1e-3
# lr = 1e-4 #default learning rate
batch_size = 16
num_epochs = 10


In [24]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    logging_steps = 223,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model = 'eval_f1'
)

In [25]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuray,F1
1,0.5177,0.633775,0.702703,0.703065
2,0.4854,0.623043,0.677928,0.676565
3,0.4831,0.666784,0.675676,0.671622
4,0.4444,0.647159,0.68018,0.678718
5,0.4069,0.629248,0.691441,0.691766
6,0.3673,0.674328,0.691441,0.691014
7,0.3307,0.728363,0.70045,0.698523
8,0.2977,0.694826,0.716216,0.716585
9,0.2701,0.760003,0.707207,0.707207
10,0.2546,0.74305,0.704955,0.705217




TrainOutput(global_step=2230, training_loss=0.385775621696438, metrics={'train_runtime': 61.6384, 'train_samples_per_second': 576.751, 'train_steps_per_second': 36.179, 'total_flos': 477215879014248.0, 'train_loss': 0.385775621696438, 'epoch': 10.0})

# Generate prediction

In [26]:
best_checkpoint = trainer.state.best_model_checkpoint
best_checkpoint


'distilbert-base-uncased-lora-text-classification/checkpoint-1784'

In [27]:
trainer.state

TrainerState(epoch=10.0, global_step=2230, max_steps=2230, logging_steps=223, eval_steps=500, save_steps=500, train_batch_size=16, num_train_epochs=10, num_input_tokens_seen=0, total_flos=477215879014248.0, log_history=[{'loss': 0.5177, 'grad_norm': 1.4195507764816284, 'learning_rate': 0.0009000000000000001, 'epoch': 1.0, 'step': 223}, {'eval_loss': 0.6337747573852539, 'eval_accuray': 0.7027027027027027, 'eval_f1': 0.7030649070122755, 'eval_runtime': 0.3489, 'eval_samples_per_second': 1272.692, 'eval_steps_per_second': 80.26, 'epoch': 1.0, 'step': 223}, {'loss': 0.4854, 'grad_norm': 2.9216232299804688, 'learning_rate': 0.0008, 'epoch': 2.0, 'step': 446}, {'eval_loss': 0.6230432987213135, 'eval_accuray': 0.6779279279279279, 'eval_f1': 0.6765650113681835, 'eval_runtime': 0.3418, 'eval_samples_per_second': 1299.006, 'eval_steps_per_second': 81.919, 'epoch': 2.0, 'step': 446}, {'loss': 0.4831, 'grad_norm': 4.3689422607421875, 'learning_rate': 0.0007, 'epoch': 3.0, 'step': 669}, {'eval_loss

In [29]:
f = open("trainer-output.txt", "w")
f.write(str(trainer.state))
f.close()
trainer.state

TrainerState(epoch=10.0, global_step=2230, max_steps=2230, logging_steps=223, eval_steps=500, save_steps=500, train_batch_size=16, num_train_epochs=10, num_input_tokens_seen=0, total_flos=477215879014248.0, log_history=[{'loss': 0.5177, 'grad_norm': 1.4195507764816284, 'learning_rate': 0.0009000000000000001, 'epoch': 1.0, 'step': 223}, {'eval_loss': 0.6337747573852539, 'eval_accuray': 0.7027027027027027, 'eval_f1': 0.7030649070122755, 'eval_runtime': 0.3489, 'eval_samples_per_second': 1272.692, 'eval_steps_per_second': 80.26, 'epoch': 1.0, 'step': 223}, {'loss': 0.4854, 'grad_norm': 2.9216232299804688, 'learning_rate': 0.0008, 'epoch': 2.0, 'step': 446}, {'eval_loss': 0.6230432987213135, 'eval_accuray': 0.6779279279279279, 'eval_f1': 0.6765650113681835, 'eval_runtime': 0.3418, 'eval_samples_per_second': 1299.006, 'eval_steps_per_second': 81.919, 'epoch': 2.0, 'step': 446}, {'loss': 0.4831, 'grad_norm': 4.3689422607421875, 'learning_rate': 0.0007, 'epoch': 3.0, 'step': 669}, {'eval_loss

In [30]:
model.to('cpu')
# Initialize a list to store the table data
table_data = []
i=0
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    # Get the label string
    predicted_label = id2label[predictions.tolist()[0]]
    # Append the text and predicted label to the table data
    table_data.append((text, predicted_label,predictions.tolist()[0], ground_truth[i]))
    i+=1

    # print(text + " - " + id2label[predictions.tolist()[0]])
tuned_df = pd.DataFrame(table_data,columns =['tweet','predicted_class','predicted_label','actual_label'])
tuned_df.head(5)


Trained model predictions:
--------------------------


Unnamed: 0,tweet,predicted_class,predicted_label,actual_label
0,@restlessduncan @SR_Duncan I accept him the wa...,non_ironic,0,0
1,Happy stfx day! Lost my #Xring in '12. It was ...,irony,1,1
2,K. Michelle said Lil Kim is Plastic but 65% of...,irony,1,1
3,doesn't convey what I want though. #twitterpr...,irony,1,0
4,It's hard to take people seriously who can't s...,non_ironic,0,0


In [31]:
nottuned = "nottuned_result_distillbert_set1_lora.csv"
finetuned = "finetuned_result_distillbert_set1_lora.csv"

df.to_csv(nottuned,  encoding='utf-8')
tuned_df.to_csv(finetuned, encoding='utf-8')


In [None]:
# model.save_pretrained('fine_tuned_model')