In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
from peft import AutoPeftModelForSequenceClassification, LoraConfig, TaskType, get_peft_model

In [2]:
# Load the "train" split of the sms_spam dataset and divide it 80/20 into
# train/test partitions; the fixed seed keeps the shuffle reproducible.
data = (
    load_dataset("sms_spam", split = "train")
    .train_test_split(test_size = 0.2, shuffle = True, seed = 21)
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
# (rows, columns) for each of the two splits produced above.
data.shape

{'train': (4459, 2), 'test': (1115, 2)}

In [4]:
# Names of the dataset splits that are tokenised further down.
splits = ["train", "test"]

In [5]:
# Peek at one raw record: a dict with the message text ("sms") and its integer "label".
data["train"][0]

{'sms': 'URGENT!! Your 4* Costa Del Sol Holiday or £5000 await collection. Call 09050090044 Now toClaim. SAE, TC s, POBox334, Stockport, SK38xh, Cost£1.50/pm, Max10mins\n',
 'label': 1}

In [6]:
# Tokenizer for the same checkpoint ("distilbert-base-uncased") that the
# classification model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [7]:
# Will hold the tokenised version of each split, keyed by split name.
tokenized_data = {}

In [8]:
# Tokenise the "sms" column of every split in batched mode. Truncation is
# enabled; no padding is applied at this stage.
for split in splits:
    untokenized = data[split]
    tokenized_data[split] = untokenized.map(
        lambda batch : tokenizer(batch["sms"], truncation = True),
        batched = True,
    )

In [9]:
# Inspect the tokenised training split (the tokenizer output columns are added
# alongside the original "sms" and "label" columns).
tokenized_data["train"]

Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 4459
})

In [10]:
# Human-readable names for the two classes, and the inverse lookup.
id2label = {0 : "Not Spam", 1 : "Spam"}
label2id = {name : idx for idx, name in id2label.items()}

# DistilBERT with a two-way sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels = len(id2label),
    id2label = id2label,
    label2id = label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Mark every parameter of the model as trainable. The trailing semicolon keeps
# the notebook from echoing the returned module.
model.requires_grad_(True);

In [12]:
# Display the module tree of the classifier (useful for finding layer names).
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
def compute_metrics(pred):
    """Compute classification accuracy for a (logits, labels) pair.

    Args:
        pred: An object that unpacks into two items: the per-class scores
            (one row per example) and the reference label ids.

    Returns:
        dict: ``{"ACCURACY": fraction of examples whose highest-scoring class
        equals the reference label}``.
    """
    scores, gold = pred
    # The highest-scoring column of each row is the predicted class id.
    predicted_ids = np.argmax(scores, axis = 1)
    hits = predicted_ids == gold
    return {"ACCURACY" : hits.mean()}

In [14]:
# Hyper-parameters for the run on the plain DistilBERT classifier: evaluate and
# checkpoint once per epoch and reload the best checkpoint when training ends.
baseline_args = TrainingArguments(
    output_dir = "./data/spam_not_spam",
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    num_train_epochs = 1,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)

# The tokenised datasets were not padded, so the collator pads each batch.
trainer = Trainer(
    model = model,
    args = baseline_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["test"],
    tokenizer = tokenizer,
    data_collator = DataCollatorWithPadding(tokenizer = tokenizer),
    compute_metrics = compute_metrics,
)

In [15]:
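# Full fine-tuning is left disabled here, so the evaluation in the next cell
# scores the model with its freshly initialised classification head (baseline).
# trainer.train()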

In [16]:
# Score the model on the test split. Because trainer.train() is commented out,
# this is the accuracy before any fine-tuning.
trainer.evaluate()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.6125432252883911,
 'eval_ACCURACY': 0.8708520179372198,
 'eval_runtime': 3.1488,
 'eval_samples_per_second': 354.108,
 'eval_steps_per_second': 11.115}

In [17]:
# LoRA adapter settings for the sequence-classification task.
# target_modules names only the attention projections that actually exist in
# this DistilBERT model (q_lin / k_lin / v_lin, see the printed architecture).
# T5-style names such as 'q', 'k', 'v' match no module here, so they are not
# listed: they would add nothing and only obscure which layers get adapters.
lora_config = LoraConfig(
    task_type = TaskType.SEQ_CLS,
    inference_mode = False,   # the adapters are going to be trained
    r = 16,
    target_modules = ['q_lin', 'k_lin', 'v_lin'],
    lora_alpha = 32,
    lora_dropout = 0.05,
)

In [18]:
# Wrap the classifier with the LoRA adapters described by lora_config.
# NOTE(review): PEFT documents get_peft_model as modifying the base model in
# place, so `model` (and the `trainer` that holds it) presumably carries the
# adapters after this cell — confirm before re-running the baseline evaluation.
lora_model = get_peft_model(model, lora_config)

In [19]:
# Report how many parameters are trainable compared with the total.
lora_model.print_trainable_parameters()

trainable params: 1,626,628 || all params: 67,989,508 || trainable%: 2.3924691439155583


In [20]:
# Hyper-parameters for training the LoRA-wrapped model: two epochs, with
# evaluation and checkpointing once per epoch and the best checkpoint reloaded
# when training ends.
lora_args = TrainingArguments(
    output_dir = "./lora_model",
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    num_train_epochs = 2,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)

# The tokenised datasets were not padded, so the collator pads each batch.
lora_trainer = Trainer(
    model = lora_model,
    args = lora_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["test"],
    tokenizer = tokenizer,
    data_collator = DataCollatorWithPadding(tokenizer = tokenizer),
    compute_metrics = compute_metrics,
)

In [21]:
# Train the LoRA-wrapped model for the number of epochs configured in lora_trainer.
lora_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.204528,0.870852
2,No log,0.114275,0.96861


Checkpoint destination directory ./lora_model/checkpoint-140 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=280, training_loss=0.2507209777832031, metrics={'train_runtime': 50.4012, 'train_samples_per_second': 176.94, 'train_steps_per_second': 5.555, 'total_flos': 179898355937160.0, 'train_loss': 0.2507209777832031, 'epoch': 2.0})

In [22]:
# Score the trained LoRA model on the test split.
lora_trainer.evaluate()

{'eval_loss': 0.11427542567253113,
 'eval_ACCURACY': 0.968609865470852,
 'eval_runtime': 2.4266,
 'eval_samples_per_second': 459.495,
 'eval_steps_per_second': 14.424,
 'epoch': 2.0}

In [23]:
# Save the PEFT model into "lora_model" (the same directory the trainer uses
# as its output_dir for checkpoints).
lora_model.save_pretrained("lora_model")

In [24]:
# Reload the saved PEFT model from disk for inference, again with a two-class head.
adapter_dir = "lora_model"
best_model = AutoPeftModelForSequenceClassification.from_pretrained(adapter_dir, num_labels = 2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# First 50 rows of the tokenised test split, used for a quick look at predictions.
tests_ = tokenized_data["test"].select(range(50))

In [26]:
# Predict a class id for every sampled message with the reloaded model.
#
# Changes from the per-message loop this replaces:
#   * messages go through the model in mini-batches instead of one forward
#     pass per message;
#   * argmax is taken over the class dimension. A bare `argmax()` flattens the
#     (batch, num_labels) logits and is only correct for a batch of one;
#   * tokenisation uses the same settings as training/evaluation
#     (`truncation=True` with the tokenizer's default maximum length) rather
#     than a separate `max_length=128`;
#   * the model is explicitly switched to eval mode and the inputs are moved
#     to the device the model lives on.
PREDICT_BATCH_SIZE = 32

best_model.eval()
model_device = next(best_model.parameters()).device

sample_texts = list(tests_["sms"])
preds = []

for start in range(0, len(sample_texts), PREDICT_BATCH_SIZE):

    text_batch = sample_texts[start : start + PREDICT_BATCH_SIZE]

    # Padding is required here because several messages share one tensor.
    inputs_ = tokenizer(text_batch, truncation=True, padding=True, return_tensors="pt").to(model_device)

    with torch.no_grad():
        logits = best_model(**inputs_).logits

    # One predicted class id (a plain int) per message in the batch.
    preds.extend(logits.argmax(dim=-1).tolist())
    

In [27]:
# Side-by-side table of each sampled message, its true label and the model's prediction.
df = pd.DataFrame({
    "sms": tests_["sms"],
    "actual_class" : tests_["label"],
    "predicted_class": preds,
})

In [28]:
# Show actual vs. predicted class for the sampled messages.
df

Unnamed: 0,sms,actual_class,predicted_class
0,Pls dont forget to study\n,0,0
1,Ok. Me watching tv too.\n,0,0
2,"Short But Cute: ""Be a good person, but dont tr...",0,0
3,"Lets use it next week, princess :)\n",0,0
4,Gud mrng dear have a nice day\n,0,0
5,My friend just got here and says he's upping h...,0,0
6,"Sir Goodmorning, Once free call me.\n",0,0
7,"Yeah I am, so I'll leave maybe 7ish?\n",0,0
8,So what did the bank say about the money?\n,0,0
9,"Got hella gas money, want to go on a grand nat...",0,0
