<a href="https://colab.research.google.com/github/preetamjumech/LLM/blob/main/LLM_Fine_tuning_on_custom_binary_classification_using_Distilbert_23_10_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate accelerate



In [2]:
import pandas as pd

# Tab-separated file; column names are supplied explicitly via `names`.
messages = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t',
                       names=["label", "message"])
# Keep a 1% subsample for this demo. The fixed seed makes the sampled rows
# (and every downstream output) reproducible across kernel restarts, and
# sampling from `messages` leaves the full raw frame intact.
df = messages.sample(frac=0.01, random_state=42)
df.head()

Unnamed: 0,label,message
5393,ham,"All done, all handed in. Don't know if mega sh..."
1695,ham,Ü eatin later but i'm eatin wif my frens now l...
1707,ham,Yes! I am a one woman man! Please tell me your...
3604,ham,"I'm not sure, I was just checking out what was..."
4648,ham,God created gap btwn ur fingers so dat sum1 vr...


In [23]:
# Encode the string classes as integers (ham -> 0, spam -> 1) and rebind `df`
# to the frame carrying the integer `label` column.
df = df.assign(
    label=df["label"].replace({"ham": 0, "spam": 1}).astype(int)
)
df.sample(10)

Unnamed: 0,label,message
1060,1,EASTENDERS TV Quiz. What FLOWER does DOT compa...
2340,0,Cheers for the message Zogtorius. Ive been st...
4120,0,Hiya do u like the hlday pics looked horrible ...
2024,0,Is there any movie theatre i can go to and wat...
1765,1,Hi 07734396839 IBH Customer Loyalty Offer: The...
2423,0,A bloo bloo bloo I'll miss the first bowl
720,0,Macha dont feel upset.i can assume your mindse...
2615,0,"Sir, hope your day is going smoothly. i really..."
1955,0,Good night. Am going to sleep.
2974,0,Happy New Year Princess!


In [24]:
from datasets import Dataset
# Wrap the pandas frame in a Hugging Face Dataset; the frame's index is carried
# over as an extra `__index_level_0__` column (visible in the repr below).
ds = Dataset.from_pandas(df)
ds


Dataset({
    features: ['label', 'message', '__index_level_0__'],
    num_rows: 56
})

In [25]:
# Drop the leftover pandas index column so only `label` and `message` remain.
ds = ds.remove_columns(["__index_level_0__"])
ds

Dataset({
    features: ['label', 'message'],
    num_rows: 56
})

In [26]:
# Hold out 20% of the rows for evaluation. Seeding the shuffle keeps the
# train/test membership stable across reruns so reported metrics are comparable.
ds = ds.train_test_split(test_size=0.2, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'message'],
        num_rows: 44
    })
    test: Dataset({
        features: ['label', 'message'],
        num_rows: 12
    })
})

In [27]:
# Spot-check a single training record (a dict with `label` and `message`).
ds["train"][5]

{'label': 0, 'message': "Sorry, I'll call you  later. I am in meeting sir."}

In [28]:
from transformers import AutoTokenizer
# Load the tokenizer published with the `distilbert-base-uncased` checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



In [29]:
def preprocess_function(examples):
  """Tokenize the ``message`` field of ``examples`` with truncation enabled.

  Returns whatever the notebook-level ``tokenizer`` produces for those texts.
  """
  texts = examples["message"]
  return tokenizer(texts, truncation=True)

In [30]:
# Tokenize one sentence to inspect the raw encoder inputs (ids + attention mask).
tokenizer("Worst thing to ever happen out in Gta history")

{'input_ids': [101, 5409, 2518, 2000, 2412, 4148, 2041, 1999, 14181, 2050, 2381, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [31]:
# Number of entries in the tokenizer's vocabulary.
tokenizer.vocab_size

30522

In [32]:
import numpy as np

# Apply the tokenizer to both splits; batched=True hands the function a batch of
# rows at a time. The new `input_ids` / `attention_mask` columns are added
# alongside the original ones (see the repr below).
tokenized_ds = ds.map(preprocess_function, batched=True)
tokenized_ds

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'message', 'input_ids', 'attention_mask'],
        num_rows: 44
    })
    test: Dataset({
        features: ['label', 'message', 'input_ids', 'attention_mask'],
        num_rows: 12
    })
})

In [33]:
# Raw text of training record 11, for comparison with its tokenized form.
ds["train"][11]["message"]


"A bloo bloo bloo I'll miss the first bowl"

In [34]:
# Record 11 of the tokenized training split: original fields plus token ids and mask.
tokenized_ds["train"][11]

{'label': 0,
 'message': "A bloo bloo bloo I'll miss the first bowl",
 'input_ids': [101,
  1037,
  1038,
  4135,
  2080,
  1038,
  4135,
  2080,
  1038,
  4135,
  2080,
  1045,
  1005,
  2222,
  3335,
  1996,
  2034,
  4605,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [35]:
# Decode the ids back to text to see the added [CLS]/[SEP] markers and lower-casing.
tokenizer.decode(tokenized_ds["train"][11]["input_ids"])

"[CLS] a bloo bloo bloo i'll miss the first bowl [SEP]"

In [36]:
from transformers import DataCollatorWithPadding
# Padding collator handed to `prepare_tf_dataset` as `collate_fn`;
# return_tensors="tf" requests TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [16]:
import evaluate
# Load the `accuracy` metric; `compute_metrics` calls its `.compute(...)`.
accuracy = evaluate.load("accuracy")

In [37]:
def compute_metrics(eval_pred):
  """Score a ``(predictions, labels)`` pair with the ``accuracy`` metric.

  ``predictions`` holds one row of per-class scores per example; the index of
  the highest score in each row is taken as the predicted class.
  """
  scores, references = eval_pred
  predicted_classes = np.argmax(scores, axis=1)
  return accuracy.compute(predictions=predicted_classes, references=references)


In [38]:
# Class-index -> class-name lookup passed to the model when it is created.
id2label = {0: "ham", 1: "spam"}
# Inverse lookup, derived from id2label so the two tables cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

In [39]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 2

# Whole batches per epoch (floor division ignores a trailing partial batch).
batches_per_epoch = len(tokenized_ds["train"]) // batch_size
# Total optimizer steps across all epochs; this sizes the learning-rate schedule.
total_train_steps = batches_per_epoch * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [40]:
from transformers import TFAutoModelForSequenceClassification
# Load DistilBERT with a 2-class sequence-classification head and attach the
# label maps. The classifier weights are newly initialized (see the log below),
# so the model has to be trained before its predictions mean anything.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [41]:
# Build tf.data pipelines from the tokenized splits. Reusing `batch_size` (rather
# than repeating the literal 16) keeps batching in step with the step count that
# was used to size the learning-rate schedule.
tf_train_set = model.prepare_tf_dataset(
    tokenized_ds["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# Evaluation data is left unshuffled so predictions stay in dataset order.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_ds["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [42]:
# No `loss` is passed here. NOTE(review): presumably the model falls back to its
# built-in task loss — confirm in the Transformers TensorFlow documentation.
model.compile(optimizer=optimizer)

In [43]:
from transformers.keras_callbacks import KerasMetricCallback
# Keras callback wired to score `tf_validation_set` with `compute_metrics`.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [44]:
# Train for `num_epochs` — the same value that sized the learning-rate
# schedule — so the schedule length and the training length stay in sync.
model.fit(
    x=tf_train_set,
    validation_data=tf_validation_set,
    epochs=num_epochs,
    callbacks=[metric_callback],
)

Epoch 1/2
Epoch 2/2


<tf_keras.src.callbacks.History at 0x795616e85d80>

In [45]:
# Save the trained TensorFlow model and its tokenizer to the same local
# directory so both can be reloaded together with `from_pretrained`.
model.save_pretrained("my_classification_model")
tokenizer.save_pretrained('my_classification_model')

('my_classification_model/tokenizer_config.json',
 'my_classification_model/special_tokens_map.json',
 'my_classification_model/vocab.txt',
 'my_classification_model/added_tokens.json',
 'my_classification_model/tokenizer.json')

In [46]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
# Reload model and tokenizer from the saved directory, replacing the in-memory
# objects, to check that the saved artifacts are usable on their own.
model = TFAutoModelForSequenceClassification.from_pretrained('my_classification_model')
tokenizer = AutoTokenizer.from_pretrained('my_classification_model')

Some layers from the model checkpoint at my_classification_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at my_classification_model and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# NOTE(review): `pipeline` is imported but not used in this cell, and the
# tokenizer was already loaded from this directory in the previous cell.
from transformers import pipeline
# Encode one example sentence as TensorFlow tensors for a manual forward pass.
text = "This is awful. I get that profit-wise it was less than expected due to a huge budget"
tokenizer = AutoTokenizer.from_pretrained("my_classification_model")
inputs = tokenizer(text, return_tensors="tf")
inputs


{'input_ids': <tf.Tensor: shape=(1, 22), dtype=int32, numpy=
array([[ 101, 2023, 2003, 9643, 1012, 1045, 2131, 2008, 5618, 1011, 7968,
        2009, 2001, 2625, 2084, 3517, 2349, 2000, 1037, 4121, 5166,  102]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 22), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
      dtype=int32)>}

In [48]:
from transformers import TFAutoModelForSequenceClassification
# Load the saved model again and run a forward pass on the encoded sentence;
# the returned object exposes the per-class `logits`.
model = TFAutoModelForSequenceClassification.from_pretrained("my_classification_model")
outputs = model(**inputs)
outputs

Some layers from the model checkpoint at my_classification_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at my_classification_model and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 0.1539846 , -0.02860738]], dtype=float32)>, hidden_states=None, attentions=None)

In [49]:
# Index of the largest logit in the first (and only) row is the predicted class id.
predicted_class_id = int(tf.math.argmax(outputs.logits, axis=-1)[0])
predicted_class_id

0

In [50]:
# Translate the class id into its label name via the model config.
model.config.id2label[predicted_class_id]

'ham'

In [51]:
from sklearn.metrics import confusion_matrix, classification_report

# Get predictions for the validation set
predictions = model.predict(tf_validation_set)
predicted_labels = np.argmax(predictions.logits, axis=1)

# Extract true labels (the validation pipeline is unshuffled, so a second pass
# yields them in the same order as the predictions)
true_labels = np.concatenate([y for x, y in tf_validation_set], axis=0)

# Pin the class ids so the matrix is always 2x2 and the report always has both
# rows, even when the small test split happens to contain a single class
# (otherwise `target_names` would not match and classification_report raises).
class_ids = [0, 1]

# Compute confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
print("Confusion Matrix:\n", conf_matrix)

# Classification report for precision, recall, and F1 score.
# zero_division=0 reports 0.0 for a class that was never predicted instead of
# emitting an undefined-metric warning for each affected score.
report = classification_report(
    true_labels,
    predicted_labels,
    labels=class_ids,
    target_names=["ham", "spam"],
    zero_division=0,
)
print("Classification Report:\n", report)

Confusion Matrix:
 [[10  0]
 [ 2  0]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.83      1.00      0.91        10
        spam       0.00      0.00      0.00         2

    accuracy                           0.83        12
   macro avg       0.42      0.50      0.45        12
weighted avg       0.69      0.83      0.76        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
# Prediction on new data
new_data = ["This is a new message.", "Another spam example!"]
# The tokenizer already returns TensorFlow tensors (return_tensors="tf"), so no
# further tensor conversion is needed before calling the model.
new_encodings = tokenizer(new_data, truncation=True, padding=True, return_tensors="tf")
new_inputs = dict(new_encodings)
new_outputs = model(**new_inputs)

# Highest-logit column per row -> class id -> label name from the model config.
predicted_class_ids = tf.argmax(new_outputs.logits, axis=-1).numpy()
predicted_labels = [model.config.id2label[int(id_)] for id_ in predicted_class_ids]

# The loop variable is named `message` so the notebook-level `text` defined in
# the single-sentence inference cell is not overwritten.
for message, label in zip(new_data, predicted_labels):
    print(f"Message: '{message}' - Predicted Label: {label}")

Message: 'This is a new message.' - Predicted Label: ham
Message: 'Another spam example!' - Predicted Label: ham
