<a href="https://colab.research.google.com/github/soyebganja/DL-Practice-Projects/blob/main/13%3ATransformers/13_11_Hugging_Face_Spam_Classification_Using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# from datasets import load_dataset

import pandas as pd
import numpy as np


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [14]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
df.Category = df.Category.map({'ham':0, 'spam':1})
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
df.Category.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
0,4825
1,747


In [17]:
df_spam = df[df.Category==1]
df_spam.shape

(747, 2)

In [18]:
df_ham = df[df.Category==0].sample(1000)
df_ham.shape

(1000, 2)

In [19]:
df_small = pd.concat([df_spam, df_ham])
df_small.Category.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
0,1000
1,747


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_small.Message, df_small.Category, test_size=0.2, random_state=5)

In [21]:
y_train.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
0,789
1,608


In [22]:
y_test.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
0,211
1,139


In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text, labels):
  encodings = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
  return encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels, dtype=torch.float)

tokenize_text(["Hurry up, click here", "I will see you tommorrow"], [1, 0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

(tensor([[  101,  9241,  2039,  1010, 11562,  2182,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [24]:
X_train.values.tolist()[:3]

['You can stop further club tones by replying "STOP MIX" See my-tone.com/enjoy. html for terms. Club tones cost GBP4.50/week. MFL, PO Box 1146 MK45 2WT (2/3)',
 "Prabha..i'm soryda..realy..frm heart i'm sory",
 'Babe ! How goes that day ? What are you doing ? Where are you ? I sip my cappuccino and think of you, my love ... I send a kiss to you from across the sea']

In [25]:
y_train.head(3)

Unnamed: 0,Category
3642,1
5226,0
2181,0


In [26]:
train_input_ids, train_attention_mask, train_labels = tokenize_text(X_train.tolist(), y_train.tolist())
test_input_ids, test_attention_mask, test_labels = tokenize_text(X_test.tolist(), y_test.tolist())

In [27]:
train_labels[:2]

tensor([1., 0.])

In [28]:
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_mask, test_labels)

In [29]:
batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [30]:
bert = BertModel.from_pretrained('bert-base-uncased')
bert.config.hidden_size

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

768

In [31]:
class SentimentClassifier(nn.Module):
  def __init__(self):
    super().__init__()
    self.bert = bert

    for param in self.bert.parameters():
      param.requires_grad = False # Freeze all BERT layers

    self.classifier = nn.Sequential(
        nn.Linear(bert.config.hidden_size, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, 1),
        nn.Sigmoid()
    )

  def forward(self, input_ids, attention_mask):
    bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    last_hidden_state = bert_outputs.last_hidden_state[:, 0, :]
    return self.classifier(last_hidden_state)

In [32]:
model = SentimentClassifier()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

model = model.to(device)
criterion = criterion.to(device)

In [33]:
epochs = 2

for epoch in range(epochs):
  model.train()
  total_training_loss = 0

  for batch, (input_ids, attention_mask, labels) in enumerate(train_loader):
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask)
    loss = criterion(outputs.squeeze(), labels)
    loss.backward()
    optimizer.step()
    print(f'Epoch: {epoch+1}/{epochs}, Batch: {batch+1}/{len(train_loader)}, Loss: {loss.item()}')
    total_training_loss += loss.item()

  ave_train_loss = total_training_loss / len(train_loader)
  print(f'Epoch: {epoch+1}/{epochs}, Training Loss: {ave_train_loss}')

Epoch: 1/2, Batch: 1/22, Loss: 0.7086018919944763
Epoch: 1/2, Batch: 2/22, Loss: 0.6557713150978088
Epoch: 1/2, Batch: 3/22, Loss: 0.5744476914405823
Epoch: 1/2, Batch: 4/22, Loss: 0.507362961769104
Epoch: 1/2, Batch: 5/22, Loss: 0.4714023470878601
Epoch: 1/2, Batch: 6/22, Loss: 0.4023903012275696
Epoch: 1/2, Batch: 7/22, Loss: 0.3170091509819031
Epoch: 1/2, Batch: 8/22, Loss: 0.3400247395038605
Epoch: 1/2, Batch: 9/22, Loss: 0.2636844515800476
Epoch: 1/2, Batch: 10/22, Loss: 0.2647849917411804
Epoch: 1/2, Batch: 11/22, Loss: 0.21989504992961884
Epoch: 1/2, Batch: 12/22, Loss: 0.20415765047073364
Epoch: 1/2, Batch: 13/22, Loss: 0.21066822111606598
Epoch: 1/2, Batch: 14/22, Loss: 0.20252670347690582
Epoch: 1/2, Batch: 15/22, Loss: 0.13651296496391296
Epoch: 1/2, Batch: 16/22, Loss: 0.10260770469903946
Epoch: 1/2, Batch: 17/22, Loss: 0.12761786580085754
Epoch: 1/2, Batch: 18/22, Loss: 0.12248926609754562
Epoch: 1/2, Batch: 19/22, Loss: 0.08608849346637726
Epoch: 1/2, Batch: 20/22, Loss: 

In [36]:
model.eval()
total_val_loss = 0
current_predictions = 0

with torch.no_grad():
  for input_ids, attention_mask, labels in test_loader:
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    outputs = model(input_ids, attention_mask)
    loss = criterion(outputs.squeeze(), labels)

    total_val_loss += loss.item()
    current_predictions += torch.round(outputs.squeeze()).eq(labels).sum().item()

ave_val_loss = total_val_loss / len(test_loader)
val_accuracy = current_predictions / len(test_dataset)
print(f'Validation Loss: {ave_val_loss}, Validation Accuracy: {val_accuracy}')

Validation Loss: 0.0801349204654495, Validation Accuracy: 0.9714285714285714


In [41]:
def predict(model, text, max_length=128):
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  # Tokenizer input Text
  encoded_text = tokenizer(
      text,
      padding='max_length',
      truncation=True,
      max_length=max_length,
      return_tensors='pt'
  )

  input_ids = encoded_text['input_ids'].to(device)
  attention_mask = encoded_text['attention_mask'].to(device)

  model = model.to(device)
  model.eval()

  with torch.no_grad():
    outputs = model(input_ids, attention_mask)
    prediction = torch.round(outputs.squeeze()).item()

  return 'spam' if prediction == 1 else 'ham'


In [42]:
predict(model, "Hurry up, click here")

'spam'

In [43]:
predict(model, "This is your last chance to win the cash, click this link")

'spam'

In [44]:
predict(model, "Dear Shoaib, I hope to see you on Monday")

'ham'

In [45]:
predict(model, "Free entry in 2 week camp to win FA final tickets 21 may 2025")

'spam'