<a href="https://www.kaggle.com/code/rohitgadhwar/spam-classification-using-transformers-bert?scriptVersionId=116187757" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Load Dataset

In [1]:
# Import Libraries
import pandas as pd
import numpy as np

In [2]:
# Load dataset
data = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
messages = list(data['v2'])
labels = list(data['v1'])

In [4]:
y = list(pd.get_dummies(labels, drop_first=True)['spam'])

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(messages, y, test_size=0.2, random_state=0)

## Setup Transformers

In [6]:
# Install transformers
! pip install -q transformers

[0m

In [7]:
import transformers
transformers.__version__

'4.20.1'

In [8]:
import tensorflow as tf
tf.__version__

'2.6.4'

In [9]:
# Tokenization
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [10]:
train_encodings = tokenizer(x_train,
                            truncation=True,
                            padding=True)

val_encodings = tokenizer(x_test,
                            truncation=True,
                            padding=True)

In [11]:
# convert to tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_test
))

2023-01-12 14:49:45.011531: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-12 14:49:45.112046: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-12 14:49:45.113208: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-12 14:49:45.116334: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

## Load Model

In [12]:
# Load Model
from transformers import TFDistilBertForSequenceClassification

model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2023-01-12 14:50:11.394737: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint 

## Train Model

In [13]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

# start training model
model.fit(train_dataset.shuffle(100).batch(16),
          epochs=3,
          validation_data=val_dataset.shuffle(100).batch(16))

Epoch 1/3


  return py_builtins.overload_of(f)(*args)
2023-01-12 14:50:24.166555: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f6db48b7b50>

## Evaluate model

In [14]:
model.evaluate(val_dataset.shuffle(100).batch(16))



[0.03439069911837578, 0.994618833065033]

## Save model

In [15]:
model.save_pretrained("./custom_model")

In [16]:
y_pred = []
for text in x_test:
    predict_input = tokenizer.encode(text,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")
    
    output = model.predict(predict_input)[0]
    predictions = tf.nn.softmax(output, axis=1).numpy()
    pred = np.argmax(predictions, axis=1)
    y_pred.append(pred)

In [17]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.9946188340807175
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       955
           1       0.96      1.00      0.98       160

    accuracy                           0.99      1115
   macro avg       0.98      1.00      0.99      1115
weighted avg       0.99      0.99      0.99      1115

