<a href="https://www.kaggle.com/code/pranavjha24/spam-classification-bert-handling-imbalance-data?scriptVersionId=189786617" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sms-spam-collection-dataset/spam.csv
/kaggle/input/bert/keras/bert_base_en_uncased/2/config.json
/kaggle/input/bert/keras/bert_base_en_uncased/2/tokenizer.json
/kaggle/input/bert/keras/bert_base_en_uncased/2/metadata.json
/kaggle/input/bert/keras/bert_base_en_uncased/2/model.weights.h5
/kaggle/input/bert/keras/bert_base_en_uncased/2/assets/tokenizer/vocabulary.txt


 # Spam Classification using BERT with Handling Imbalanced Data

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import keras_nlp

2024-07-25 22:08:34.435471: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 22:08:34.435590: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 22:08:34.580838: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Import the dataset

In [3]:
import pandas as pd

# Load the SMS spam collection; the CSV is decoded as ISO-8859-1 (Latin-1).
SPAM_CSV_PATH = "/kaggle/input/sms-spam-collection-dataset/spam.csv"
df = pd.read_csv(SPAM_CSV_PATH, encoding="ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the label and text columns by discarding the three unnamed columns,
# which show only missing values in the preview above.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df1 = df.drop(columns=unused_columns)
df1.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Give the raw v1/v2 columns descriptive names.
column_names = {'v1': 'Category', 'v2': 'Message'}
df2 = df1.rename(columns=column_names)
df2.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Per-category summary of the Message column (count, unique, top, freq).
df2.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [7]:
# Number of messages in each category.
df2['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
# Spam-to-ham ratio, derived from the data rather than from hand-copied counts,
# so it stays correct if the dataset or the filtering above changes.
category_counts = df2['Category'].value_counts()
category_counts['spam'] / category_counts['ham']

0.15481865284974095

**About 13% of the messages are spam and 87% are ham (747 vs. 4,825, i.e. a spam-to-ham ratio of about 0.15): this indicates class imbalance**

In [9]:
# Select the rows labelled as spam.
is_spam = df2['Category'].eq('spam')
df_spam = df2.loc[is_spam]
df_spam.shape

(747, 2)

In [10]:
# Select the rows labelled as ham.
is_ham = df2['Category'].eq('ham')
df_ham = df2.loc[is_ham]
df_ham.shape

(4825, 2)

In [11]:
# Downsample the majority (ham) class to the size of the spam class.
# A fixed random_state makes the sample identical on every
# Restart & Run All, so downstream results are reproducible.
df_ham_downsampled = df_ham.sample(n=len(df_spam), random_state=42)
df_ham_downsampled.shape

(747, 2)

In [12]:
# Stack the downsampled ham rows on top of all spam rows to form the balanced set.
balanced_parts = [df_ham_downsampled, df_spam]
df_balanced = pd.concat(balanced_parts)
df_balanced.shape

(1494, 2)

In [13]:
# Check that both categories now have the same number of rows.
df_balanced['Category'].value_counts()

Category
ham     747
spam    747
Name: count, dtype: int64

In [14]:
# Binary target column: 1 where Category is 'spam', 0 otherwise.
df_balanced['spam'] = df_balanced['Category'].eq('spam').astype('int64')
df_balanced.sample(5)

Unnamed: 0,Category,Message,spam
138,spam,You'll not rcv any more msgs from the chat svc...,1
2088,spam,Well done ENGLAND! Get the official poly ringt...,1
5377,spam,The current leading bid is 151. To pause this ...,1
3640,spam,You can stop further club tones by replying \S...,1
5518,ham,"By the way, i've put a skip right outside the ...",0


## Split the Data into Training and Test Sets

In [15]:
from sklearn.model_selection import train_test_split

# Stratify on the target so the train and test sets keep the balanced class mix.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [16]:
# Peek at the first four training messages.
X_train.head(4)

4132    FreeMsg Today's the day if you are ready! I'm ...
5292    Urgent! Please call 09061213237 from landline....
879     U have a Secret Admirer who is looking 2 make ...
929     money!!! you r a lucky winner ! 2 claim your p...
Name: Message, dtype: object

# Import the BERT model and obtain embedding vectors
### Now let's import the BERT model and get embedding vectors for a few sample statements

In [17]:
# TF Hub handles for the uncased English BERT preprocessing model and the matching
# BERT-base encoder (L-12, H-768, A-12).
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [18]:
def get_sentence_embeding(sentences):
    """Return the BERT pooled-output embedding for each input sentence.

    The sentences are first passed through ``bert_preprocess``; the result is
    fed to ``bert_encoder`` and only the ``'pooled_output'`` entry of the
    encoder's output is returned.

    Args:
        sentences: Batch of raw strings (the notebook passes a list of str).

    Returns:
        The encoder's ``'pooled_output'`` value. With the TF Hub BERT-base
        encoder loaded in this notebook, the calls display as a float32
        tensor with one 768-wide row per sentence.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

# Embed one spam-like and one ordinary sentence as a quick smoke test.
sample_sentences = [
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
]
get_sentence_embeding(sample_sentences)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8435169 , -0.5132726 , -0.88845724, ..., -0.7474888 ,
        -0.75314724,  0.91964495],
       [-0.8720837 , -0.50543994, -0.94446695, ..., -0.8584751 ,
        -0.7174536 ,  0.88083   ]], dtype=float32)>

## Embedding vectors
### Get embedding vectors for a few sample words and compare them using cosine similarity

In [19]:
# Embed three fruit names (rows 0-2) and three people's names (rows 3-5)
# for the cosine-similarity comparisons that follow.
sample_phrases = ["banana", "grapes", "mango", "jeff bezos", "elon musk", "bill gates"]
e = get_sentence_embeding(sample_phrases)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
# "banana" (row 0) vs. "grapes" (row 1)
cosine_similarity([e[0]],[e[1]])

array([[0.9911089]], dtype=float32)

Values close to 1 indicate high similarity, while values close to 0 indicate that the two vectors have little in common. For example, comparing the words "banana" and "grapes" yields a similarity score of about 0.99: both are fruits and therefore share similar contextual meanings.

In [21]:
# "banana" (row 0) vs. "jeff bezos" (row 3)
cosine_similarity([e[0]],[e[3]])

array([[0.8470389]], dtype=float32)

Comparing "banana" with "Jeff Bezos" might yield a similarity score of 0.84, which is still relatively high but not as close as the 0.99 similarity score obtained with "grapes". This reflects that while "banana" and "Jeff Bezos" have some contextual overlap, "banana" and "grapes" are much more closely related in terms of their contextual meanings.

In [22]:
# "jeff bezos" (row 3) vs. "elon musk" (row 4)
cosine_similarity([e[3]],[e[4]])

array([[0.9872035]], dtype=float32)

# Model Building

There are two types of models you can build in TensorFlow:

1. **Sequential**
2. **Functional**

A Sequential model is a plain linear stack of layers, while a Functional model is built by explicitly connecting layers from the inputs to the outputs. Below, we will build a Functional model. More information on these two types of models can be found [here](https://becominghuman.ai/sequential-vs-functional-model-in-keras-20684f766057).


In [23]:
# Build a binary spam classifier with the Functional API:
# raw string -> BERT preprocessor -> BERT backbone -> dropout -> single sigmoid unit.
# Note: this cell uses the keras_nlp preset models, not the TF Hub
# bert_preprocess / bert_encoder layers loaded earlier in the notebook.

# Define input layer: one raw string per example (shape=() with dtype tf.string)
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

# Define BERT preprocessor and encoder from keras_nlp
# NOTE(review): trainable=True is passed to the preprocessor, not to the encoder —
# confirm this is the intended target.
# NOTE(review): tf.keras layers are combined with keras_nlp models here — confirm both
# resolve to the same Keras version in this environment.
preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_base_en_uncased", trainable=True)
encoder = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")

# Preprocess text input (raw strings -> the inputs the encoder is called with)
encoder_inputs = preprocessor(text_input)

# Get BERT outputs and keep only the "pooled_output" entry
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]

# Neural network layers: dropout with rate 0.1, then one sigmoid unit as the classifier output
dropout = tf.keras.layers.Dropout(0.1, name="dropout")(pooled_output)
dense = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(dropout)

# Construct the final model: raw text in, sigmoid output out
model = tf.keras.Model(inputs=text_input, outputs=dense)

# Compile the model (optional, depending on your use case)
# model.compile is called again later in the notebook with precision/recall metrics.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


Attaching 'model.safetensors' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'model.safetensors.index.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'preprocessor.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.txt' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'model.safetensors' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...

In [24]:
# Number of training examples.
len(X_train)

1120

In [25]:
# Track precision and recall alongside accuracy.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Re-compile with the extended metric set.
# The string 'adam' selects Adam with its default learning rate (1e-3). That is far
# too large for fine-tuning a pre-trained BERT backbone whose weights are trainable
# (it is not frozen anywhere in this notebook) and typically wipes out the pre-trained
# representation. Use a learning rate in the range recommended for BERT fine-tuning.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='binary_crossentropy',
    metrics=METRICS,
)

## Model Training

In [26]:
# model.fit(X_train, y_train, epochs=10)

In [27]:
# model.evaluate(X_test, y_test)

In [28]:
# y_predicted = model.predict(X_test)
# y_predicted = y_predicted.flatten()

In [29]:
# import numpy as np

# y_predicted = np.where(y_predicted > 0.5, 1, 0)
# y_predicted

In [30]:
# from sklearn.metrics import confusion_matrix, classification_report

# cm = confusion_matrix(y_test, y_predicted)
# cm 

In [31]:
# from matplotlib import pyplot as plt
# import seaborn as sn
# sn.heatmap(cm, annot=True, fmt='d')
# plt.xlabel('Predicted')
# plt.ylabel('Truth')

In [32]:
# print(classification_report(y_test, y_predicted))

# Inference

In [33]:
# reviews = [
#     'Enter a chance to win $5000, hurry up, offer valid until march 31, 2021',
#     'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p p£3.99',
#     'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
#     'Hey Sam, Are you coming for a cricket game tomorrow',
#     "Why don't you wait 'til at least wednesday to see if you get your ."
# ]
# model.predict(reviews)