# Classify email as ham or spam based on the context 
Use dataset from kaggle - 
https://www.kaggle.com/datasets/shantanudhakadd/email-spam-detection-dataset-classification

In [25]:
import tensorflow_hub as hub 
import tensorflow_text as text 
import tensorflow as tf
import pandas as pd
import tf_keras

In [2]:
dataset_location = 'data/video47/spam.csv'
df = pd.read_csv(dataset_location, encoding='latin-1')

In [3]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5276,ham,Dunno leh cant remember mayb lor. So wat time ...,,,
809,ham,Ugh I don't wanna get out of bed. It's so warm.,,,
3197,ham,7 lor... Change 2 suntec... Wat time u coming?,,,
4728,ham,I've reached already.,,,
4819,spam,Check Out Choose Your Babe Videos @ sms.shsex....,,,


In [4]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [5]:
df.sample(5)

Unnamed: 0,v1,v2
4136,ham,Just got to &lt;#&gt;
2749,ham,You said not now. No problem. When you can. Le...
5156,ham,"Sir, I need Velusamy sir's date of birth and c..."
1451,ham,Ya! when are Ì_ taking ure practical lessons? ...
1696,ham,"Sorry man, my stash ran dry last night and I c..."


In [6]:
# change the column names to Category and Message
df.rename(columns={"v1" : "Category", "v2" : "Message"}, inplace=True)

In [7]:
df.sample(5)

Unnamed: 0,Category,Message
5159,ham,No but the bluray player can
5000,ham,"Nope. Since ayo travelled, he has forgotten hi..."
1153,spam,1000's of girls many local 2 u who r virgins 2...
448,ham,LOL ... Have you made plans for new years?
3644,spam,wamma get laid?want real doggin locations sent...


In [8]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [9]:
# Dataset is imbalanced 
df['spam'] = df['Category'].apply(lambda x : 1 if x == "spam" else 0)

In [10]:
df['spam'].value_counts()

spam
0    4825
1     747
Name: count, dtype: int64

# Test train split using stratify 

In [11]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2, stratify=df.spam)

In [12]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4457,), (1115,), (4457,), (1115,))

In [13]:
y_train.value_counts()

spam
0    3859
1     598
Name: count, dtype: int64

In [14]:
y_test.value_counts()

spam
0    966
1    149
Name: count, dtype: int64

### Using BERT for text classification 

In [15]:
preprocess_url = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
encoder_url = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-12-h-768-a-12/2"

In [16]:
bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

2025-06-24 17:28:44.008215: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-06-24 17:28:44.008237: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-06-24 17:28:44.008242: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
I0000 00:00:1750766324.008254  211618 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1750766324.008272  211618 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-06-24 17:28:44.651135: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [17]:
def get_sentence_embedding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

In [18]:
# test the function above 
get_sentence_embedding([
    "Hurry up! Offer limited", 
    "Wake up kid, we have school to go"
])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[ 0.08966986, -0.33500803,  0.94159234, ..., -0.7063119 ,
         0.17244992,  0.99913216],
       [ 0.30929735,  0.22819705,  0.4455274 , ..., -0.92773646,
         0.310882  ,  0.9950636 ]], dtype=float32)>

In [19]:
# Lets generate some more random examples and check the consine similarity for each 
e = get_sentence_embedding([
    'banana',
    'apple', 
    'orange',
    'muskmelon'
    'Jeff Bezos', 
    'Bill Gates', 
    'Elon Musk', 
    'Satya Nadela'
])

In [20]:
e

<tf.Tensor: shape=(7, 768), dtype=float32, numpy=
array([[ 0.267327  , -0.19118272,  0.08356124, ..., -0.61833537,
         0.0033634 ,  0.99978393],
       [ 0.25694746, -0.02905385,  0.7929306 , ..., -0.20954815,
         0.3358058 ,  0.99979335],
       [ 0.04899611, -0.0223359 ,  0.8952886 , ..., -0.6640402 ,
         0.06350433,  0.9991442 ],
       ...,
       [-0.196526  , -0.14199083,  0.9962983 , ..., -0.9222706 ,
         0.34902596,  0.9665003 ],
       [-0.11955259, -0.55131435,  0.9718946 , ..., -0.8813574 ,
         0.66398305,  0.97707385],
       [-0.26774007, -0.33428589,  0.92906237, ..., -0.95895755,
         0.8161329 ,  0.9751665 ]], dtype=float32)>

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# compare banana with apple
cosine_similarity([e[0]], [e[1]])

array([[0.7681571]], dtype=float32)

If value of consine_similarity is near to 1 - means they are very similar 

In [22]:
# compare apple with Satya Nadela
cosine_similarity([e[1]], [e[6]])

array([[0.3494817]], dtype=float32)

In [23]:
# consine similarity between muskmelon and Elon Musk 
cosine_similarity([e[3]], [e[5]])

array([[0.74682283]], dtype=float32)

``` 
Looks like Elon Musk is similar to muskmelon :) 

### Build model

In [26]:
# We will create functional model for this 
text_input = tf_keras.layers.Input(shape=(), dtype=tf.string, name="text")

# create embedding vectors for text layer 
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# create a drop out layer and provide 'outputs' as input to that layers
dropout = tf_keras.layers.Dropout(0.1, name="Dropout") (outputs['pooled_output']) 

# create one dense output layer with activation "sigmoid" as this is completely binary classification task 
output_layer = tf_keras.layers.Dense(1, activation="sigmoid", name="output") (dropout)

# Build the functional model 
model = tf_keras.Model(inputs=[text_input], outputs=[output_layer])

# create summary 
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 keras_layer (KerasLayer)    {'input_type_ids': (None,    0         ['text[0][0]']                
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_word_ids': (None,                                           
                              128)}                                                               
                                                                                              

In [27]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [29]:
model.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x3a70dda80>

In [30]:
model.evaluate(X_test, y_test)



[0.08545655757188797, 0.9739910364151001]

In [31]:
# save this model 
model.save('output/video47/spam_identifier.keras')