In [106]:
!pip install tensorflow_text



In [107]:
!pip install tensorflow_hub



In [108]:
!pip install tensorflow



In [110]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

## creating dataset

In [111]:
df = pd.read_csv("/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [112]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [113]:
 df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [2]:
747/(4825+747)
#13% spam emails, 85% ham emails: This indicates class imbalance
#15% are spam emails there's some imbalance
#we can use dwonsamplling technique to balance the dataset. in here we pick random 747 data from 4825
#this might be not good when it comes to training cause we decrese,loosing the training data here
#there are other approaches like smort , oversampling (oversample the minority class)

0.13406317300789664

In [115]:
#create seperate dataframes for ham and spam emails
df_spam = df[df['Category']=='spam']
df_spam.shape

(747, 2)

In [116]:
df_ham = df[df['Category']=='ham']
df_ham.shape

(4825, 2)

In [117]:
df_ham.sample(747)

Unnamed: 0,Category,Message
3071,ham,I'm now but have to wait till 2 for the bus to...
2906,ham,Ha. You don‘t know either. I did a a clever bu...
1595,ham,Never blame a day in ur life. Good days give u...
5531,ham,Compliments to you. Was away from the system. ...
1115,ham,No no:)this is kallis home ground.amla home to...
...,...,...
5084,ham,Hey happy birthday...
3627,ham,Yeah right! I'll bring my tape measure fri!
5324,ham,"Dear Sir,Salam Alaikkum.Pride and Pleasure mee..."
4526,ham,Cos i was out shopping wif darren jus now n i ...


In [118]:
df_ham.sample(df_spam.shape[0])

Unnamed: 0,Category,Message
2425,ham,Oh k k:)but he is not a big hitter.anyway good
5509,ham,Lol they were mad at first but then they woke ...
4031,ham,"Cool, I'll text you in a few"
1785,ham,"Dont search love, let love find U. Thats why i..."
2949,ham,Nope but i'll b going 2 sch on fri quite early...
...,...,...
1939,ham,Excellent! Are you ready to moan and scream in...
1145,ham,Really... I tot ur paper ended long ago... But...
399,ham,"Good evening Sir, Al Salam Wahleykkum.sharing ..."
1986,ham,The length is e same but e top shorter n i got...


In [119]:
#downsamplling the ham dtaframe (clean emails,not spam)
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [120]:
#connect 2 dataframes
df_balanced = pd.concat([df_ham_downsampled, df_spam])
df_balanced.shape

(1494, 2)

In [121]:
df_balanced['Category'].value_counts()
#this might not good cause we are loosing valuble training dataset
#this fis or learn BERT vector embeddings and classfication

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,747
spam,747


In [122]:
df_balanced.sample(5)

Unnamed: 0,Category,Message
2208,ham,Usually the body takes care of it buy making s...
4296,spam,HMV BONUS SPECIAL 500 pounds of genuine HMV vo...
2420,spam,SMS SERVICES For your inclusive text credits p...
106,ham,Thanks a lot for your wishes on my birthday. T...
1136,ham,K do I need a login or anything


In [123]:
#create new spam column if spam=1 not sapm=0
df_balanced['spam']=df_balanced['Category'].apply(lambda x: 1 if x=='spam' else 0)
df_balanced.sample(5)

Unnamed: 0,Category,Message,spam
734,ham,Even u dont get in trouble while convincing..j...,0
2619,ham,Hey pple...$700 or $900 for 5 nights...Excelle...,0
1122,spam,Do you want 750 anytime any network mins 150 t...,1
312,spam,Think ur smart ? Win £200 this week in our wee...,1
3864,spam,Oh my god! I've found your number again! I'm s...,1


## Split it into training and test data set

In [124]:
from sklearn.model_selection import train_test_split

In [125]:
X_train, X_test, y_train, y_test = train_test_split(df_balanced['Message'],df_balanced['spam'], stratify=df_balanced['spam'])

In [126]:
X_train.head()

Unnamed: 0,Message
68,"Did you hear about the new ""Divorce Barbie""? I..."
1227,Reply with your name and address and YOU WILL ...
1944,I got lousy sleep. I kept waking up every 2 ho...
4407,As one of our registered subscribers u can ent...
4574,URGENT! This is the 2nd attempt to contact U!U...


## import BERT model and get embeding vectors for few sample statements

In [127]:
#get trained models from TF Hub for preprocess and encode
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [128]:
def get_sentence_embedding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embedding([
    "500$ discount. hurry up",
    "kasun, are you intresetd in winnig new iphone 16?"]
)
#enoding for the sentences are return as outputs generates from above BERT models

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8435169 , -0.5132726 , -0.8884571 , ..., -0.74748844,
        -0.7531473 ,  0.91964495],
       [-0.7464012 , -0.35622522, -0.9066411 , ..., -0.8302861 ,
        -0.60930175,  0.76641387]], dtype=float32)>

### Get embeding vectors for few sample words. Compare them using cosine similarity

In [129]:
e = get_sentence_embedding([
    "phone",
    "camera",
    "grapes",
    "mango",
    "cash price",
    "money",
    "jeff bezos",
    "elon musk",
    "bill gates"
]
)
#check cosine similarity for some word/embeddings and find out benifit of having BERT encoding

In [130]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([e[0]],[e[1]])
#from cosine simillarity we can identify how simillar two vectors are. if those 2 vectors points to same direction or not

array([[0.89605117]], dtype=float32)

Values near to 1 means they are similar. 0 means they are very different.
Above you can use comparing "phone" vs "camera" you get 0.89 similarity as they both are electronics

In [131]:
cosine_similarity([e[2]],[e[3]])

array([[0.98507446]], dtype=float32)

Above you can use comparing "mango" vs "grapes" you get 0.98 similarity as they both are fruits

In [132]:
cosine_similarity([e[4]],[e[5]])

array([[0.9225594]], dtype=float32)

Above you can use comparing "cash price" vs "money" you get 0.92 similarity as they both are same valuble items

In [133]:
cosine_similarity([e[2]],[e[6]])

array([[0.87739766]], dtype=float32)

Comparing grapes with jeff bezos you still get 0.87 but it is not as close as 0.98 that we got with mangos

In [134]:
cosine_similarity([e[6]],[e[7]])

array([[0.98720354]], dtype=float32)

Jeff bezos and Elon musk are more similar you get 0.98 then Jeff bezos and grapes as indicated above

# build model

There are two types of models you can build in tensorflow.

(1) Sequential
(2) Functional

So far we have built sequential model. But below we will build functional model. More information on these two is here: https://becominghuman.ai/sequential-vs-functional-model-in-keras-20684f766057

In [135]:
from tensorflow.keras import layers

In [148]:
sample_text = ["This is a test sentence"]
preprocessed_text = bert_preprocess(tf.constant(sample_text))

In [149]:
pip show tensorflow tensorflow-hub

Name: tensorflow
Version: 2.17.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, libclang, ml-dtypes, numpy, opt-einsum, packaging, protobuf, requests, setuptools, six, tensorboard, tensorflow-io-gcs-filesystem, termcolor, typing-extensions, wrapt
Required-by: dopamine_rl, tensorflow-text, tf_keras
---
Name: tensorflow-hub
Version: 0.16.1
Summary: TensorFlow Hub is a library to foster the publication, discovery, and consumption of reusable parts of machine learning models.
Home-page: https://github.com/tensorflow/hub
Author: Google LLC
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, protobuf, tf-keras
Required-by: 


In [154]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

ValueError: Exception encountered when calling layer 'bert_preprocess' (type KerasLayer).

A KerasTensor is symbolic: it's a placeholder for a shape an a dtype. It doesn't have any actual numerical value. You cannot convert it to a NumPy array.

Call arguments received by layer 'bert_preprocess' (type KerasLayer):
  • inputs=<KerasTensor shape=(None,), dtype=string, sparse=None, name=text>
  • training=None