In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import numpy as np
import tf_keras

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
df = pd.read_csv("/content/drive/MyDrive/datasets/spam.csv")
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [5]:
class_count_ham = df.Category.value_counts()[0]
class_count_spam = df.Category.value_counts()[1]

  class_count_ham = df.Category.value_counts()[0]
  class_count_spam = df.Category.value_counts()[1]


In [6]:
df_class_ham = df[df['Category']=='ham']
df_class_ham = df_class_ham.sample(class_count_spam)
df_class_spam = df[df['Category']=='spam']

In [7]:
len(df_class_spam) , len(df_class_ham)

(747, 747)

In [8]:
final_df = pd.concat([df_class_spam,df_class_ham])

In [9]:
final_df.shape

(1494, 2)

In [10]:
final_df.Category = final_df.Category.apply(lambda x: 1 if x=='spam' else 0)

In [11]:
final_df.head()

Unnamed: 0,Category,Message
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
5,1,FreeMsg Hey there darling it's been 3 week's n...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...
11,1,"SIX chances to win CASH! From 100 to 20,000 po..."


In [12]:
final_df.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [13]:
X = final_df['Message']
y = final_df['Category']
len(X), len(y)

(1494, 1494)

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)

In [15]:
preprocess_url = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
encoder_url = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-10-h-128-a-2/2"

In [16]:
bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [17]:
def get_sentence_embedding(sentences):
    text_preprocessed = bert_preprocess(sentences)
    text_encoded = bert_encoder(text_preprocessed)
    return text_encoded['pooled_output']

In [18]:
get_sentence_embedding(['hello bro, how are you?','you like me?'])

<tf.Tensor: shape=(2, 128), dtype=float32, numpy=
array([[-9.87030447e-01,  9.86893415e-01, -4.60356995e-02,
        -9.71112847e-01, -9.93814170e-01, -5.12698948e-01,
         3.97331625e-01,  1.04166619e-01,  9.86945629e-01,
        -5.61527014e-01, -8.84451449e-01,  8.12164545e-01,
         3.41594696e-01,  9.95560646e-01,  8.22806284e-02,
         8.75274241e-01, -9.99291182e-01, -1.05240852e-01,
        -8.22569788e-01,  9.33377147e-01, -8.49476814e-01,
        -9.37787056e-01, -5.15889645e-01,  8.61470282e-01,
        -3.53011876e-01,  8.06279242e-01, -8.11575472e-01,
        -9.85472500e-01,  3.05210173e-01, -5.22398762e-02,
         9.90794241e-01,  9.57282543e-01,  9.65332568e-01,
         9.46516454e-01,  8.31052244e-01, -5.08947372e-02,
        -9.74445164e-01, -6.19870365e-01,  4.81234878e-01,
         1.79707214e-01, -9.78286326e-01, -9.83252525e-01,
         1.41831324e-01,  9.99295533e-01, -5.45327246e-01,
        -4.82647777e-01, -7.57980168e-01, -9.72739100e-01,
      

In [19]:
e = get_sentence_embedding([
    'banana',
    'grapes',
    'mango',
    'jeff bezos',
    'elon musk',
    'bill gates',
])

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
cosine_similarity([e[0]],[e[1]])

array([[0.93436116]], dtype=float32)

In [22]:
cosine_similarity([e[0]],[e[4]])

array([[0.8862586]], dtype=float32)

# **Functional Model**

In [23]:
# Bert Layers
text_input= tf_keras.layers.Input(shape=(), dtype=tf.string, name="text")
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural Network Layers
l = tf_keras.layers.Dropout(0.1, name='dropout')(outputs['pooled_output'])
l = tf_keras.layers.Dense(1, activation='sigmoid', name= "output")(l)

# Construct final Model
model = tf_keras.Model(inputs=[text_input], outputs=[l])

In [24]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 keras_layer (KerasLayer)    {'input_type_ids': (None,    0         ['text[0][0]']                
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_word_ids': (None,                                           
                              128)}                                                               
                                                                                              

In [25]:
METRICS = [
    tf_keras.metrics.BinaryAccuracy(name='accuracy'),
    tf_keras.metrics.Precision(name='precision'),
    tf_keras.metrics.Recall(name='recall'),
]

model.compile(optimizer = 'adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [26]:
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x79e45248a010>

In [30]:
model.evaluate(X_test, y_test)



[0.33707496523857117,
 0.8561872839927673,
 0.831250011920929,
 0.8926174640655518]

In [28]:
y_predicted = model.predict(X_test)
y_predicted = y_predicted.flatten()



In [31]:
import numpy as np

y_predicted = np.where(y_predicted > 0.5,1,0)
y_predicted

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1])

In [32]:
from sklearn.metrics import confusion_matrix, classification_report

cm=confusion_matrix(y_test, y_predicted)
cm

array([[123,  27],
       [ 16, 133]])

In [33]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85       150
           1       0.83      0.89      0.86       149

    accuracy                           0.86       299
   macro avg       0.86      0.86      0.86       299
weighted avg       0.86      0.86      0.86       299



In [34]:
reviews = [
    "I absolutely loved the product! Will definitely buy again.",
    "Terrible customer service. I had to wait two weeks for a response.",
    "The quality is decent for the price. Not amazing, but not bad either.",
    "Excellent experience! Fast shipping and great packaging.",
    "I’m disappointed. The item arrived damaged and didn't match the description."
]


In [35]:
model.predict(reviews)



array([[0.03482039],
       [0.06927659],
       [0.33751702],
       [0.5218507 ],
       [0.03508133]], dtype=float32)