In [1]:
!pip install tensorflow_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_text
  Downloading tensorflow_text-2.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_text
Successfully installed tensorflow_text-2.12.0


In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [4]:
import pandas as pd

df = pd.read_csv("/content/spam_or_not_spam.csv")
df.head(5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [5]:
df.groupby("label").describe()

Unnamed: 0_level_0,email,email,email,email
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2500,2445,url URL date not supplied URL,10
1,499,427,lowest rates available for term life insurance...,5


In [6]:
df_spam = df[df['label']==1]
df_spam.shape

(500, 2)

In [7]:
df_ham = df[df['label']==0]
df_ham.shape

(2500, 2)

In [8]:
# Getting only downsampled ham class
df_ham_downsampled = df_ham.sample(df_spam.shape[0])

In [9]:
df_ham_downsampled.shape

(500, 2)

In [10]:
df_balanced = pd.concat([df_spam, df_ham_downsampled])

In [11]:
df_balanced.head(5)

Unnamed: 0,email,label
2500,save up to NUMBER on life insurance why spend...,1
2501,NUMBER fight the risk of cancer URL NUMBER sli...,1
2502,NUMBER fight the risk of cancer URL NUMBER sli...,1
2503,adult club offers free membership instant acc...,1
2504,i thought you might like these NUMBER slim dow...,1


In [12]:
df_balanced.shape

(1000, 2)

In [13]:
from sklearn.model_selection import train_test_split
# stratify is used so that training and testing set get equal proportion of spam and ham
X_train, X_test, y_train, y_test = train_test_split(df_balanced["email"], df_balanced["label"], stratify = df_balanced['label'])

In [14]:
# apis Downloading the trained models
bert_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
bert_preprocess_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

In [16]:
# Function to convert sentences to embeddings

def get_sentence_embedding(text):
  text_preprocessed = bert_preprocess_model(text)
  return bert_model(text_preprocessed)['pooled_output']

In [17]:
get_sentence_embedding(["The english exam went well", "The maths paper was very difficult"])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.86139214, -0.48787183, -0.71162117, ..., -0.62877774,
        -0.6600917 ,  0.86798036],
       [-0.76365095, -0.23324412,  0.13299662, ...,  0.02595049,
        -0.5809271 ,  0.8095406 ]], dtype=float32)>

In [18]:
a = get_sentence_embedding([
    "banana",
    "grapes",
    "mango",
    "jeff bezos",
    "elon musk",
    "bill gates"
])

In [19]:
print(a)

tf.Tensor(
[[-0.7606916  -0.14219397  0.49604574 ...  0.42165306 -0.5322139
   0.80312157]
 [-0.86023194 -0.21242912  0.4915695  ...  0.39798063 -0.60506254
   0.8447163 ]
 [-0.7128858  -0.15463883  0.38401723 ...  0.35278767 -0.50991315
   0.73474056]
 [-0.82533485 -0.35550627 -0.5906983  ... -0.01613823 -0.614176
   0.872303  ]
 [-0.75041324 -0.2681263  -0.26689714 ...  0.02839372 -0.59380966
   0.7974984 ]
 [-0.785443   -0.29949623  0.4102765  ...  0.52225506 -0.4957351
   0.81507415]], shape=(6, 768), dtype=float32)


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([a[0]], [a[3]])


array([[0.84703803]], dtype=float32)

In [40]:
# Functional model
# BERT Layers
text_input = tf.keras.layers.Input(shape=(), dtype = tf.string, name = "text")
preprocessed_output = bert_preprocess_model(text_input)
bert_results = bert_model(preprocessed_output)

# Neural Network Layers
l = tf.keras.layers.Dropout(0.1, name = 'dropout', dtype = tf.float64)(bert_results['pooled_output'])
l = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'output', dtype = tf.float64)(l)

# construct a final model
model = tf.keras.Model(inputs = [text_input], outputs = [l])

In [26]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [42]:
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
    tf.keras.metrics.Precision(name = 'precision'),
    tf.keras.metrics.Recall(name = 'recall')
]

model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = METRICS)

In [43]:
model.fit(X_train, y_train, epochs = 10)

ValueError: ignored