In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [4]:
!pip install tensorflow_text



In [5]:
import pandas as pd
df = pd.read_csv("/content/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [7]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


## From this we can conclude it is an Imbalanced Dataset

In [8]:
df_spam = df[df['Category']=='spam']
df_spam.shape

(747, 2)

In [9]:
df_ham = df[df['Category']=='ham']
df.shape

(5572, 2)

In [10]:
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [11]:
df_ham_downsampled.head()

Unnamed: 0,Category,Message
431,ham,At home watching tv lor.
2139,ham,K still are you loving me.
4253,ham,"Send ur birthdate with month and year, I will ..."
2922,ham,"Yo, any way we could pick something up tonight?"
2210,ham,"Hmm well, night night"


In [12]:
df_balanced = pd.concat([df_spam,df_ham_downsampled])
df_balanced.shape

(1494, 2)

In [13]:
df_balanced['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
spam,747
ham,747


In [14]:
df_balanced.sample(5)

Unnamed: 0,Category,Message
3080,ham,*deep sigh* ... I miss you :-( ... I am really...
4905,ham,"no, i *didn't* mean to post it. I wrote it, an..."
3101,ham,Tessy..pls do me a favor. Pls convey my birthd...
1126,spam,For taking part in our mobile survey yesterday...
3341,ham,Like I made him throw up when we were smoking ...


In [15]:
df_balanced['spam'] = df_balanced['Category'].apply(lambda x:1 if x=='spam' else 0)

In [16]:
df_balanced.sample(5)

Unnamed: 0,Category,Message,spam
2871,spam,YOUR CHANCE TO BE ON A REALITY FANTASY SHOW ca...,1
4949,spam,"Hi this is Amy, we will be sending you a free ...",1
4784,ham,Especially since i talk about boston all up in...,0
943,spam,How about getting in touch with folks waiting ...,1
313,ham,He says he'll give me a call when his friend's...,0


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_balanced['Message'],df_balanced['spam'],stratify=df_balanced['spam'])

In [18]:
X_train.head()

Unnamed: 0,Message
5102,This msg is for your mobile content order It h...
1315,Got but got 2 colours lor. One colour is quite...
4272,Natalja (25/F) is inviting you to be her frien...
1699,"Free msg. Sorry, a service you ordered from 81..."
1933,Jus finished avatar nigro


In [19]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [20]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "1000$ discount. hurry up",
    "Sai, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8504603 , -0.5165808 , -0.93070894, ..., -0.8267264 ,
        -0.7626391 ,  0.9219375 ],
       [-0.8843823 , -0.46849903, -0.9316683 , ..., -0.8196679 ,
        -0.6971991 ,  0.9193893 ]], dtype=float32)>

In [21]:
e = get_sentence_embeding([
    "banana",
    "apples" ,
    "grapes",
    "mango",
    "jeff bezos",
    "sai",
    "bill gates"
]
)

In [22]:
e

<tf.Tensor: shape=(7, 768), dtype=float32, numpy=
array([[-0.760692  , -0.14219385,  0.49604586, ...,  0.42165312,
        -0.532214  ,  0.8031219 ],
       [-0.80685544, -0.18719652,  0.37594944, ...,  0.34031105,
        -0.6245898 ,  0.86208797],
       [-0.86023223, -0.2124296 ,  0.49156848, ...,  0.39797997,
        -0.60506296,  0.8447167 ],
       ...,
       [-0.82533467, -0.35550597, -0.5906975 , ..., -0.01613778,
        -0.61417586,  0.8723029 ],
       [-0.85128295, -0.18843201,  0.49178576, ...,  0.26010442,
        -0.58423036,  0.84162295],
       [-0.78544426, -0.29949754,  0.41027156, ...,  0.52225274,
        -0.49573594,  0.8150757 ]], dtype=float32)>

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([e[0]],[e[1]])

array([[0.97732115]], dtype=float32)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([e[5]],[e[6]])

array([[0.95486915]], dtype=float32)