In [3]:
import tensorflow_hub as hub
import pandas as pd
import tensorflow_text as text
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

In [2]:
!pip3 install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.6.0-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.1 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.6.0


In [4]:
# load data
df = pd.read_csv('spam_data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# check count and unique and top values and their frequency
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [6]:
# check percentage of data - states how much data needs to be balanced
print(str(round(747/4825,2))+'%')

0.15%


In [7]:
# creating 2 new dataframe as df_ham , df_spam

df_spam = df[df['Category']=='spam']

df_ham = df[df['Category']=='ham']

print("Ham Dataset Shape:", df_ham.shape)

print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 2)
Spam Dataset Shape: (747, 2)


In [9]:
# downsampling ham dataset - take only random 747 example
# will use df_spam.shape[0] - 747
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [10]:
# concating both dataset - df_spam and df_ham_balanced to create df_balanced dataset
df_balanced = pd.concat([df_spam , df_ham_downsampled])

In [11]:
df_balanced['Category'].value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [12]:
df_balanced.sample(10)

Unnamed: 0,Category,Message
4208,ham,"Lets use it next week, princess :)"
5268,ham,"ER, ENJOYIN INDIANS AT THE MO..yeP. SaLL gOoD ..."
4806,ham,Sorry for the delay. Yes masters
5112,spam,December only! Had your mobile 11mths+? You ar...
5312,ham,Here got ur favorite oyster... N got my favori...
1304,ham,I cant pick the phone right now. Pls send a me...
160,spam,You are a winner U have been specially selecte...
3028,ham,You still at the game?
420,spam,Send a logo 2 ur lover - 2 names joined by a h...
3138,ham,You're right I have now that I think about it


In [13]:
# creating numerical repersentation of category - one hot encoding
df_balanced['spam'] = df_balanced['Category'].apply(lambda x:1 if x=='spam' else 0)

In [14]:
# displaying data - spam -1 , ham-0
df_balanced.sample(4)

Unnamed: 0,Category,Message,spam
2124,spam,#ERROR!,1
5156,ham,"Sir, I need Velusamy sir's date of birth and c...",0
15,spam,"XXXMobileMovieClub: To use your credit, click ...",1
3556,spam,From next month get upto 50% More Calls 4 Ur s...,1


In [15]:
# loading train test split
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(df_balanced['Message'], df_balanced['spam'],
                                                    stratify = df_balanced['spam'])

Model Creation

In [16]:
# downloading preprocessing files and model
bert_preprocessor = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [17]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string, name = 'Inputs')
preprocessed_text = bert_preprocessor(text_input)
embeed = bert_encoder(preprocessed_text)
dropout = tf.keras.layers.Dropout(0.1, name = 'Dropout')(embeed['pooled_output'])
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'Dense')(dropout)

In [18]:
# creating final model
model = tf.keras.Model(inputs = [text_input], outputs = [outputs])

In [19]:
# check the summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Inputs (InputLayer)             [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'input_word_ids': ( 0           Inputs[0][0]                     
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      {'encoder_outputs':  109482241   keras_layer[0][0]                
                                                                 keras_layer[0][1]                
                                                                 keras_layer[0][2]                
______________________________________________________________________________________________

In [20]:
Metrics = [tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
           tf.keras.metrics.Precision(name = 'precision'),
           tf.keras.metrics.Recall(name = 'recall')
           ]


In [21]:
# compiling our model
model.compile(optimizer ='adam',
               loss = 'binary_crossentropy',
               metrics = Metrics)

In [22]:
history = model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Model Evaluation

In [None]:
# Evaluating performance
model.evaluate(X_test,y_test)

In [None]:
# getting y_pred by predicting over X_text and flattening it
y_pred = model.predict(X_test)
y_pred = y_pred.flatten() # require to be in one-dimensional array , for easy manipulation
# importing confusion maxtrix

from sklearn.metrics import confusion_matrix , classification_report

# creating confusion matrix 

cm = confusion_matrix(y_test,y_pred)

cm

In [None]:
# plotting as a graph - importing seaborn
import seaborn as sns

In [None]:
# creating a graph out of confusion matrix
sns.heatmap(cm, annot = True, fmt = 'd')
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
# printing classification report
print(classification_report(y_test , y_pred))

Model Prediction for Spam Detection Using BERT

In [None]:
predict_text = [
                # Spam
                'We’d all like to get a $10,000 deposit on our bank accounts out of the blue, but winning a prize—especially if you’ve never entered a contest', 
                'Netflix is sending you a refund of $12.99. Please reply with your bank account and routing number to verify and get your refund', 
                'Your account is temporarily frozen. Please log in to to secure your account ', 
                #ham
                'The article was published on 18th August itself',
                'Although we are unable to give you an exact time-frame at the moment, I would request you to stay tuned for any updates.',
                'The image you sent is a UI bug, I can check that your article is marked as regular and is not in the monetization program.'
]

In [None]:
test_results = model.predict(predict_text)
output = np.where(test_results>0.5,'spam', 'ham')