In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd "/content/drive/My Drive/Projects/spam_or_ham"

/content/drive/My Drive/Projects/spam_or_ham


In [3]:
!ls

classification_of_spam_or_ham.ipynb  smsspamcollection
preprocesses_text.txt		     wordPieceTokenizer.json


In [4]:
import pandas as pd 

In [5]:
# read or load data file 
dataset = pd.read_table("smsspamcollection/SMSSpamCollection", header=None, encoding="utf-8")

In [6]:
# To view top 5 sentences
print(f"Top 5 sentences from dataset \n{dataset.head()}")

Top 5 sentences from dataset 
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [7]:
# check is there. any null values present in the labels
print(f"Number of null values in labels    :- {dataset[0].isnull().sum()}")
print(f"Number of null values in sentences :- {dataset[1].isnull().sum()}")

Number of null values in labels    :- 0
Number of null values in sentences :- 0


In [8]:
# checking of number of sentences for each label 
print(f"Count of sentences for each label \n {dataset[0].value_counts()}")

Count of sentences for each label 
 ham     4825
spam     747
Name: 0, dtype: int64


## Preprocessing

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# data preprocessing module
def preprocessing(labels, text):
    """
    Input :- lables and text field
    Output :- encoded labels and preprocessing text
    """
    encoder = LabelEncoder()
    Y = encoder.fit_transform(labels)
    # replace email links with email 
    processed = text.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')
    # Replace URLs with 'webaddress'
    processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')
    # Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
    processed = processed.str.replace(r'£|\$', 'moneysymb')
    # Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
    processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    # Replace numbers with 'numbr'
    processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')
    # Remove punctuation
    processed = processed.str.replace(r'[^\w\d\s]', ' ')
    # Replace whitespace between terms with a single space
    processed = processed.str.replace(r'\s+', ' ')
    # Remove leading and trailing whitespace
    processed = processed.str.replace(r'^\s+|\s+?$', '')
    # change words to lower case - Hello, HELLO, hello are all the same word
    processed = processed.str.lower()

    return Y, processed

In [11]:
labels = dataset[0]
sentences = dataset[1]

In [12]:
Y, preprocessed_sent = preprocessing(labels=labels, text=sentences)

In [15]:
preprocessed_sent.to_csv("preprocesses_text.txt", sep=" ", index=False)

In [66]:
preprocessed_sent[2]

'free entry in numbr a wkly comp to win fa cup final tkts numbrst may numbr text fa to numbr to receive entry question std txt rate t c s apply numbrovernumbr s'

## tokenizer

In [16]:
!pip install tokenizers



In [17]:
from tokenizers import BertWordPieceTokenizer

In [18]:
tokenizer = BertWordPieceTokenizer()

In [19]:
tokenizer.train("preprocesses_text.txt")

In [20]:
encoded = tokenizer.encode("I can feel the magic, can you?")

In [21]:
encoded.tokens , encoded.ids

(['i', 'can', 'feel', 'the', 'mag', '##ic', '[UNK]', 'can', 'you', '[UNK]'],
 [16, 140, 461, 83, 2701, 110, 1, 140, 71, 1])

In [22]:
# sentences to sequences of numbers 
tokenized_sent = []
for index in range(len(preprocessed_sent)):
  each_sent = preprocessed_sent[index]
  encoded = tokenizer.encode(each_sent)
  result = encoded.ids
  tokenized_sent.append(result)

In [23]:
len(tokenized_sent)

5572

In [24]:
len(tokenized_sent[0])

23

In [25]:
len_list = []
for i in tokenized_sent:
  len_list.append(len(i))

In [26]:
len_list.sort()

In [27]:
len(len_list)

5572

In [28]:
# padding 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [29]:
padded_sent = pad_sequences(sequences= tokenized_sent, padding='post',truncating='post',maxlen=200)

In [30]:
padded_sent

array([[ 108, 1140, 2370, ...,    0,    0,    0],
       [ 197,  800, 3017, ...,    0,    0,    0],
       [ 210, 1118,   93, ...,    0,    0,    0],
       ...,
       [  23,  574,  244, ...,    0,    0,    0],
       [  83, 1157,  269, ...,    0,    0,    0],
       [4563,  260, 1088, ...,    0,    0,    0]], dtype=int32)

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
train_sent, test_sent, train_target, test_target = train_test_split(padded_sent, Y, test_size=0.2, random_state=0)

In [33]:
len(train_sent), len(test_sent), len(train_target), len(test_target)

(4457, 1115, 4457, 1115)

In [34]:
tokenizer

Tokenizer(vocabulary_size=6875, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [35]:
vocab_size = 6875
embedding_dim = 100
max_length = 200

In [36]:
from tensorflow.keras import Sequential 

In [38]:
import tensorflow as tf

In [39]:
# training model
model = tf.keras.Sequential([
                             tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
                             tf.keras.layers.GlobalAveragePooling1D(),
                             tf.keras.layers.Dense(64,activation="relu"),
                             tf.keras.layers.Dense(32,activation="relu"),
                             tf.keras.layers.Dense(1,activation="sigmoid")
])

In [40]:
model.compile(loss = "binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [41]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 100)          687500    
_________________________________________________________________
global_average_pooling1d (Gl (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                6464      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 696,077
Trainable params: 696,077
Non-trainable params: 0
_________________________________________________________________


In [42]:
model.fit(x=train_sent, y=train_target, batch_size=32, epochs=10, verbose=2)

Epoch 1/10
140/140 - 1s - loss: 0.4033 - accuracy: 0.8683
Epoch 2/10
140/140 - 1s - loss: 0.2582 - accuracy: 0.8905
Epoch 3/10
140/140 - 1s - loss: 0.0623 - accuracy: 0.9821
Epoch 4/10
140/140 - 1s - loss: 0.0311 - accuracy: 0.9912
Epoch 5/10
140/140 - 1s - loss: 0.0228 - accuracy: 0.9924
Epoch 6/10
140/140 - 1s - loss: 0.0147 - accuracy: 0.9960
Epoch 7/10
140/140 - 1s - loss: 0.0107 - accuracy: 0.9975
Epoch 8/10
140/140 - 1s - loss: 0.0087 - accuracy: 0.9978
Epoch 9/10
140/140 - 1s - loss: 0.0072 - accuracy: 0.9982
Epoch 10/10
140/140 - 1s - loss: 0.0065 - accuracy: 0.9989


<tensorflow.python.keras.callbacks.History at 0x7fe525a01fd0>

In [45]:
predicted_classes = model.predict_classes(test_sent)
predicted_target = []
for i in predicted_classes:
  predicted_target.append(i[0])

test_target_copy = test_target.tolist()

In [50]:
def f1_score(y_true, y_pred):
	"""
	Input :- y_true :- list of actual values
			y_pred :- list of predicted values
	Output:- float value of f1_score
	"""
	p = precision_score(y_true, y_pred)
	r = recall_score(y_true, y_pred)

	f1_score_value = 2 * p * r /(p+r)
	return f1_score_value

In [51]:
def recall_score(y_true, y_pred):
	"""
	Input :- y_true :- list of actual values
			y_pred :- list of predicted values
	Output:- float value of precision score
	"""
	tp = true_positive(y_true, y_pred)
	fn = false_negative(y_true, y_pred)
	recall_value = tp/(tp+fn)
	return recall_value

In [52]:
def precision_score(y_true, y_pred):
	"""
	Input :- y_true :- list of actual values
			y_pred :- list of predicted values
	Output:- float value of precision score
	"""
	tp = true_positive(y_true, y_pred)
	fp = false_positive(y_true, y_pred)

	precision_value = tp/(tp+fp)
	return precision_value

In [53]:
def true_positive(y_true, y_pred):
	"""
	Input :- y_true - list of actual values
			y_pred - list of predicted values
	Output :- number of true positives
	"""
	tp_counts = 0
	for true, pred in zip(y_true, y_pred):
		if true==1 and pred==1:
			tp_counts += 1
	return tp_counts

def true_negative(y_true, y_pred):
	"""
	Input :- y_true - list of actual values
			y_pred - list of predicted values
	Output :- number of true negatives
	"""
	tn_counts = 0
	for true, pred in zip(y_true, y_pred):
		if true==0 and pred==0:
			tn_counts += 1
	return tn_counts


def false_positive(y_true, y_pred):
	"""
	Input :- y_true - list of actual values
			y_pred - list of predicted values
	Output :- number of false positives
	"""
	fp_counts = 0
	for true,pred in zip(y_true, y_pred):
		if true==1 and pred==0:
			fp_counts += 1
	return fp_counts

def false_negative(y_true, y_pred):
	"""
	Input :- y_true - list of actual values
			y_pred - list of predicted values
	Output :- number of false negatives
	"""
	fn_counts = 0
	for true, pred in zip(y_true, y_pred):
		if true==0 and pred==1:
			fn_counts += 1
	return fn_counts

In [54]:
get_f1_score = f1_score(y_true=test_target_copy, y_pred=predicted_target)

In [55]:
get_f1_score

0.9648562300319489

In [79]:
# data preprocessing module
def preprocessing_test_data(text):
    """
    Input :- lables and text field
    Output :- encoded labels and preprocessing text
    """
    # encoder = LabelEncoder()
    # Y = encoder.fit_transform(labels)
    # replace email links with email 
    processed = text.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')
    # Replace URLs with 'webaddress'
    processed = processed.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')
    # Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
    processed = processed.replace(r'£|\$', 'moneysymb')
    # Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
    processed = processed.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    # Replace numbers with 'numbr'
    processed = processed.replace(r'\d+(\.\d+)?', 'numbr')
    # Remove punctuation
    processed = processed.replace(r'[^\w\d\s]', ' ')
    # Replace whitespace between terms with a single space
    processed = processed.replace(r'\s+', ' ')
    # Remove leading and trailing whitespace
    processed = processed.replace(r'^\s+|\s+?$', '')
    # change words to lower case - Hello, HELLO, hello are all the same word
    processed = processed.lower()

    return processed

In [96]:
example_text = "Our records indicate your Pension is under performing to see higher growth and up to 25% cash release reply PENSION for a free review. To opt out reply STOP"
pre = preprocessing_test_data(example_text)
result = tokenizer.encode(pre)
result = result.ids
padded_sent = pad_sequences(sequences= [result], padding='post',truncating='post',maxlen=200)

In [97]:
model.predict_classes([padded_sent])

array([[1]], dtype=int32)