In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as tf_keras

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder




In [2]:
spam_df = pd.read_csv('data-files/SMSSpamCollection', sep="\t", header=None, names=['label', 'msg'])
spam_df.info()
spam_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   msg     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
spam_df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
X, y = spam_df['msg'], spam_df['label']
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
X_train[:3], y_train[:3]

(4281    WINNER!! As a valued network customer you have...
 585     So how's scotland. Hope you are not over showi...
 4545                  when you and derek done with class?
 Name: msg, dtype: object,
 array([1, 0, 0]))

In [6]:
tokenizer = tf_keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

In [7]:
X_train_encoded = tokenizer.texts_to_sequences(X_train)

In [8]:
print( len(tokenizer.word_index) )
tokenizer.word_index

7733


{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'u': 6,
 'and': 7,
 'in': 8,
 'is': 9,
 'me': 10,
 'my': 11,
 'for': 12,
 'your': 13,
 'it': 14,
 'of': 15,
 'call': 16,
 'have': 17,
 'on': 18,
 '2': 19,
 'are': 20,
 'now': 21,
 'that': 22,
 'not': 23,
 'but': 24,
 'or': 25,
 'so': 26,
 'do': 27,
 'be': 28,
 'can': 29,
 'will': 30,
 'at': 31,
 "i'm": 32,
 'get': 33,
 'ur': 34,
 'if': 35,
 'with': 36,
 'just': 37,
 'no': 38,
 'we': 39,
 '4': 40,
 'this': 41,
 'up': 42,
 'gt': 43,
 'lt': 44,
 'when': 45,
 'free': 46,
 'how': 47,
 'ok': 48,
 'go': 49,
 'from': 50,
 'what': 51,
 'all': 52,
 'know': 53,
 'like': 54,
 'good': 55,
 'out': 56,
 'got': 57,
 'come': 58,
 'love': 59,
 'was': 60,
 'its': 61,
 'then': 62,
 'time': 63,
 'am': 64,
 'only': 65,
 'day': 66,
 'send': 67,
 'there': 68,
 'text': 69,
 'want': 70,
 'by': 71,
 'as': 72,
 'ü': 73,
 'he': 74,
 'need': 75,
 'one': 76,
 'home': 77,
 'txt': 78,
 "i'll": 79,
 'going': 80,
 'see': 81,
 'about': 82,
 'stop': 83,
 'n': 84,
 'back':

In [9]:
len(X_train_encoded), len(X_train_encoded[0]), len(X_train_encoded[1])

(4179, 26, 17)

In [10]:
len_list = [ len(row) for row in X_train_encoded ]
max(len_list), np.mean(len_list)

(189, 16.00119645848289)

In [11]:
X_train_padded = tf_keras.preprocessing.sequence.pad_sequences(X_train_encoded, maxlen=100,
                                                               padding='post', truncating="post")

In [12]:
print( X_train_padded.shape )
X_train_padded[:3]

(4179, 100)


array([[ 671,   72,    4,  825,  398,  248,    3,   17,  108,  399,    2,
        3719, 2511,  172, 1581,    2,  137,   16, 3720,  137,  483, 3721,
         523,  894,  524,   65,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [  26,  380, 3722,  134,    3,   20,   23,  213, 1223,   13, 3723,
        3724,  109,  194,  295,    5,  895,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [13]:
vocab_size = len(tokenizer.word_index) + 1

input = tf_keras.layers.Input((None,))
x = tf_keras.layers.Embedding(input_dim=vocab_size, output_dim=32)(input)
x = tf_keras.layers.LSTM(16)(x)
output = tf_keras.layers.Dense(units=1, activation="sigmoid")(x)

model = tf_keras.models.Model(input, output)
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 32)          247488    
                                                                 
 lstm (LSTM)                 (None, 16)                3136      
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 250641 (979.07 KB)
Trainable params: 250641 (979.07 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=['accuracy'])




In [15]:
history = model.fit(X_train_padded, y_train, batch_size=32, epochs=10, validation_split=0.2)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
input = tf_keras.layers.Input((None,))
output = tf_keras.layers.Embedding(input_dim=vocab_size, output_dim=32)(input)

embedding_model = tf_keras.models.Model(input, output)

In [17]:
embedding_model(X_train_padded)

<tf.Tensor: shape=(4179, 100, 32), dtype=float32, numpy=
array([[[ 0.02923935,  0.04195643, -0.04575897, ...,  0.00736161,
          0.02556309, -0.00570333],
        [-0.01142474, -0.02026942, -0.04100734, ..., -0.03283141,
          0.01498432, -0.01886067],
        [-0.00517567, -0.04185963,  0.0306734 , ..., -0.00615524,
          0.00481646,  0.00513333],
        ...,
        [-0.03629177, -0.01600213, -0.00423812, ...,  0.01141468,
         -0.03567206, -0.0049556 ],
        [-0.03629177, -0.01600213, -0.00423812, ...,  0.01141468,
         -0.03567206, -0.0049556 ],
        [-0.03629177, -0.01600213, -0.00423812, ...,  0.01141468,
         -0.03567206, -0.0049556 ]],

       [[-0.00742882,  0.0267407 , -0.01566534, ..., -0.02733575,
         -0.00515611,  0.00534327],
        [ 0.04290595,  0.00782623,  0.03514699, ..., -0.03324678,
         -0.00554562, -0.00820719],
        [ 0.03682357, -0.04413139, -0.04081484, ...,  0.01270932,
          0.04728166, -0.01288055],
        ..