In [1]:
import pandas as pd

In [2]:
# Load the spam dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./../dataset/spam.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Pull the text and label columns out of the frame as plain Python lists.
sentences = df.loc[:, 'Message'].to_list()
labels = df.loc[:, 'Category'].to_list()

In [5]:
# Number of leading samples used for training: 90% of the data, rounded down.
train_size = int(0.9 * len(sentences))

In [6]:
# Display the computed training-set size.
train_size

5014

In [7]:
# Positional split: the first `train_size` messages train the model and the
# remainder is held out. No shuffling is applied.
training_sentences, testing_sentences = (
    sentences[:train_size],
    sentences[train_size:],
)

In [8]:
# Split the labels at the same position as the sentences so the pairs stay aligned.
training_labels, testing_labels = (
    labels[:train_size],
    labels[train_size:],
)

In [9]:
import numpy as np

In [10]:
# Convert both label lists to NumPy arrays before handing them to the model.
training_labels_final, testing_labels_final = (
    np.array(label_list) for label_list in (training_labels, testing_labels)
)

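`vocab` is the vocabulary size: the tokenizer keeps only the most frequent words up to this limit and maps every other word to the out-of-vocabulary token.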

In [11]:
# Preprocessing / model hyperparameters, collected in one cell.

# Tokenizer settings.
out_of_vocab_token = '<OOV>'  # stand-in token for words outside the vocabulary
vocab = 500                   # vocabulary size limit (also the Embedding input size)

# Sequence shaping: pad and truncate at the end of each sequence.
max_length = 50
padding_type = truncation_type = 'post'

# Width of the learned word vectors.
embedding_size = 32

### Tokenizer

In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
# Tokenizer limited to `vocab` word indices; unknown words map to the OOV token.
tokenizer = Tokenizer(
    oov_token=out_of_vocab_token,
    num_words=vocab,
)

In [14]:
# Build the vocabulary from the training messages only.
tokenizer.fit_on_texts(texts=training_sentences)

In [15]:
# Word -> integer index mapping learned by the tokenizer; it is inverted in a
# later cell to decode sequences back into text.
word_index = tokenizer.word_index

In [16]:
# word_index

In [17]:
# Encode every training message as a list of integer token ids.
sequences = tokenizer.texts_to_sequences(texts=training_sentences)

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# Pad or truncate the training sequences to `max_length` tokens.
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=truncation_type,
)

In [20]:
# Invert the tokenizer vocabulary: integer index -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

In [21]:
def decode_review(text, index_to_word=None):
    """Turn a sequence of integer token ids back into a space-separated string.

    Args:
        text: Iterable of integer token ids (for example one encoded message).
        index_to_word: Optional mapping from token id to word. When omitted,
            the module-level ``reverse_word_index`` is used, which keeps
            existing single-argument calls working unchanged.

    Returns:
        The words joined by single spaces. Any id missing from the mapping is
        rendered as ``'?'``.
    """
    if index_to_word is None:
        # Resolved at call time so the default follows the current global mapping.
        index_to_word = reverse_word_index
    return ' '.join(index_to_word.get(token_id, '?') for token_id in text)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

In [23]:
# Create an empty Sequential model.
model = Sequential()

In [25]:
# Build the whole network in a single statement. Because `model` is rebound
# here, re-running this cell replaces the model instead of appending a second
# Embedding/LSTM/Dense trio onto the existing one.
model = Sequential([
    Embedding(vocab, embedding_size),  # token ids -> dense word vectors
    LSTM(20),                          # 20-unit recurrent encoder
    Dense(1, activation='sigmoid'),    # single sigmoid output for the binary label
])

In [26]:
from tensorflow.keras.optimizers import Adam

In [27]:
# Adam optimizer with its default settings (no arguments are overridden).
adam = Adam()

In [28]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# Print the layer-by-layer summary of the model.
model.summary()

In [30]:
# Encode the held-out messages with the tokenizer fitted on the training data,
# then pad/truncate them with the same settings as the training sequences.
testing_sequences = tokenizer.texts_to_sequences(texts=testing_sentences)
testing_padded = pad_sequences(
    sequences=testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=truncation_type,
)

In [31]:
# Train for 10 epochs. The held-out split is passed as validation data, so the
# val_* metrics in the log are computed on it after every epoch.
model.fit(
    x=padded,
    y=training_labels_final,
    epochs=10,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.8669 - loss: 0.4576 - val_accuracy: 0.8705 - val_loss: 0.3861
Epoch 2/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8611 - loss: 0.4040 - val_accuracy: 0.8705 - val_loss: 0.3857
Epoch 3/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8588 - loss: 0.4075 - val_accuracy: 0.8705 - val_loss: 0.3847
Epoch 4/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8716 - loss: 0.3543 - val_accuracy: 0.9266 - val_loss: 0.2791
Epoch 5/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9270 - loss: 0.2453 - val_accuracy: 0.9809 - val_loss: 0.0899
Epoch 6/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9660 - loss: 0.1163 - val_accuracy: 0.9549 - val_loss: 0.1220
Epoch 7/10
[1m157/157[0m 

<keras.src.callbacks.history.History at 0x279b62b8b50>

In [38]:
# Persist the trained model; the .h5 suffix selects Keras' HDF5 file format.
model.save(filepath='./../savedModels/model.h5')

In [39]:
import pickle

In [41]:
# Serialize the fitted tokenizer next to the model so inference code can apply
# the same text-to-sequence mapping.
# NOTE(review): only unpickle this file from a trusted location; pickle.load
# can execute arbitrary code.
with open('./../savedModels/tokenizer.pickle', 'wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file, pickle.HIGHEST_PROTOCOL)

In [42]:
import io, json

In [44]:
# Also export the tokenizer as JSON.
# NOTE(review): to_json() is documented to return a JSON string already, so
# json.dumps() wraps it a second time and the file holds one JSON string
# literal. A reader must json.load() the file and pass the resulting string to
# tokenizer_from_json(). Kept as-is so existing loaders keep working.
tokenizer_json = tokenizer.to_json()
encoded_tokenizer = json.dumps(tokenizer_json, ensure_ascii=False)
with open('./../savedModels/tokenizer.json', 'w', encoding='utf-8') as json_file:
    json_file.write(encoded_tokenizer)