# SPAM Classifier using Vanilla RNN


## Get Datasets

In [6]:
import numpy as np
import pandas as pd

# Load the raw SMS dataset, decoding the file as latin-1.
raw = pd.read_csv("./spam.csv", encoding='latin-1')

# Discard the three unnamed filler columns, then give the two remaining
# columns descriptive names.
data = raw.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
data.columns = ["category", "text"]

# Preview the first rows.
data.head()

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocessing the data

In [11]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer

In [24]:
# Convert the two DataFrame columns (pandas Series) into NumPy arrays.
text, category = (data[column].to_numpy() for column in ('text', 'category'))

# Element-wise comparison: True where the label equals 'spam', False otherwise.
spam = (category == 'spam')

# Show the first message together with its boolean label.
print("text example: ", text[0])
print("category example: ", spam[0])

text example:  Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
category example:  False


In [26]:
# Preprocessing hyper-parameters.
max_len = 500      # common sequence length; used later when padding
max_vocab = 10000  # vocabulary size limit handed to the tokenizer (num_words)

# Fit the tokenizer on every message, then encode each message with it.
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(texts=text)
sequences = tokenizer.texts_to_sequences(texts=text)

In [37]:
# The tokenizer output is a plain Python list, whereas the network expects an
# array, so a conversion is still needed.
for label, value in (("type : ", type(sequences)), ("length : ", len(sequences))):
    print(label, value)

type :  <class 'list'>
length :  5572


In [42]:
from keras.preprocessing.sequence import pad_sequences

# Bring every encoded message to a common length of max_len.
# NOTE(review): this rebinds `data`, which until now held the DataFrame; the
# earlier cell that reads data['text'] cannot be re-run after this one
# without reloading the CSV first.
data = pad_sequences(sequences=sequences, maxlen=max_len)

# Vocabulary mapping built by the tokenizer during fitting.
word_index = tokenizer.word_index

In [51]:
# word_index: dict with 8920 keys
# data: array of shape (5572, 500)

In [52]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. Stratifying on the label keeps the
# spam/ham proportion the same in both parts, and the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    spam,
    test_size=0.2,
    stratify=spam,
    random_state=42,
)

## Network

In [55]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, SimpleRNN

# Width of each embedding vector; the same value is reused below as the
# number of units in the recurrent layer.
embedding_mat_columns = 32

# Architecture: Embedding -> SimpleRNN -> single sigmoid output unit.
model = Sequential([
    Embedding(
        input_dim=max_vocab,
        output_dim=embedding_mat_columns,
        input_length=max_len,
    ),
    SimpleRNN(units=embedding_mat_columns),
    Dense(1, activation='sigmoid'),
])

# Binary classification setup: cross-entropy loss, accuracy reported as 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 322,113
Trainable params: 322,113
Non-trainable params: 0
_________________________________________________________________


In [57]:
# Train for 10 epochs in mini-batches of 60 samples; 20% of the training data
# is set aside for validation. Left as a bare expression so the returned
# History object is displayed as the cell output.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=60,
    epochs=10,
    validation_split=0.2,
)


Train on 3565 samples, validate on 892 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x156241450>

In [59]:
# Despite its name, `acc` holds both values returned by evaluate():
# the loss first, then the 'acc' metric the model was compiled with.
acc = model.evaluate(X_test, y_test)
test_loss, test_accuracy = acc
print("Test loss is {0:.2f} accuracy is {1:.2f} ".format(test_loss, test_accuracy))

Test loss is 0.06 accuracy is 0.99 
