# SMS Filter RNN LSTM
### data: https://www.kaggle.com/uciml/sms-spam-collection-dataset


In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as p
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf 
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Activation, Input
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

%matplotlib inline

In [56]:
# Import from tensorflow.keras like every other Keras import in this notebook;
# mixing the standalone `keras` package with `tensorflow.keras` objects is a
# common source of version/backend incompatibilities.
from tensorflow.keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [109]:
# Read the SMS dataset from the working directory, decoding it as latin-1.
data = pd.read_csv(
    'spam.csv',
    sep=',',
    encoding='latin-1',
)

In [110]:
# Inspect the raw column names before any cleaning.
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [111]:
# Drop the three unnamed columns that come along with the label/text pair.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell idempotent: re-running it once the columns are already gone is a
# no-op instead of a KeyError.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [112]:
# Confirm which columns are left after the drop.
data.columns

Index(['v1', 'v2'], dtype='object')

In [113]:
#data.rename(columns={'v1':'Target','v2':'Input'},inplace=True)

In [114]:
# Preview the first rows: v1 holds the ham/spam label, v2 the message text.
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [115]:
# Class balance of the label column (the recorded output shows far more ham
# than spam, which is why the split below is stratified).
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [116]:
# Features are the raw message texts; targets are the ham/spam labels.
X = data['v2']

# Encode the string labels as integers and shape them as a single column,
# i.e. (n_samples, 1).
le = LabelEncoder()
Y = le.fit_transform(data['v1']).reshape(-1, 1)

In [117]:
# Stratified 85/15 split so both partitions keep the same ham/spam ratio.
# A fixed random_state makes the split (and every example displayed further
# down) reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, stratify=Y, test_size=.15, random_state=42
)

In [154]:
# Re-index the text splits from 0 so positional lookups such as X_train[0]
# work. Rebinding (rather than inplace=True) avoids mutating the objects
# returned by train_test_split. Y_train / Y_test are NumPy arrays (output of
# LabelEncoder + reshape), so they carry no index to reset.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)


In [155]:
# Number of training messages (1-D: one text per row).
X_train.shape

(4736,)

In [156]:
# Labels are a column vector: one encoded label per training message.
Y_train.shape

(4736, 1)

### Tokenizer:
* "fit_on_texts" updates the internal vocabulary based on a list of texts. It builds the vocabulary index from word frequency: given something like "The cat sat on the mat.", it creates a word -> index dictionary such as word_index["the"] = 1, word_index["cat"] = 2, so every word gets a unique integer value. 0 is reserved for padding. A lower integer means a more frequent word (the first few are often stop words because they appear a lot).

* "texts_to_sequences" Transforms each text in texts to a sequence of integers. So it basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary. Nothing more, nothing less, certainly no magic involved.


In [281]:
max_words = 1500  # vocabulary cap; the Embedding layer uses the same value as input_dim
max_len = 150     # every message is padded/truncated to this many token ids

# Fit the vocabulary on the training texts only. num_words is taken from
# max_words (instead of a second hard-coded 1500) so the tokenizer and the
# Embedding layer cannot silently drift apart.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Training texts -> integer sequences -> fixed-width matrix (zeros appended).
sequences = tok.texts_to_sequences(X_train)
sequence_matrix = pad_sequences(sequences, maxlen=max_len, padding="post")

# Test texts go through the same fitted tokenizer and padding.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = pad_sequences(test_sequences, maxlen=max_len, padding="post")

In [282]:
## Experiment: how does a smaller num_words change the encoded sequences?
### STARTS
# NOTE: `tok` and `sequences` are deliberately rebound here because the
# inspection cells that follow display them. The padded matrix gets its own
# name so the experiment no longer overwrites `sequence_matrix`, the
# max_words-based training input that the model is fitted on (the test matrix
# was encoded with max_words, so training on a 266-word encoding would
# mismatch it).
tok = Tokenizer(num_words=266)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
experiment_sequence_matrix = pad_sequences(sequences, maxlen=max_len, padding="post")

In [286]:
# First training message, for comparison with its integer encoding.
X_train[0]

"S'fine. Anytime. All the best with it."

In [284]:
# Integer encoding of that message under the current tokenizer; in the
# recorded output every index is below the tokenizer's num_words.
sequences[0]

[49, 5, 265, 32, 14]

In [285]:
# Peek at the first few vocabulary entries instead of dumping the whole
# word -> index mapping (thousands of items) into the cell output.
list(tok.word_index.items())[:10]



In [None]:
### ENDS

In [231]:
# Number of distinct words in the fitted vocabulary (the recorded value is
# larger than num_words, i.e. word_index itself is not truncated).
len(tok.word_index)

8205

In [232]:
# Sanity check: one row per training message, max_len columns.
sequence_matrix.shape

(4736, 150)

#### How does keras Embedding work: 
* https://www.youtube.com/watch?v=Fuw0wv3X-0o&list=PLeo1K3hjS3uu7CxAacxVndI4bE_o3BDtO&index=40&ab_channel=codebasics
    

**Embedding Layer**
* input_dim: Integer. Size of the vocabulary, i.e. maximum integer index + 1.
* output_dim: Integer. Dimension of the dense embedding.
* mask_zero: Boolean, whether or not the input value 0 is a special "padding" value that should be masked out. This is useful when using recurrent layers which may take variable length input. If this is True, then all subsequent layers in the model need to support masking or an exception will be raised. If mask_zero is set to True, as a consequence, index 0 cannot be used in the vocabulary (input_dim should equal size of vocabulary + 1).
* input_length: Length of input sequences, when it is constant. This argument is required if you are going to connect Flatten then Dense layers upstream (without it, the shape of the dense outputs cannot be computed).

**LSTM Layer**
* units: Positive integer, dimensionality of the output space.
* activation: Activation function to use. Default: hyperbolic tangent (tanh). If you pass None, no activation is applied (ie. "linear" activation: a(x) = x).
* recurrent_activation: Activation function to use for the recurrent step. Default: sigmoid (sigmoid). If you pass None, no activation is applied (ie. "linear" activation: a(x) = x).

In [258]:
# Input: one padded sequence of max_len token ids per message.
inputs = Input(shape=(max_len,), name='in_layer')

# Learn a 50-dimensional vector per token id; id 0 is padding and is masked.
embedd = Embedding(
    input_dim=max_words,
    output_dim=50,
    input_length=max_len,
    mask_zero=True,
)(inputs)

# Single 64-unit LSTM with tanh / sigmoid activations spelled out explicitly.
lstm = LSTM(
    64,
    name='LSTM_1',
    activation='tanh',
    recurrent_activation='sigmoid',
)(embedd)

# Fully connected head followed by 50% dropout.
dense = Dense(256, name='FC_1', activation='relu')(lstm)
dropouts = tf.keras.layers.Dropout(.5)(dense)

# One sigmoid unit: a single score in (0, 1) per message.
outputs = Dense(1, name='out_layer', activation='sigmoid')(dropouts)


In [239]:
# Tie the input and output tensors together into a functional-API model.
mymodel = Model(
    inputs=inputs,
    outputs=outputs,
    name='SMS_LSTM',
)

In [240]:
# Print layer-by-layer output shapes and parameter counts.
mymodel.summary()

Model: "SMS_LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
in_layer (InputLayer)        [(None, 150)]             0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 150, 50)           75000     
_________________________________________________________________
LSTM_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC_1 (Dense)                 (None, 256)               16640     
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
Total params: 121,337
Trainable params: 121,337
Non-trainable params: 0
____________________________________________________

In [241]:
# Binary cross-entropy pairs with the single sigmoid output; AUC is the
# metric reported during training and evaluation.
mymodel.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['AUC'],
)

In [255]:
## Validation on separate test data
# NOTE(review): early stopping here is driven by the held-out test split, so
# a later evaluation on that same split is not fully independent of training.
mymodel.fit(
    sequence_matrix,
    Y_train,
    batch_size=64,
    epochs=4,
    # Keras documents validation_data as a tuple (x_val, y_val); a list may be
    # interpreted differently depending on the Keras/TensorFlow version.
    validation_data=(test_sequences_matrix, Y_test),
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss')],
)

Train on 4736 samples, validate on 836 samples
Epoch 1/4
Epoch 2/4


<tensorflow.python.keras.callbacks.History at 0x7fcd3e332350>

In [254]:
## Validation on some part of the training data
# Hold out 15% of the training rows for validation; the EarlyStopping callback
# (default settings) watches val_loss.
mymodel.fit(
    x=sequence_matrix,
    y=Y_train,
    batch_size=64,
    epochs=4,
    validation_split=.15,
    callbacks=[EarlyStopping(monitor='val_loss')],
)
           

Train on 4025 samples, validate on 711 samples
Epoch 1/4
Epoch 2/4


<tensorflow.python.keras.callbacks.History at 0x7fcd3e7bfe90>

In [243]:
# First few test messages, to eyeball against their labels below.
X_test.head()

0    1000's flirting NOW! Txt GIRL or BLOKE & ur NA...
1    Well. Im computerless. Time to make some oreo ...
2    URGENT! We are trying to contact U. Todays dra...
3    Oh unintentionally not bad timing. Great. Fing...
4    Bored of speed dating? Try SPEEDCHAT, txt SPEE...
Name: v2, dtype: object

In [244]:
# Encoded labels for those same first five test messages.
Y_test[0:5]

array([[1],
       [0],
       [1],
       [0],
       [1]])

In [245]:
# Take test message #10 and give it a leading batch axis: (1, max_len).
input_test = test_sequences_matrix[10].reshape(-1, max_len)

In [246]:
# True encoded label of the same test message (row 10, single label column).
actual_test = Y_test[10, 0]

In [247]:
# Show the padded token ids that will be fed to the model.
input_test

array([[  28,    3,  605,  107,   10,    5, 1024,  915,  207, 1006, 1405,
         183,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [248]:
# Model score for the single message, printed in fixed-point notation rather
# than scientific notation.
print('{:f}'.format(mymodel.predict(input_test)[0][0]))

0.003681


In [249]:
# Ground-truth label to compare with the predicted score above.
actual_test

0

In [1]:
#test_auc[1]

In [251]:
# Score the model on the full test split. Despite the name, the result holds
# the loss first and then the compiled metric (AUC), i.e. AUC is test_auc[1].
test_auc = mymodel.evaluate(x=test_sequences_matrix, y=Y_test)


