This is a seq to one problem<br>
Process:<br>
1:Load, clean the data and tokenize<br>
2:Encode the sentences (Create dictionary of words, map words to integers using Keras). Keras is only used for pre-processing, to build our model we used Tensorflow <br>
3:Word embedding<br>
4:Build RNN model (Create embddings and LSTM layers)<br>
5:Run and test

In [17]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os.path
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk
from keras.preprocessing.text import Tokenizer
from bs4 import BeautifulSoup
import re
import string
from keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')

Load and clean the messages as well as encoding the lables

In [18]:

def load_clean(filepath):
    '''Load & clean the data'''

    #Loading data
    data = pd.read_csv(filepath)
    #rows_number=data.shape[0]
    messages=[]
    for message in data['v2']:
        #Extra cleaning of text before Keras tokenization 
        #Removing stopwords
        nltk.download("stopwords")
        stop_words = set(stopwords.words('english'))
        message=' '.join(i for i in message.split() if i not in stop_words)
        #Here, BeauifulSoup is used to encode the not completely deccoded text(decoded from html code) to html code again
        message = BeautifulSoup(message, 'lxml')
        #Later we strip away tags in the html encodings and decode them to text
        message=message.get_text()
        messages.append(message)
    
    #Encoding labels
    labels=[]
    [labels.append(0) if label=="spam" else labels.append(1) for label in data['v1']]
    labels = np.asarray(labels)
    return messages,labels

Tokenize sentences and encode their words to integers, Keras is helpful here for normalization, tokenization as well as generating word indexes and padding the sentences (make sentences to have the same lengths)

In [19]:
def encode_words(sentences):
    '''Convert words to numbers (Create the dictionary of words)'''
    
    #Since we read from csv, we need to do some encoding
    #Remove u'
    sentences=[x.encode('utf-8') for x in sentences]
    #Remove \xHH characters
    sentences=[re.sub(r'[^\x00-\x7f]',r'', x) for x in sentences]
    
    #Keras tokenization (punctuation removal, normalization and split by white space)
    tokenize = Tokenizer()
    #Fit tokenizer to the whole data
    tokenize.fit_on_texts(sentences)
    data_seq=tokenize.texts_to_sequences(sentences)
    word_index = tokenize.word_index
    #Choose the maximum number of tokens in all sequences 
    num_tokens = [len(tokens) for tokens in data_seq]
    max_seq_length=np.max(num_tokens)
    #Make sequences to have the same lengths (add extra zeros to the beginnings of the sentences)
    data_seq = pad_sequences(data_seq, maxlen = max_seq_length,
                                padding='pre', truncating='pre')
    #print(data_seq)
    return data_seq,word_index

In [20]:
def create_model_inputs():
    '''Define model inputs'''
    
    #Resert the default graph 
    tf.reset_default_graph()
    #Model's placeholders for inputs
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    return inputs,targets,keep_prob

In [21]:
def build_embeddings(inputs,vocabulary_size,embedding_size):
    '''Intialize embeddings for the words. Embedding layer connects the words to the LSTM layers (words are embedded to the embedding_size vectors instead of vocabulary size vectors or one hot vectors which aren't efficient). Here, we used random_uniform distribution to initialize the words' embeddings and then they are trained by the model to have more meaningful embeddings'''
    
    #Embedding Layer
    embedding = tf.Variable(tf.random_uniform((vocabulary_size, embedding_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)
    
    return embed

Create the RNN model with 2 LSTM layers

In [22]:
def build_RNN(inputs,num_hidden,lstm_layer_numbers,keep_prob,batch_size):
    '''Build RNN cells'''

    #Define LSTM layers
    lstms=[]
    for i in range(lstm_layer_numbers):
        lstms.append(tf.contrib.rnn.BasicLSTMCell(num_hidden))
    #Add regularization dropout to the LSTM cells
    drops = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob) for lstm in lstms]
    #Stack up multiple LSTM layers
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(drops)
    #Getting the initial state
    initial_state = stacked_lstm.zero_state(batch_size, tf.float32)
        
    return initial_state, stacked_lstm

 

In [23]:
def get_batches(x, y, batch_size):
    '''Using generator to return batches for train, validation and test data'''

    n_batches = len(x)//batch_size
    '''In case that the batch_size is not a multiple of data size in order to create batches with the same sizes, this line will ignore the last words in text that cannot create a full batch (Although one can consider those last words and add enough words from the beginning of the text to create a full size batch)'''
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [24]:
#Input data
emaildata_file="./spam.csv"

In [None]:
#Loading and cleaning the data; return clean messages and labels
text_messages,labels=load_clean(emaildata_file)

In [26]:
print(labels)

[1 1 0 ..., 1 1 1]


In [27]:
#Words to numbers
data_sequences,word_index=encode_words(text_messages)

In [28]:
word_index

{'raining': 1592,
 'yellow': 4011,
 'four': 2311,
 'prices': 6549,
 'woods': 6739,
 "friend's": 2388,
 'hanging': 1973,
 'looking': 396,
 'electricity': 3752,
 'scold': 3754,
 'lord': 6828,
 'rp176781': 4605,
 'callin': 2480,
 'ew': 6749,
 'hearin': 8343,
 'screaming': 1608,
 'disturb': 1093,
 'prize': 107,
 'andre': 8476,
 'smsing': 7980,
 'wednesday': 1274,
 'oooh': 3205,
 'specially': 1072,
 'nigh': 7532,
 'tired': 809,
 'snuggles': 8160,
 "'wnevr": 6818,
 'second': 621,
 'attended': 7746,
 'txtno': 3131,
 'available': 616,
 'scraped': 8165,
 '2kbsubject': 4899,
 'scallies': 7419,
 'errors': 5266,
 'cooking': 2231,
 'fingers': 1223,
 'maraikara': 6845,
 'hero': 5162,
 "how've": 6751,
 'y87': 6931,
 'here': 233,
 'specialise': 5727,
 '47': 7730,
 'china': 2793,
 'dogwood': 7964,
 'dorm': 3261,
 '08718711108': 4829,
 'previews': 5968,
 '84122': 5275,
 'w111wx': 2211,
 'kids': 1035,
 '84128': 2631,
 'eastenders': 3427,
 '09058091870': 7880,
 "i'd": 854,
 "i'm": 6,
 'spotty': 6044,
 'go

In [29]:
print(data_sequences)

[[   0    0    0 ...,   20 4361   98]
 [   0    0    0 ...,  422    2 1885]
 [   0    0    0 ...,  618  343 2936]
 ..., 
 [   0    0    0 ...,   33  504 8817]
 [   0    0    0 ...,  993  151   12]
 [   0    0    0 ...,   88  436  219]]


In [30]:
#Split the data into train, test and validation sets
#First split train and test parts, then split train part to train and validation parts
X_train, X_test, y_train, y_test = train_test_split(data_sequences, labels, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

Define Parameters

In [31]:
#Vocabulary size plus one for 0, the int number that added for padding
n_input = len(word_index)+1
#number of units
num_hidden = 256
lstm_layer_numbers=2
embed_size=300
batch_size= 250
learning_rate=0.001

Build and execute the graph

In [32]:
inputs,targets,keep_prob=create_model_inputs()
embeds=build_embeddings(inputs,n_input,embed_size)
initial_state, stacked_lstm_cells = build_RNN(inputs,num_hidden,lstm_layer_numbers,keep_prob,batch_size)
outputs, final_state = tf.nn.dynamic_rnn(stacked_lstm_cells, embeds, initial_state=initial_state)
# Loss and optimizer
#second parameter: one output which indicates if the input message is spam or ham
predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid,
                                                weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                                biases_initializer=tf.zeros_initializer())

loss_function = tf.losses.mean_squared_error(targets, predictions)
optimizer = tf.train.AdadeltaOptimizer(learning_rate).minimize(loss_function)
correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))    

#Since we put both train and validation in the same session, we define initial_state for each separately
initial_state_train=initial_state
initial_state_val=initial_state
#Execute the default graph
sess = tf.Session()
saver = tf.train.Saver()
init_op = tf.initialize_all_variables()
sess.run(init_op)
no_of_batches_train = int(len(X_train)/batch_size)
no_of_batches_valid = int(len(X_val)/batch_size)
epochs = 50
for epoch in range(epochs):
    state = sess.run(initial_state_train)
    avg_cost_train = 0 
    avg_acc_train= 0
    for ii, (x, y) in enumerate(get_batches(X_train, y_train, batch_size), 1):
        _, cost, acc, state= sess.run([optimizer, loss_function,accuracy, final_state], feed_dict={inputs: x,
                                                        targets: y[:, None],keep_prob: 0.5,initial_state_train: state})
        
        avg_cost_train += cost / no_of_batches_train
        avg_acc_train += acc / no_of_batches_train
    state_val = sess.run(initial_state_val)
    avg_cost_val = 0  
    avg_acc_val = 0
    for ii, (x, y) in enumerate(get_batches(X_val, y_val, batch_size), 1):
        _, cost, acc, state_val= sess.run([optimizer, loss_function, accuracy, final_state], feed_dict={inputs: x,
                                                        targets: y[:, None],keep_prob: 1,initial_state_val: state_val})
        
        avg_cost_val += cost / no_of_batches_valid
        avg_acc_val += acc / no_of_batches_valid
    print("Epoch:", epoch+1, "cost_train=", avg_cost_train, "cost_val=", avg_cost_val)
    print("acc_train=", avg_acc_train, "acc_val=", avg_acc_val) 
#Save the model into a file 
checkpoint="./model/savedmodel.ckpt"
save_path = saver.save(sess, checkpoint)
sess.close()

('Epoch:', 1, 'cost_train=', 0.24554018889154705, 'cost_val=', 0.24357138574123383)
('acc_train=', 0.51861050086362026, 'acc_val=', 0.56733236710230517)
('Epoch:', 2, 'cost_train=', 0.24271440931728908, 'cost_val=', 0.24055684109528858)
('acc_train=', 0.53575725214821945, 'acc_val=', 0.59855898221333825)
('Epoch:', 3, 'cost_train=', 0.24021922264780318, 'cost_val=', 0.23756549259026843)
('acc_train=', 0.56210471051079891, 'acc_val=', 0.61807562907536828)
('Epoch:', 4, 'cost_train=', 0.2359522304364613, 'cost_val=', 0.23460867007573449)
('acc_train=', 0.58719751664570397, 'acc_val=', 0.64051973819732666)
('Epoch:', 5, 'cost_train=', 0.23432466813496178, 'cost_val=', 0.23166404167811078)
('acc_train=', 0.59472536614962979, 'acc_val=', 0.65808471043904626)
('Epoch:', 6, 'cost_train=', 0.23178069187062125, 'cost_val=', 0.22874727348486579)
('acc_train=', 0.61040836998394554, 'acc_val=', 0.67272218068440759)
('Epoch:', 7, 'cost_train=', 0.22912079521587916, 'cost_val=', 0.22583048542340595)

In [33]:
#Test the saved model
no_of_batches_test = int(len(X_test)/batch_size)
sess = tf.Session()
#Load the model
saver = tf.train.Saver()
saver.restore(sess, checkpoint)
state_test = sess.run(initial_state)
avg_cost_test = 0  
avg_acc_test = 0  
for ii, (x, y) in enumerate(get_batches(X_test, y_test, batch_size), 1):
    _, cost, acc, state_test = sess.run([optimizer, loss_function, accuracy, final_state], feed_dict={inputs: x,
                                                    targets: y[:, None],keep_prob: 1,initial_state: state_test})
    avg_cost_test += cost / no_of_batches_test
    avg_acc_test += acc / no_of_batches_test
print("Test loss",avg_cost_test) 
print("Test Accuracy",avg_acc_test)
sess.close()

INFO:tensorflow:Restoring parameters from ./model/savedmodel.ckpt
('Test loss', 0.12515557184815407)
('Test Accuracy', 0.84251686930656433)
