In [10]:
# importing the required libraries
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re  # Regular expression operations
from sklearn.preprocessing import LabelEncoder

In [11]:
# looading the spam dataset
data = pd.read_csv('spam.csv', encoding="latin-1")

# keeping only the neccessary columns
data = data[['v2','v1']]
data

Unnamed: 0,v2,v1
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam
5568,Will Ì_ b going to esplanade fr home?,ham
5569,"Pity, * was in mood for that. So...any other s...",ham
5570,The guy did some bitching but I acted like i'd...,ham


In [12]:
# preprocessing the data 
data['v2'] = data['v2'].apply(lambda x: x.lower()) #converting to lower case
data['v2'] = data['v2'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
data

Unnamed: 0,v2,v1
0,go until jurong point crazy available only in ...,ham
1,ok lar joking wif u oni,ham
2,free entry in 2 a wkly comp to win fa cup fina...,spam
3,u dun say so early hor u c already then say,ham
4,nah i dont think he goes to usf he lives aroun...,ham
...,...,...
5567,this is the 2nd time we have tried 2 contact u...,spam
5568,will _ b going to esplanade fr home,ham
5569,pity was in mood for that soany other suggest...,ham
5570,the guy did some bitching but i acted like id ...,ham


In [13]:
max_fatures = 2000          # maximum words is 2000
# tokenization - converting the given data into sequenece of words
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['v2'].values)
X = tokenizer.texts_to_sequences(data['v2'].values)
X = pad_sequences(X)        # padding - add zeros to match the sentence length
X

array([[   0,    0,    0, ...,   67,   58,  137],
       [   0,    0,    0, ...,  443,    6, 1823],
       [   0,    0,    0, ...,  459,   79,  382],
       ...,
       [   0,    0,    0, ...,   12,   19,  231],
       [   0,    0,    0, ...,  198,   12,   50],
       [   0,    0,    0, ...,    1,   41,  258]], dtype=int32)

In [14]:
# model configuration, embedding layer dimension, LSTM out value
embed_dim = 128     # dimension of the Embedded layer
lstm_out = 196      # LSTM ( Long short-term memory ) layer neurons

In [15]:
# function to create the model
def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2,activation='softmax'))
    # compiling the model
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model

In [16]:
# applying label encoding on target column
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['v1'])    #fitting the model
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [17]:
# creating and evalutaing the model
batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2)
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)

117/117 - 91s - loss: 0.1679 - accuracy: 0.9429
58/58 - 6s - loss: 0.0822 - accuracy: 0.9744


In [19]:
# score and accuracy Values
print(score)
print(acc)
print(model.metrics_names)

0.08219663053750992
0.9744426608085632
['loss', 'accuracy']
