In [None]:
import numpy as np
import pandas as pd
from keras.layers import LSTM, Dense, Dropout, Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import RMSprop

In [None]:
# Load the SMS spam dataset. The file is not UTF-8, hence the explicit
# latin-1 encoding. Column v1 holds the ham/spam label, v2 the message text.
data = pd.read_csv(
    "/content/spam.csv",
    encoding="latin-1",
)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Show the full text of the first message (head() truncates long strings).
data["v2"].iloc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Class balance of the label column: the data is imbalanced (far more ham than spam).
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [None]:
# Features are the raw message texts; targets are the ham/spam labels.
x = data["v2"]

# Encode the string labels as integers (ham -> 0, spam -> 1) and reshape to a
# column vector. The 2-D shape is optional; a flat array would work as well.
le = LabelEncoder()
y = le.fit_transform(data["v1"]).reshape(-1, 1)
y

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [None]:
# Keep 85% of the messages for training and hold out the rest for testing.
# The fixed random_state makes the shuffle reproducible.
# NOTE(review): the classes are imbalanced; consider stratify=y so both
# splits keep the same ham/spam ratio.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.85,
    random_state=7,
)

In [None]:
# Inspect the first training sample *by position*.
# xtrain is a pandas Series that keeps its original row labels after the
# shuffled split, so xtrain[0] would fetch the row labelled 0 (or raise
# KeyError if that row landed in the test set) rather than the first training
# message. .iloc[0] lines up with sequences[0] / sequences_mat[0].
xtrain.iloc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
# Fit the vocabulary on the training texts only, so nothing from the test
# set leaks into the word index.
tok = Tokenizer(num_words=1000)
tok.fit_on_texts(xtrain)

# Convert every training message to a list of word indices ...
sequences = tok.texts_to_sequences(xtrain)

# ... and left-pad each list with zeros to a fixed length of 150 so the
# messages can be stacked into one integer matrix.
sequences_mat = pad_sequences(
    sequences=sequences,
    maxlen=150,
    padding="pre",
)
sequences_mat

array([[  0,   0,   0, ...,   0, 155, 447],
       [  0,   0,   0, ...,   2,  28, 113],
       [  0,   0,   0, ...,  38,  44,   4],
       ...,
       [  0,   0,   0, ...,  52,   2,  28],
       [  0,   0,   0, ..., 193, 249,  97],
       [  0,   0,   0, ...,   2,  31, 531]], dtype=int32)

In [None]:
# Word indices of the first training message, before padding.
sequences[0]

[155, 447]

In [None]:
# The same message after padding: zeros on the left, word indices at the end.
sequences_mat[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0, 155, 447], dtype=int32)

In [None]:
# (number of training messages, padded sequence length)
sequences_mat.shape

(4736, 150)

In [None]:
# Peek at the fitted vocabulary instead of dumping the whole word -> index
# mapping, which would flood the cell output with thousands of entries.
print(f"vocabulary size: {len(tok.word_index)}")
# Show the 20 entries with the lowest indices.
sorted(tok.word_index.items(), key=lambda item: item[1])[:20]

In [None]:
# Binary spam classifier: embedding -> LSTM -> dense hidden layer -> sigmoid.
# `Sequential` is imported in the notebook's import cell at the top.
model = Sequential(
    [
        # 1000 matches the tokenizer's num_words and 150 the padded length;
        # each word index is mapped to a 10-dimensional vector.
        Embedding(1000, 10, input_length=150),
        LSTM(64),
        Dense(256, activation="relu"),  # hidden layer
        Dropout(0.5),
        # Single sigmoid unit: output close to 1 means spam, close to 0 means ham.
        Dense(1, activation="sigmoid"),
    ]
)
model.compile(
    loss="binary_crossentropy",
    optimizer=RMSprop(),
    metrics=["accuracy"],
)

In [None]:
# Train on the padded training sequences for 15 epochs in mini-batches of 64.
model.fit(
    x=sequences_mat,
    y=ytrain,
    batch_size=64,
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7dde499a78e0>

In [None]:
# Encode the held-out texts with the tokenizer fitted on the training set,
# then pad them exactly like the training data (length 150, zeros on the left).
text_sequences = tok.texts_to_sequences(xtest)
test_sequences_mat = pad_sequences(
    sequences=text_sequences,
    maxlen=150,
    padding="pre",
)

In [None]:
# Loss and accuracy on the held-out set, returned as [loss, accuracy].
model.evaluate(x=test_sequences_mat, y=ytest)



[0.063572458922863, 0.9868420958518982]

In [None]:
# The sigmoid outputs are probabilities; round them to hard 0/1 labels.
ypred = model.predict(test_sequences_mat).round()
ypred

In [None]:
# Score the hard predictions against the true test labels.
# `accuracy_score` and `confusion_matrix` are imported in the notebook's
# import cell at the top.
ac = accuracy_score(y_true=ytest, y_pred=ypred)
# Confusion matrix: rows are the true class, columns the predicted class
# (0 = ham, 1 = spam).
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
print(ac)
print(cm)

0.9868421052631579
[[719   4]
 [  7 106]]


In [None]:
### Test a new sample
# Unseen messages must go through the same pipeline as the training data:
# the fitted tokenizer, then left-padding to 150 tokens.
txt = [
    "You got a 50% discount and free of cost hampers in every purchase",
    "Congratulations, you are selected for the applied job",
]
txt = pad_sequences(tok.texts_to_sequences(txt), maxlen=150, padding="pre")
# Rounded sigmoid output: 1 = spam, 0 = ham.
model.predict(txt).round()



array([[1.],
       [0.]], dtype=float32)