In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Read the data (the path is relative to the notebook's working directory)
raw_data = pd.read_csv('../Datasets/spam.csv')

# Preview the first rows of the raw frame
raw_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Keep only the label column (v1) and the message column (v2)
data = raw_data.loc[:, ['v1', 'v2']]

data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Discard every row that has a missing value in any column
data = data.dropna(axis='index', how='any')

In [5]:
# Encode the categorical values of v1 as indicator columns is_ham / is_spam.
# dtype=int pins the indicators to 0/1 integers: newer pandas releases default
# to boolean dummies, which would not match the 0/1 labels used further down.
data = pd.get_dummies(data, prefix='is', columns=['v1'], dtype=int)

In [6]:
# Preview the frame after encoding v1 into indicator columns
data.head()

Unnamed: 0,v2,is_ham,is_spam
0,"Go until jurong point, crazy.. Available only ...",1,0
1,Ok lar... Joking wif u oni...,1,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,U dun say so early hor... U c already then say...,1,0
4,"Nah I don't think he goes to usf, he lives aro...",1,0


In [7]:
# is_ham and is_spam are complementary,
# so we can drop either one of them.
# Reassign instead of using inplace=True, and tolerate an already-missing
# column, so that re-running this cell does not raise a KeyError.
data = data.drop(columns='is_ham', errors='ignore')

In [8]:
# Rename the v2 column to 'text'.
# Reassign rather than mutate in place so the cell produces a new frame
# from its input instead of silently changing an existing object.
data = data.rename(columns={'v2': 'text'})

In [9]:
# Renumber the rows 0..n-1 and discard the old index (drop=True),
# because dropping null rows earlier may have left gaps in it
data = data.reset_index(drop=True)

In [10]:
# Preview the cleaned frame: message text plus the is_spam label
data.head()

Unnamed: 0,text,is_spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
# Split the frame into the independent column (message text)
# and the dependent column (spam label)
X, y = data['text'], data['is_spam']

In [12]:
# Sanity check: features and labels should have the same number of rows
print(X.shape, y.shape)

(5572,) (5572,)


In [13]:
# Text Preprocessing libraries
import nltk
import re
from nltk.corpus import stopwords

In [14]:
# Work on a copy of the text messages (a pandas Series, not a list)
# so that X itself is left untouched
texts = X.copy()

In [15]:
# Perform stemming on all the texts
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Build the stop-word lookup once. The original evaluated
# stopwords.words('english') for every single word and tested membership
# against a list; a set built up front gives the same result far faster.
stop_words = set(stopwords.words('english'))

# Store the cleaned, stemmed sentences in the corp list
corp = []

# Iterate over the message values directly, so the loop does not depend on
# the Series having a gap-free 0..n-1 index.
for message in texts:
    # Replace everything except ASCII letters with a space, lowercase, tokenize
    words = re.sub('[^a-zA-Z]', ' ', message).lower().split()

    # Drop stop words and stem the remaining tokens
    stemmed = [stemmer.stem(word) for word in words if word not in stop_words]

    # Append the re-joined sentence to corp
    corp.append(' '.join(stemmed))

In [16]:
# Show the first three cleaned, stemmed messages
corp[:3]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli']

In [17]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [18]:
# Convert each cleaned message into a list of integer word indices
# before feeding it to the Embedding/LSTM layers.
# NOTE(review): despite its name, Keras `one_hot` yields a list of integers
# per message (see the printed example), not one-hot vectors. Per the Keras
# docs it is hash-based, so distinct words may share an index and the mapping
# may differ between runs -- confirm this is acceptable.

# Define the vocab size first
vocab_size = 5000


# Map each word of every message to an integer index bounded by vocab_size
one_hot_input = [one_hot(words, vocab_size) for words in corp]

In [19]:
# Example: the integer-encoded representation of the first message
print(one_hot_input[0])

[587, 780, 3260, 1724, 3096, 4525, 3739, 1919, 2094, 2518, 4967, 1449, 4888, 3713, 3631, 3977]


In [22]:
# Fixed sequence length; also passed to the Embedding layer as input_length
set_len = 20
# Bring every encoded message to length set_len; padding='pre' places the
# filler zeros at the front of shorter sequences
embedded_docs = pad_sequences(one_hot_input, padding='pre', maxlen=set_len)

In [24]:
# Inspect the padded encoding of the first message
embedded_docs[0]

array([   0,    0,    0,    0,  587,  780, 3260, 1724, 3096, 4525, 3739,
       1919, 2094, 2518, 4967, 1449, 4888, 3713, 3631, 3977])

In [28]:
# Create the model: Embedding -> LSTM -> single sigmoid unit for the
# binary spam / not-spam decision.
# NOTE(review): the NotImplementedError recorded under this cell
# ("Cannot convert a symbolic Tensor ... to a numpy array") is presumably a
# NumPy/TensorFlow version mismatch (commonly reported with NumPy >= 1.20 on
# TensorFlow < 2.5) rather than a fault in this code -- confirm the installed
# versions before changing the model.
embedding_vector_features = 50  # length of the vector learned for each word index

model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=set_len))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

NotImplementedError: Cannot convert a symbolic Tensor (lstm/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported