In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [3]:
# Mount Data from Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Get the Encoding format of the file so that it can be read correctly
import chardet

file = "/content/drive/MyDrive/Datasets/spam.csv"

with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'confidence': 0.7270322499829184, 'encoding': 'Windows-1252', 'language': ''}

In [9]:
# Read the data
raw_data = pd.read_csv('/content/drive/MyDrive/Datasets/spam.csv', encoding='Windows-1252')

raw_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [10]:
data = raw_data[['v1', 'v2']]

data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Drop NaN values
data = data.dropna()

In [12]:
# Encode categorical values of v1
data = pd.get_dummies(data, prefix='is', columns=['v1'])

In [13]:
data.head()

Unnamed: 0,v2,is_ham,is_spam
0,"Go until jurong point, crazy.. Available only ...",1,0
1,Ok lar... Joking wif u oni...,1,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,U dun say so early hor... U c already then say...,1,0
4,"Nah I don't think he goes to usf, he lives aro...",1,0


In [14]:
# is_ham and is_spam will just be complimentry
# So we can drop any 1 of them
data.drop('is_ham', axis=1, inplace=True)

In [15]:
# Rename the v2 column
data.rename({'v2': 'text'}, axis='columns', inplace=True)

In [16]:
# Reset the index. Because we have dropped null values
data = data.reset_index(drop=True)

In [17]:
data.head()

Unnamed: 0,text,is_spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [18]:
# Separate the dependent and Indepenedent Columns
X = data['text']
y = data['is_spam']

In [19]:
print(X.shape, y.shape)

(5572,) (5572,)


In [23]:
# Text Preprocessing libraries
import nltk

# Download stopwords 
nltk.download('stopwords')

import re
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [24]:
# Get the copy of text  messages in a list
texts = X.copy()

In [25]:
# Perform Stemming on all the texts
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Store the stemmed sentences in corp list
corp = []

# Iterate through each text
for i in range(len(texts)):
    text = re.sub('[^a-zA-Z]', ' ', texts[i])
    text = text.lower().split()
    
    text = [stemmer.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    
    # Append the sentence to corp
    corp.append(text)

In [26]:
# Print the corp 
corp[:3]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli']

In [27]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [28]:
# We need to one-hot encode the text before feeding it to the LSTM

# Define the vocab size first
vocab_size = 5000


# This is map each word to a unique index
one_hot_input = [one_hot(words, vocab_size) for words in corp]

In [29]:
# Example of one_hot representation
print(one_hot_input[0])

[3972, 666, 569, 3753, 709, 4164, 1974, 4141, 2707, 1091, 551, 3674, 3279, 790, 1080, 346]


In [30]:
set_len = 20
embedded_docs = pad_sequences(one_hot_input, padding='pre', maxlen=set_len)

In [31]:
embedded_docs[0]

array([   0,    0,    0,    0, 3972,  666,  569, 3753,  709, 4164, 1974,
       4141, 2707, 1091,  551, 3674, 3279,  790, 1080,  346], dtype=int32)

In [32]:
# Creating the model
embedding_vector_features = 50

model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=set_len))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 50)            250000    
                                                                 
 lstm (LSTM)                 (None, 100)               60400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 310,501
Trainable params: 310,501
Non-trainable params: 0
_________________________________________________________________
None


In [33]:
len(embedded_docs), y.shape

(5572, (5572,))

In [34]:
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [35]:
X_final.shape, y_final.shape

((5572, 20), (5572,))

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.3, random_state=0)

In [40]:
# Train the model on X_train
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8f21b52f50>

In [44]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(vocab_size,embedding_vector_features,input_length=set_len))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [50]:
y_preds = model.predict(X_test)

In [51]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [53]:
accuracy_score(y_test, y_preds)

0.8576555023923444