<a href="https://colab.research.google.com/github/sl-93/detecting-spam-in-emails/blob/main/spam-detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the CSV files stored under
# MyDrive/spam can be read later in the notebook.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# install nltk
# NLTK: Natural Language tool kit
!pip install nltk

In [None]:
# install gensim
# Gensim is an open-source library for unsupervised topic modeling and natural language processing
# Gensim is implemented in Python and Cython.
!pip install gensim

In [None]:
# import key libraries
import pandas as pd
import numpy as np
import nltk
import gensim
import string
import os
string.punctuation
from sklearn.preprocessing import LabelEncoder
# Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional, Dropout
from tensorflow.keras.utils import to_categorical

In [None]:
# Display the punctuation characters that remove_punc() strips from each message.
import string
string.punctuation

In [None]:
# Download the NLTK stop-word corpus so that stopwords.words('english')
# can be loaded further down in the notebook.
nltk.download("stopwords")

In [None]:
# Load the data
# Folder on the mounted Drive that holds train.csv and test.csv.
DATA_DIR = '/content/drive/MyDrive/spam'

# Keep only the first two columns of the training file (the later cells read
# 'Text' and 'Class' from it) and drop rows with missing values.
# dropna() is chained instead of used in place, and .copy() gives each frame
# its own data: the original in-place dropna on an iloc slice, followed by the
# column assignments made later, triggers pandas' SettingWithCopyWarning.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv')).iloc[:, 0:2].dropna().copy()
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv')).dropna().copy()

# Raw class labels, aligned row-for-row with train_df.
y_train = train_df['Class']

In [None]:
# Strip punctuation from a single message
def remove_punc(message):
    """Return ``message`` with every character found in ``string.punctuation`` removed.

    All remaining characters (letters, digits, whitespace, ...) are kept in
    their original order and joined back into one string.
    """
    kept_chars = (ch for ch in message if ch not in string.punctuation)
    return ''.join(kept_chars)

# Add a punctuation-free copy of the 'Text' column to both the train and test frames
for frame in (train_df, test_df):
    frame['Text Without Punctuation'] = frame['Text'].apply(remove_punc)


# Build the stop-word list: NLTK's English stop words plus the extra token 'https'
# (presumably to drop URL scheme fragments left in the messages).
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['https']

# Tokenize a message and drop stop words
def preprocess(text):
    """Tokenize ``text`` and return the tokens that are not stop words.

    Tokenization is delegated to ``gensim.utils.simple_preprocess``; any
    filtering of very short tokens comes from that helper's own defaults.
    Tokens found in the module-level ``stop_words`` list are discarded and the
    rest are returned as a list in their original order.
    """
    return [word for word in gensim.utils.simple_preprocess(text) if word not in stop_words]

# For each frame: tokenize the punctuation-free text into a list of words,
# then re-join the surviving tokens into one space-separated string.
for frame in (train_df, test_df):
    frame['Text Without Punc & Stopwords'] = frame['Text Without Punctuation'].apply(preprocess)
    frame['Text Without Punc & Stopwords Joined'] = frame['Text Without Punc & Stopwords'].apply(" ".join)


# Kept so NLTK's tokenizer data is still available to any other cell that
# relies on it; the length computation below no longer needs it.
nltk.download('punkt')

# Number of tokens in every training document.
# The lengths are taken from the token lists themselves, because those lists
# are what gets converted to sequences and padded further down. Re-tokenizing
# the joined strings with nltk.word_tokenize (as before) walks the corpus
# twice and can split some words differently, giving a maxlen that does not
# match the sequences.
tweets_length = [len(tokens) for tokens in train_df['Text Without Punc & Stopwords']]

# Longest training document; used as the padding length when sequences are built.
# Falls back to 0 when the training set is empty.
maxlen = max(tweets_length, default=0)

# Every token occurrence in the training set, flattened into one list
list_of_words = [word for tokens in train_df['Text Without Punc & Stopwords'] for word in tokens]

# Vocabulary size passed to Tokenizer(num_words=...) and Embedding(input_dim=...).
# It is the number of unique words PLUS ONE: the Keras Tokenizer numbers words
# from 1 (index 0 is reserved for padding) and keeps only indices strictly
# below num_words, so without the +1 the least frequent word would be silently
# dropped from every sequence.
total_words = len(set(list_of_words)) + 1

# Model inputs: the token lists (not the joined strings) for train and test
X_train = train_df['Text Without Punc & Stopwords']
X_test = test_df['Text Without Punc & Stopwords']

# Fit a word-index tokenizer on the training token lists only, so the test set
# is encoded with the vocabulary learned from training data.
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(X_train)

# Convert the train and test token lists into sequences of integer word indices
train_sequences, test_sequences = [
    tokenizer.texts_to_sequences(docs) for docs in (X_train, X_test)
]

# Pad (or cut) every sequence at its end so all rows have exactly maxlen entries
padded_train, padded_test = [
    pad_sequences(seqs, maxlen = maxlen, padding = 'post', truncating = 'post')
    for seqs in (train_sequences, test_sequences)
]

In [None]:
# Encode the class labels as integers first, then one-hot encode those integers.
# Doing it in this order (the original one-hot encoded the raw labels) means:
#   * string labels work, since to_categorical needs integer class indices;
#   * column k of y_train_cat always corresponds to clas[k], which is what the
#     prediction step relies on when it looks up clas[argmax].
# The encoder is fitted on train_df['Class'] rather than on y_train so that
# re-running this cell does not re-fit on already-encoded integers and lose the
# original class names.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['Class'])
clas = label_encoder.classes_

# 2D one-hot representation with 2 columns, matching the model's 2-unit output layer
y_train_cat = to_categorical(y_train, 2)

In [None]:
# Sequential Model
model = Sequential()

# Embedding layer: maps each word index (below total_words) to a 512-dimensional vector
model.add(Embedding(total_words, output_dim = 512))

# Bi-directional LSTM with 64 units per direction
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))

# Dense layers
model.add(Dense(16, activation = 'relu'))
model.add(Dropout(0.5))

# Two-unit softmax output: one probability per class
model.add(Dense(2, activation = 'softmax'))

# categorical_crossentropy is the loss that matches one-hot targets and a
# softmax output (the original used binary_crossentropy, which is meant for
# independent sigmoid outputs).
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['acc'])
model.summary()

# Optional (left disabled): save the model to disk under the name "my_model".
#model.save("my_model")

# Train the model on the padded training sequences and their one-hot labels:
# 2 epochs, batches of 64, with 20% of the rows held out for validation.
model.fit(padded_train, y_train_cat, batch_size = 64, validation_split = 0.2, epochs = 2)

# Optional (left disabled): reload a model previously saved as above.
#reconstructed_model = tf.keras.models.load_model("my_model")

# Predict the per-class scores for every test message
pred = model.predict(padded_test)

# Map each row's highest-scoring class index back to its original label
prediction = [clas[np.argmax(scores)] for scores in pred]
print(prediction)