In [None]:
import time

import os
import re

import pandas as pd
import numpy as np

In [None]:
# Read the raw spam/ham e-mail dataset from the local sample_data folder
data = pd.read_csv(filepath_or_buffer="./sample_data/spam_ham_dataset.csv")

In [None]:
# Preview the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
# Drop the 'Unnamed: 0' column (looks like a stale saved row index — confirm).
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [None]:
# Fetch the NLTK resources used below: the stop-word corpus and the 'punkt'
# tokenizer models.
nltk.download(info_or_id='stopwords')
nltk.download(info_or_id='punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def preprocess_text(data):
  """Clean raw texts: keep letters only, tokenize, and drop stop words.

  Args:
    data: Iterable of strings (e.g. a pandas Series of e-mail bodies).

  Returns:
    A list of strings, one per input text, in the same order. Each string is
    the surviving tokens joined with a single space.

  Notes:
    * `stopwords.words()` is called without a language, exactly as before;
      per the NLTK corpus reader this yields the stop words of every bundled
      language, not just English.
    * Matching is case-sensitive because the text is never lower-cased.
  """
  # Build the stop-word lookup once. The previous version evaluated
  # `stopwords.words()` for every single token, which rebuilt the whole list
  # and then scanned it linearly each time. A set gives the same membership
  # results with O(1) lookups.
  stop_words = set(stopwords.words())

  cleaned = []
  for text in data:
    # Removing special characters and numerals (any non-letter run -> space)
    letters_only = re.sub(r"[^a-zA-Z]+", ' ', text)
    # Tokenizing the words
    tokens = word_tokenize(letters_only)
    # Removing stopwords
    cleaned.append(' '.join(tok for tok in tokens if tok not in stop_words))
  return cleaned

In [None]:
# Clean the raw e-mail text. The column is read straight from `data`: the
# variable `X` is only assigned in a later cell, so `preprocess_text(X)` raises
# NameError when the notebook is run top-to-bottom on a fresh kernel.
X_preprocessed = preprocess_text(data['text'])
data['Processed Text'] = X_preprocessed

In [None]:
# Saving the processed text csv.
# index=False: do not write the row index; otherwise each save/reload cycle
# adds another "Unnamed: 0"-style column to the frame.
data.to_csv("Processed.csv", index=False)

In [None]:
# Reload the cached, already-preprocessed dataset from disk
data = pd.read_csv(filepath_or_buffer="./Processed.csv")

In [None]:
# Raw e-mail text
X = data['text']
# Cleaned text. A row whose cleaned text is empty is written to CSV as an empty
# field and is read back as NaN (a float), which the Keras tokenizer cannot
# lower-case. Restore such rows to empty strings and force a string dtype.
X_preprocessed = data['Processed Text'].fillna('').astype(str)
# Numeric label column
Y = data['label_num']

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenizer / padding configuration
num_words = 100
oov_token = '<UNK>'
pad_type = 'post'
trunc_type = 'post'

# Fit the tokenizer on the cleaned corpus
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(X_preprocessed)

# Word -> integer index mapping learned from the corpus
word_index = tokenizer.word_index

# Convert every text into a list of integer token ids
sequences = tokenizer.texts_to_sequences(X_preprocessed)

# The longest sequence determines the padded width
maxlen = max(map(len, sequences))

# Pad (and, if ever needed, truncate) at the end of each sequence
padded = pad_sequences(
    sequences,
    maxlen=maxlen,
    padding=pad_type,
    truncating=trunc_type,
)

In [None]:
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Dense, LSTM, Flatten
from tensorflow.keras.models import load_model

In [None]:
# Fixed seed so the train/test split, and therefore every metric reported
# below, is reproducible across kernel restarts.
X_train,X_test,y_train,y_test = train_test_split(padded,Y,random_state=42)
# Force a 2-D (samples, maxlen) layout for the model input.
X_train = np.reshape(X_train, (-1,maxlen))
X_test = np.reshape(X_test, (-1,maxlen))

In [None]:
batch_size = 24

# Training pipeline: shuffle through a 1024-element buffer, then batch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=1024)
    .batch(batch_size)
)

# Validation pipeline: batched only, no shuffling.
val_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
class Model:
  """LSTM-based binary classifier with a hand-written training loop.

  Wraps a small Keras network (Embedding -> LSTM -> Flatten -> Dense(sigmoid))
  together with its optimizer, loss, accuracy metrics and per-epoch accuracy
  history.

  Relies on notebook-level names: `maxlen` (padded sequence length), `keras`,
  `tf`, the Keras layer classes, the `load_model` function, `time` and `np`.
  """

  model = None

  # Architecture Params
  vocab_size = 500
  embeding_dim = 16
  n_dense = 24  # currently unused by the architecture
  n_lstm = 20

  # Predictions Dict (label id -> label name)
  preds_dict = {0:"ham",1:"spam"}

  def __init__(self):
    # Per-instance accuracy histories. These used to be class-level lists, so
    # every Model instance appended to (and plotted) the same shared history.
    self.train_accuracy = []
    self.val_accuracy = []
    self.init_model()

  def init_model(self):
    """Build the network, optimizer, loss function and accuracy metrics."""

    # Inputs for the model: one padded sequence of `maxlen` token ids
    inputs = keras.Input(shape=(maxlen,))

    # Embedding layer (sequence length is already fixed by `inputs`)
    embedding = Embedding(self.vocab_size,self.embeding_dim)(inputs)

    # LSTM Cell, returning the hidden state for every time step
    LSTM_cell = LSTM(self.n_lstm,return_sequences=True)(embedding)

    # Flatten Layer: (batch, maxlen, n_lstm) -> (batch, maxlen * n_lstm)
    flatten = Flatten()(LSTM_cell)

    # Output Layer. It must consume `flatten`, not `LSTM_cell`; otherwise the
    # model emits one value per time step instead of one per e-mail. The
    # sigmoid makes the output a probability, which is what
    # BinaryCrossentropy(from_logits=False) and BinaryAccuracy expect.
    output = Dense(1,activation='sigmoid',name='predictions')(flatten)

    # Creating the model
    self.model = keras.Model(inputs = inputs,outputs=output)

    # Optimizer and loss fn for the model
    self.optimizer = keras.optimizers.SGD(learning_rate=1e-3)
    self.loss_fn = keras.losses.BinaryCrossentropy(from_logits=False)

    # Metrics for the model
    self.train_acc_metric = keras.metrics.BinaryAccuracy()
    self.val_acc_metric   = keras.metrics.BinaryAccuracy()

  @staticmethod
  def _as_targets(labels):
    """Cast labels to float32 with shape (batch, 1), matching the model output."""
    return tf.reshape(tf.cast(labels, tf.float32), (-1, 1))

  @staticmethod
  def _reset_metric(metric):
    """Reset a Keras metric, preferring `reset_state` over the deprecated `reset_states`."""
    reset = getattr(metric, "reset_state", None) or metric.reset_states
    reset()

  def train_model(self,train_dataset,validation_data,epochs):
    """Train with a custom loop and record accuracy after every epoch.

    Args:
      train_dataset: Iterable of (inputs, labels) training batches.
      validation_data: Iterable of (inputs, labels) validation batches.
      epochs: Number of passes over `train_dataset`.

    Side effects:
      Updates the model weights, prints progress, and appends one float per
      epoch to `self.train_accuracy` and `self.val_accuracy`.
    """
    for epoch in range(epochs):
      print("\nStart of epoch %d" % (epoch,))
      start_time = time.time()
      # Counted from the real batches instead of a notebook-global batch_size
      samples_seen = 0

      # Iterate over the batches of the dataset.
      for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        y_batch_train = self._as_targets(y_batch_train)

        with tf.GradientTape() as tape:
          probs = self.model(x_batch_train, training=True)
          loss_value = self.loss_fn(y_batch_train, probs)

        grads = tape.gradient(loss_value, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))

        # Update training metric.
        self.train_acc_metric.update_state(y_batch_train, probs)
        samples_seen += int(x_batch_train.shape[0])

        # Log every 100 batches.
        if step % 100 == 0:
          print(
              "Training loss (for one batch) at step %d: %.4f"
              % (step, float(loss_value))
          )
          print("Seen so far: %d samples" % samples_seen)

      # Display metrics at the end of each epoch. Stored as plain floats so
      # the history can be plotted or serialized without tensor handling.
      train_acc = float(self.train_acc_metric.result())
      self.train_accuracy.append(train_acc)
      print("Training acc over epoch: %.4f" % (train_acc,))

      # Reset training metrics at the end of each epoch
      self._reset_metric(self.train_acc_metric)

      # Run a validation loop at the end of each epoch.
      for x_batch_val, y_batch_val in validation_data:
        val_probs = self.model(x_batch_val, training=False)
        self.val_acc_metric.update_state(self._as_targets(y_batch_val), val_probs)

      val_acc = float(self.val_acc_metric.result())
      self.val_accuracy.append(val_acc)
      self._reset_metric(self.val_acc_metric)

      print("Validation acc: %.4f" % (val_acc,))
      print("Time taken: %.2fs" % (time.time() - start_time))

  def load_model(self,model_dir):
    """Replace the wrapped network with the one stored at `model_dir`."""
    # Resolves to the Keras `load_model` function imported by the notebook;
    # the method name does not shadow it inside this body.
    self.model = load_model(model_dir)

  def plot_metric_curves(self):
    """Plot training and validation accuracy against the epoch index."""
    # Local import: the body used `plt`, but nothing in the notebook imports it.
    import matplotlib.pyplot as plt

    # Epoch index on the x-axis, accuracy on the y-axis (previously swapped).
    plt.plot(np.arange(len(self.train_accuracy)), self.train_accuracy, label="train")
    plt.plot(np.arange(len(self.val_accuracy)), self.val_accuracy, label="validation")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

  def save_model(self,model_dir):
    """Save the wrapped network to `model_dir`."""
    self.model.save(model_dir)

In [None]:
# Build a fresh model and train it for a fixed number of epochs
num_epochs = 3

model_inst = Model()
model_inst.train_model(
    train_dataset=train_dataset,
    validation_data=val_dataset,
    epochs=num_epochs,
)


Start of epoch 0
Training loss (for one batch) at step 0: 1.8282
Seen so far: 24 samples
Training loss (for one batch) at step 100: 0.6867
Seen so far: 2424 samples
Training acc over epoch: 0.7143
Validation acc: 0.6976
Time taken: 792.55s

Start of epoch 1
Training loss (for one batch) at step 0: 0.6043
Seen so far: 24 samples
Training loss (for one batch) at step 100: 0.5323
Seen so far: 2424 samples
Training acc over epoch: 0.7143
Validation acc: 0.6976
Time taken: 883.84s

Start of epoch 2
Training loss (for one batch) at step 0: 0.6412
Seen so far: 24 samples
Training loss (for one batch) at step 100: 0.5279
Seen so far: 2424 samples
Training acc over epoch: 0.7143
Validation acc: 0.6976
Time taken: 845.76s


In [None]:
# Summary of the architecture: prints each layer of the wrapped Keras network
# with its output shape and parameter count
model_inst.model.summary()

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_18 (InputLayer)       [(None, 2577)]            0         
                                                                 
 embedding_20 (Embedding)    (None, 2577, 16)          8000      
                                                                 
 lstm_21 (LSTM)              (None, 2577, 20)          2960      
                                                                 
 predictions (Dense)         (None, 2577, 1)           21        
                                                                 
Total params: 10,981
Trainable params: 10,981
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Persist the trained network to disk (.h5 file)
model_inst.save_model(model_dir="model.h5")



In [None]:
# Create a new wrapper (which builds an untrained network), then replace that
# network with the one saved on disk
model_inst = Model()
model_inst.load_model(model_dir="./model.h5")
# NOTE(review): `Model` defines no `predict` method, so the line below would
# fail if uncommented; the wrapped Keras network is `model_inst.model`.
#preds = model_inst.predict(X_test)