<a href="https://colab.research.google.com/github/projjal1/Neural_Networks_Projects/blob/master/Spam_Ham_Sms_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a model to classify Spam and Ham SMS messages
This model uses a tokenizer and word embeddings to build a neural network that classifies SMS messages into the spam and ham categories.

In [51]:
%pip install tensorflow-gpu



In [52]:
import tensorflow as tf
import numpy as np
from tensorflow import keras

In [54]:
# Fetch the dataset; get_file returns the local path of the downloaded file.
file_url='https://raw.githubusercontent.com/projjal1/datasets/master/spam_ham_dataset.txt'
file=keras.utils.get_file(fname='spam_ham_dataset.txt',origin=file_url)

In [55]:
# Label name -> integer id, and the inverse mapping used to decode predictions.
class_names=dict(spam=0,ham=1)
idx2_class_names={idx:name for name,idx in class_names.items()}

In [56]:
def load_data(path=None):
    """
    Loads the labelled message dataset.

    Every non-blank line is expected to start with the label as its first
    whitespace-separated token, followed by the message text.

    Args:
        path: file to read. When omitted, the module-level `file`
            (the downloaded dataset) is used.

    Returns:
        (texts, labels): two parallel lists of strings. Runs of whitespace
        inside a message are collapsed to single spaces.
    """
    if path is None:
        path = file
    texts, labels = [], []
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(path, encoding='utf8') as f:
        for line in f:
            split = line.split()
            if not split:
                # blank line: there is no label token to read
                continue
            # tokens produced by split() carry no surrounding whitespace
            labels.append(split[0])
            texts.append(' '.join(split[1:]))
    return texts, labels

In [57]:
# Read the dataset: `texts` holds the messages, `labels` the matching label strings.
texts,labels=load_data()

In [58]:
# Show the first two samples as a quick sanity check.
for idx in (0,1):
  print(labels[idx],texts[idx])

ham Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham Ok lar... Joking wif u oni...


In [59]:
#Fit a tokenizer on all messages to build the word -> integer index vocabulary.
#NOTE: the vocabulary is fitted before the train/test split, so it also covers test messages.
tokenizer=keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)

#Replace each message with its list of word indices.
#NOTE: `texts` is overwritten here, so this cell should not be re-run on its own.
texts=tokenizer.texts_to_sequences(texts)

In [60]:
# First message, now encoded as a list of word indices.
print(texts[0])

[49, 471, 4435, 842, 755, 658, 64, 8, 1327, 88, 123, 351, 1328, 148, 2996, 1329, 67, 58, 4436, 144]


Now let's define the hyperparameters used to prepare the data and build the model.

In [76]:
SEQUENCE_LENGTH = 100 # every message is padded to this many tokens (maxlen passed to pad_sequences)
EMBEDDING_SIZE = 100  # output dimension of the embedding layer; must match the GloVe vectors that are loaded
TEST_SIZE = 0.25 # fraction of the samples held out as the test set
BATCH_SIZE=64 # batch size passed to model.fit

In [62]:
#Convert the labels to a numpy array.
#NOTE: the tokenized texts are deliberately NOT passed through np.array():
#they are lists of differing lengths, and recent NumPy versions raise a
#ValueError when asked to build an array from such ragged input.
labels=np.array(labels)

#Pad the texts to a uniform length; pad_sequences takes the list of
#sequences directly and returns a 2-D numpy array.
texts=keras.preprocessing.sequence.pad_sequences(texts,maxlen=SEQUENCE_LENGTH)

In [63]:
#One-hot encode the labels:
#each string label is looked up in class_names to get its integer id,
#and the ids are then expanded into one-hot rows.
labels=keras.utils.to_categorical([class_names[name] for name in labels])

In [64]:
# First five one-hot rows (column 0 = spam, column 1 = ham, per class_names).
labels[:5]

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [65]:
from sklearn.model_selection import train_test_split

# Hold out TEST_SIZE of the samples for testing; random_state is fixed at 7.
(X_train, X_test,
 Y_train, Y_test) = train_test_split(texts, labels,
                                     test_size=TEST_SIZE,
                                     random_state=7)

Here we use pretrained GloVe word vectors to initialise the weights of the model's embedding layer.

In [66]:
import tqdm

In [68]:
def get_embedding_vectors(tokenizer, dim=100, glove_path=None):
    """
    Builds the embedding matrix for the tokenizer's vocabulary from a GloVe file.

    Each line of the GloVe file is read as a word followed by its vector
    components, separated by whitespace.

    Args:
        tokenizer: object exposing a `word_index` mapping of word -> row index.
        dim: number of components per vector; also the matrix width.
        glove_path: GloVe text file to read. Defaults to
            "data/glove.6B.{dim}d.txt".

    Returns:
        Array of shape (len(word_index)+1, dim). Row `i` holds the vector of
        the word whose index is `i`; rows of words missing from the GloVe
        file (and any index no word maps to) stay all zeros.
    """
    if glove_path is None:
        glove_path = f"data/glove.6B.{dim}d.txt"

    word_index = tokenizer.word_index
    # words not found in the GloVe file keep their all-zero row
    embedding_matrix = np.zeros((len(word_index)+1, dim))

    with open(glove_path, encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading GloVe"):
            values = line.split()
            if not values:
                continue
            # Only vocabulary words are needed, so rows are filled while
            # streaming instead of holding the whole GloVe table in memory.
            i = word_index.get(values[0])
            if i is None:
                continue
            embedding_matrix[i] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

In [73]:
def get_model(tokenizer, lstm_units):
    """
    Constructs and compiles the classifier:
    Embedding vectors => LSTM => Dropout => 2 output Fully-Connected neurons
    with softmax activation.

    Args:
        tokenizer: fitted tokenizer; its `word_index` sizes the embedding layer.
        lstm_units: number of units in the LSTM layer.

    Returns:
        The compiled (untrained) keras Sequential model. The model summary is
        printed as a side effect.
    """
    # get the GloVe embedding vectors; request the same width the Embedding
    # layer is declared with so the matrix always fits the layer
    embedding_matrix = get_embedding_vectors(tokenizer, dim=EMBEDDING_SIZE)
    model = keras.models.Sequential()
    # the pretrained vectors are frozen (trainable=False)
    model.add(keras.layers.Embedding(len(tokenizer.word_index)+1,
              EMBEDDING_SIZE,
              weights=[embedding_matrix],
              trainable=False,
              input_length=SEQUENCE_LENGTH))

    # NOTE(review): per the Keras docs a non-zero recurrent_dropout prevents
    # the fast cuDNN LSTM kernel from being used - confirm this is intended.
    model.add(keras.layers.LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(keras.layers.Dropout(0.3))
    model.add(keras.layers.Dense(2, activation="softmax"))
    # compile with the rmsprop optimizer, categorical cross-entropy loss,
    # and accuracy as the only tracked metric
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy"])
    model.summary()
    return model

In [74]:
# constructs the model with 128 LSTM units
# (get_model also loads the GloVe vectors and prints the model summary)
model = get_model(tokenizer=tokenizer, lstm_units=128)

Reading GloVe: 215006it [00:07, 28434.60it/s]






Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          901000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 258       
Total params: 1,018,506
Trainable params: 117,506
Non-trainable params: 901,000
_________________________________________________________________


In [77]:
# Train for 20 epochs. The held-out test split is also used as validation data,
# so the validation metrics reported here are not an independent estimate.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test),
          batch_size=BATCH_SIZE, epochs=20,
          verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f36d4bca160>

Now that the model is trained, we can use it to predict the label of new input text.

In [80]:
def get_predictions(text):
    """
    Classifies a single message with the trained model.

    The text is encoded with the fitted tokenizer, padded to
    SEQUENCE_LENGTH, and fed to the model; the name of the class with the
    highest score is returned (looked up in idx2_class_names).
    """
    # encode the one message as a batch of size 1
    encoded = tokenizer.texts_to_sequences([text])
    padded = keras.preprocessing.sequence.pad_sequences(encoded, maxlen=SEQUENCE_LENGTH)
    # scores for the only sample in the batch
    class_scores = model.predict(padded)[0]
    # index of the highest score -> class name
    best_idx = np.argmax(class_scores)
    return idx2_class_names[best_idx]

In [81]:
# Demo: a prize-claim style message.
text = "Congratulations! you have won 100,000$ this week, click here to claim fast"
print(get_predictions(text))

spam


In [84]:
# Demo: a lottery-win style message.
text = "“Congratulations! Your mobile number has won the sum of $1,000,000 in our Atlantic Mobile Lotto. Contact us via email on [address removed] for claim."
print(get_predictions(text))

spam


In [85]:
# Demo: an ordinary personal message.
text = "Hello I am Maya. Just texted you to keep you updated."
print(get_predictions(text))

ham
