In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from keras.preprocessing.sequence import pad_sequences

In [6]:
# Load the dataset
df = pd.read_csv("input/spamorham.csv")

# Preprocessing the data

In [7]:

# "text" column contains the message text and "text_type" contains the labels (spam or not)
X = df['text']
y = df['text_type']

# Convert labels to numerical values
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to make them uniform length
max_sequence_length = 100  # Choose the maximum length of sequences
X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Create model

In [8]:
# Define the model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=64, input_length=max_sequence_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))



Epoch 1/10
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 21ms/step - accuracy: 0.7509 - loss: 0.5084 - val_accuracy: 0.9393 - val_loss: 0.1824
Epoch 2/10
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.9459 - loss: 0.1574 - val_accuracy: 0.9565 - val_loss: 0.1334
Epoch 3/10
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.9695 - loss: 0.0969 - val_accuracy: 0.9509 - val_loss: 0.1553
Epoch 4/10
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.9766 - loss: 0.0719 - val_accuracy: 0.9459 - val_loss: 0.1407
Epoch 5/10
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.9824 - loss: 0.0560 - val_accuracy: 0.9619 - val_loss: 0.1195
Epoch 6/10
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.9842 - loss: 0.0471 - val_accuracy: 0.9413 - val_loss: 0.1623
Epoch 7/10
[1m5

# Evaluate the model

Evaluation of the model is done on the test set, which is not available to us. Therefore, we will split the training data into training and validation sets. We will use the validation set to tune the hyperparameters of the model.

In [9]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9584 - loss: 0.1476  
Test Loss: 0.1684570014476776
Test Accuracy: 0.954054057598114


Evaluate with user defined test sets

In [26]:
my_text="Congratulations! You have been selected as a winner. Text WON to 44255 to claim your prize."


In [27]:
# Example of classifying a new text_message
#new_text_message = ["Your new text_message text goes here"]
new_text_message = [my_text]
new_text_message_seq = tokenizer.texts_to_sequences(new_text_message)
new_text_message_pad = pad_sequences(new_text_message_seq, maxlen=max_sequence_length)
prediction = model.predict(new_text_message_pad)

#Print the prediction (0 = ham, 1 = spam) and print the accuracy of the model
if prediction[0][0] >= 0.5:
    print("The text message is spam.")
    #print accuracy of the model
    print("At accuracy: ", accuracy)
    #print how sure it is that the text message is spam
    print("The model is ", prediction[0][0]*100, "% sure that the text message is spam.")
    
else:
    print("The text message is ham.")
    #print accuracy of the model
    print("At accuracy: ", accuracy)
    #print how sure it is that the text message is ham
    print("The model is ", (1-prediction[0][0])*100, "% sure that the text message is not spam (ham).")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
The text message is spam.
At accuracy:  0.954054057598114
The model is  90.2423620223999 % sure that the text message is spam.
