In [None]:
import pandas as pd
import numpy as np

dataset link: https://archive.ics.uci.edu/dataset/228/sms+spam+collection

Importing dataset

In [None]:
raw = pd.read_csv("SMSSpamCollection", sep='\t',header=None, names=['isSpam', 'message'])

In [None]:
raw.head(10)

Unnamed: 0,isSpam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [None]:
len(raw)

5572

remove duplicates

In [None]:

raw.drop_duplicates(subset=['message'], inplace=True)
len(raw)


5169

Data cleaning

In [None]:
import string
import nltk

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

remove punctuation and stopwords

In [None]:
raw['processed'] = raw['message'].apply(lambda x: ' '.join([word for word in x.lower().split() if word not in stopwords and word not in punctuation]))

In [None]:

raw['processed'] = raw['processed'].apply(lambda x: x.translate(str.maketrans('', '', punctuation)))

#or
# def pre_process(sms):
#     remove_punct = "".join([word.lower() for word in sms if word not in punctuation])
#     tokenize = nltk.tokenize.word_tokenize(remove_punct)
#     remove_stopwords = [word for word in tokenize if word not in stopwords]
#     return remove_stopwords
# create a separate column
# data['processed'] = data['message'].apply(lambda x: pre_process(x))

In [None]:
raw.head()

Unnamed: 0,isSpam,message,processed
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


In [None]:
from sklearn.preprocessing import LabelEncoder

# Assuming df is your DataFrame and 'label' is the column containing 'spam' and 'ham'
raw['isSpam'] = LabelEncoder().fit_transform(raw['isSpam'])

In [None]:
raw.head()

Unnamed: 0,isSpam,message,processed
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [None]:
x = raw['message'].values
y = raw['isSpam'].values

In [None]:
max_words = 1000
max_len = 150

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x)
X_seq = tokenizer.texts_to_sequences(x)
X_padded = pad_sequences(X_seq, maxlen=max_len)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

NEURAL NETWORK

In [None]:
embedding_dim = 50
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(LSTM(units=100))
model.add(Dense(units=1, activation='sigmoid'))

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
epochs = 4
batch_size = 64
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x78a43645e650>

EVALUATE

In [None]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Test Accuracy: 97.87%


TEST

In [110]:
new_messages = ["Free entry to win a prize!", "Hey, how are you?"]
new_messages_seq = tokenizer.texts_to_sequences(new_messages)
new_messages_padded = pad_sequences(new_messages_seq, maxlen=max_len)
predictions = model.predict(new_messages_padded)
predicted_labels = (predictions > 0.5).astype(int).flatten()

for message, label in zip(new_messages, predicted_labels):
    print(f"Message: {message}, Predicted Label: {'Spam' if label == 1 else 'Not Spam'}")

Message: Free entry to win a prize!, Predicted Label: Spam
Message: Hey, how are you?, Predicted Label: Not Spam


So, the accuracy of the model is 97.87% and precision is 100%.

**saving the model**

In [None]:
model.save('spam_detection_model.h5')

  saving_api.save_model(


In [None]:
#code to load the model again in some file
from tensorflow.keras.models import load_model

# Load the model from the file
loaded_model = load_model('spam_detection_model.h5')

In [None]:
#saving as an API
import tensorflow as tf
model.save('spam_detection_saved_model', save_format='tf')

# Load the model from the SavedModel format
loaded_model = tf.keras.models.load_model('spam_detection_saved_model')

**CONCLUSION**


An improvised dataset with more spam types.