# SPAM DETECTION MODEL

In [1]:
#LOADING THE REQUIRED MODULES
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report




In [2]:
#loading the labelled e-mail dataset from the CSV file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='spam_ham_dataset.csv')

In [3]:
#previewing the loaded dataset (the rendered table is shortened to its first and last rows)
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [4]:
#describing the dataset: summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


In [5]:
#counting the missing values in every column (isna is the method that isnull aliases)
df.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [38]:
#brief information of the dataset: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [6]:
#counting how many messages carry each label (ham / spam)
label_counts = df['label'].value_counts()
label_counts

ham     3672
spam    1499
Name: label, dtype: int64

# SPLITTING DATASET INTO TRAIN DATA AND TEST DATA

In [7]:
#separating the numeric target column (y) from the remaining columns (x)
#note: x and y are not referenced by the later cells visible in this notebook
y = df['label_num']
x = df.drop(columns=['label_num'])

In [8]:
#breaking down the texts into small parts called tokens
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, padding='post')

labels = df['label_num'].values 


In [9]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2)


# TENSORFLOW MODEL

In [11]:
#creating a keras model 
#fixing TensorFlow's global seed so the randomly initialised weights are the same on every run
tf.random.set_seed(42)

model = tf.keras.Sequential([
    #one trainable 128-dimensional vector per token id; +1 because id 0 is the padding value
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1,
                              output_dim=128,
                              input_length=X_train.shape[1]),
    #averaging the token vectors of an e-mail into a single 128-dimensional vector
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    #single sigmoid unit: output is a value between 0 and 1 for the label_num target (1 = spam)
    tf.keras.layers.Dense(1, activation='sigmoid')
])

#binary cross-entropy matches the single sigmoid output; accuracy is reported during training
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [13]:
#training the model for 10 epochs; the test split is passed as validation data,
#so its loss and accuracy are reported after every epoch
history = model.fit(x=X_train, y=y_train, epochs=10, validation_data=(X_test, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
#measuring loss and accuracy of the trained model on the test split
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Accuracy: {accuracy*100:.2f}%")


Accuracy: 98.26%


In [16]:
#predicting: one sigmoid output (a value between 0 and 1) per test e-mail
y_pred = model.predict(x=X_test)



# REPORT

In [17]:
#rounding the sigmoid outputs to 0 / 1 class labels, then building the per-class report
y_pred_labels = np.round(y_pred)
report = classification_report(y_true=y_test,
                               y_pred=y_pred_labels,
                               target_names=['ham', 'spam'])

In [18]:
#printing the report string so it is shown as an aligned text table
print(report)

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       744
        spam       0.97      0.97      0.97       291

    accuracy                           0.98      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035



- The model's precision is 0.99 for ham messages and 0.97 for spam messages. Its recall is 0.99 for ham and 0.97 for spam, and its F1-score is 0.99 for ham and 0.97 for spam. The overall accuracy on the 1,035 test messages is 0.98.
- The macro-averaged and the weighted-averaged precision, recall and F1-score are all 0.98.