## importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score

## Load Dataset

In [2]:
spam= pd.read_csv("spam_ham_dataset.csv")
spam.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
spam.tail()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0
5170,4807,spam,Subject: important online banking alert\r\ndea...,1


In [4]:
spam.head(10)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
5,2949,ham,Subject: ehronline web address change\r\nthis ...,0
6,2793,ham,Subject: spring savings certificate - take 30 ...,0
7,4185,spam,Subject: looking for medication ? we ` re the ...,1
8,2641,ham,Subject: noms / actual flow for 2 / 26\r\nwe a...,0
9,1870,ham,"Subject: nominations for oct . 21 - 23 , 2000\...",0


## get the information and describe the dataset

In [5]:
spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [6]:
spam.describe()

Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


## Checking Null values and missing values

In [7]:
missing_values=spam.isnull().sum()
missing_values

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

## Removing Unwanted features in dataset. 

In [8]:
df = spam.drop('Unnamed: 0',axis = 1)
df = spam.drop('label',axis = 1)
df

Unnamed: 0.1,Unnamed: 0,text,label_num
0,605,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,"Subject: photoshop , windows , office . cheap ...",1
4,2030,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5166,1518,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,Subject: industrial worksheets for august 2000...,0


## split the data into features (x) and target(y)

In [9]:
x = df['text']
y = df['label_num']
X_train ,x_test, Y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [10]:
print(x) 

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [11]:
print(y)

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64


## Initalizing the max_length and embedded_dim.

In [29]:
from keras.datasets import imdb
from tensorflow.keras.layers import Embedding

## Tokenization for train the model.


In [30]:
max_len = 100
tokenizer = Tokenizer(oov_token='<00V>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(sequences, padding='post',maxlen = max_len,truncating='post')
x_test_pad = pad_sequences(sequences, padding='post',maxlen = max_len,truncating='post')

In [31]:
vol=len(word_index)+1
embedding_dim = 100

## building the model

In [33]:
import tensorflow as tf
model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=vol,output_dim=embedding_dim),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(32,activation='relu'),
        tf.keras.layers.Dense(1,activation='sigmoid')
])

## Compiling the model for further processes.

In [34]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

## Training the model

In [17]:
model.fit(X_train_pad,Y_train,epochs=5,validation_split=0.2)

Epoch 1/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 113ms/step - accuracy: 0.7418 - loss: 0.5088 - val_accuracy: 0.9614 - val_loss: 0.1049
Epoch 2/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 103ms/step - accuracy: 0.9897 - loss: 0.0336 - val_accuracy: 0.9626 - val_loss: 0.0967
Epoch 3/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 104ms/step - accuracy: 0.9975 - loss: 0.0074 - val_accuracy: 0.9457 - val_loss: 0.1664
Epoch 4/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 99ms/step - accuracy: 0.9951 - loss: 0.0198 - val_accuracy: 0.9626 - val_loss: 0.0906
Epoch 5/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 113ms/step - accuracy: 0.9997 - loss: 0.0015 - val_accuracy: 0.9614 - val_loss: 0.0797


<keras.src.callbacks.history.History at 0x1f5e4e4f790>

## predicted value

In [18]:
predictions = model.predict(x_test_pad)
binary_predictions = (predictions > 0.5).astype(int)
binary_predictions

[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


array([[0],
       [0],
       [0],
       ...,
       [1],
       [0],
       [0]])