In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [82]:
data = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

In [84]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [85]:
data.shape

(5572, 5)

In [88]:
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1)

KeyError: "['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'] not found in axis"

In [93]:
data.columns

Index(['v1', 'v2'], dtype='object')

In [94]:
data = data.rename(columns={'v1': 'label', 'v2': 'text'})


In [95]:
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [96]:
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [46]:
X_train.head(10)

ham   No I'm in the same boat. Still here at my moms. Check me out on yo. I'm half naked.                                                                   NaN    
spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300% *********** Nasdaq Symbol CDGT That is a $5.00 per..             NaN    
ham   They r giving a second chance to rahul dengra.                                                                                                        NaN    
      O i played smash bros  &lt;#&gt;  religiously.                                                                                                        NaN    
spam  PRIVATE! Your 2003 Account Statement for 07973788240 shows 800 un-redeemed S. I. M. points. Call 08715203649 Identifier Code: 40533 Expires 31/10/04  NaN    
ham   G says you never answer your texts, confirm/deny                                                                                                      NaN    
spam  88066 FROM

In [31]:
y_train.head(10)

ham   No I'm in the same boat. Still here at my moms. Check me out on yo. I'm half naked.                                                                   NaN   NaN
spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300% *********** Nasdaq Symbol CDGT That is a $5.00 per..             NaN   NaN
ham   They r giving a second chance to rahul dengra.                                                                                                        NaN   NaN
      O i played smash bros  &lt;#&gt;  religiously.                                                                                                        NaN   NaN
spam  PRIVATE! Your 2003 Account Statement for 07973788240 shows 800 un-redeemed S. I. M. points. Call 08715203649 Identifier Code: 40533 Expires 31/10/04  NaN   NaN
ham   G says you never answer your texts, confirm/deny                                                                                                      NaN   NaN
spam

In [33]:
y_train.isnull().sum()

4457

In [32]:
y_train.shape

(4457,)

In [97]:
# Fill NaN values with an empty string
data['text'].fillna("", inplace=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)


In [98]:
# Convert float values to strings
X_train = X_train.astype(str)
X_test = X_test.astype(str)

In [99]:
X_train.isnull().sum()

0

In [100]:
y_train.isnull().sum()

0

In [101]:
tokenizer.fit_on_texts(X_train)


In [102]:
# Remove rows with float values
X_train = X_train[X_train.apply(lambda x: isinstance(x, str))]
X_test = X_test[X_test.apply(lambda x: isinstance(x, str))]


In [103]:
max_length = 100
vocab_size = 10000
oov_token = "<OOV>"
embedding_dim = 16

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')


In [104]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d_3   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_6 (Dense)             (None, 24)                408       
                                                                 
 dense_7 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [105]:
num_epochs = 30
history = model.fit(X_train_padded, y_train, epochs=num_epochs, validation_data=(X_test_padded, y_test), verbose=2)


Epoch 1/30
140/140 - 2s - loss: 0.5464 - accuracy: 0.8618 - val_loss: 0.3817 - val_accuracy: 0.8655 - 2s/epoch - 14ms/step
Epoch 2/30
140/140 - 1s - loss: 0.3538 - accuracy: 0.8661 - val_loss: 0.3459 - val_accuracy: 0.8655 - 657ms/epoch - 5ms/step
Epoch 3/30
140/140 - 1s - loss: 0.3256 - accuracy: 0.8661 - val_loss: 0.3157 - val_accuracy: 0.8655 - 765ms/epoch - 5ms/step
Epoch 4/30
140/140 - 1s - loss: 0.2790 - accuracy: 0.8667 - val_loss: 0.2498 - val_accuracy: 0.8691 - 714ms/epoch - 5ms/step
Epoch 5/30
140/140 - 1s - loss: 0.1762 - accuracy: 0.9291 - val_loss: 0.1450 - val_accuracy: 0.9543 - 725ms/epoch - 5ms/step
Epoch 6/30
140/140 - 1s - loss: 0.0903 - accuracy: 0.9753 - val_loss: 0.0933 - val_accuracy: 0.9731 - 746ms/epoch - 5ms/step
Epoch 7/30
140/140 - 1s - loss: 0.0560 - accuracy: 0.9847 - val_loss: 0.0779 - val_accuracy: 0.9749 - 740ms/epoch - 5ms/step
Epoch 8/30
140/140 - 1s - loss: 0.0413 - accuracy: 0.9879 - val_loss: 0.0713 - val_accuracy: 0.9776 - 733ms/epoch - 5ms/step
Ep

In [106]:
y_pred = (model.predict(X_test_padded) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

