In [87]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [88]:
df = pd.read_csv(r"C:\Users\sanja\Desktop\RNN-Spam\spam (1).csv", encoding="latin1")

In [89]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [90]:
df = df[['v1', 'v2']]
df.columns = ['label', 'text']

In [91]:
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [92]:
df['label']=df['label'].map({'ham':0, 'spam':1})
texts=df['text'].values
labels=df['label'].values

In [93]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

In [94]:
sequences=tokenizer.texts_to_sequences(texts)
word_index=tokenizer.word_index
vocab_size=len(word_index)+1

In [95]:
max_len=max(len(s) for s in sequences)
X=pad_sequences(sequences, maxlen=max_len, padding='post')

In [96]:
y=np.array(labels)

In [97]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [98]:
from sklearn.utils import compute_class_weight
classes=np.unique(y_train)
class_weights=compute_class_weight(class_weight='balanced',classes=classes, y=y_train)
class_weight_dict = dict(zip(classes,class_weights))
print(class_weight_dict)

{np.int64(0): np.float64(0.5773316062176166), np.int64(1): np.float64(3.7328308207705194)}


In [99]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=16, input_length=max_len),
    SimpleRNN(16),
    Dense(1, activation='sigmoid')
])



In [100]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [101]:
model.fit(
    X_train,
    y_train,
    epochs=20,
    validation_data=(X_test, y_test),
    verbose=1,
    class_weight=class_weight_dict
)


Epoch 1/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.6516 - loss: 0.6491 - val_accuracy: 0.8834 - val_loss: 0.4928
Epoch 2/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8892 - loss: 0.4227 - val_accuracy: 0.8682 - val_loss: 0.3751
Epoch 3/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9587 - loss: 0.2113 - val_accuracy: 0.9175 - val_loss: 0.2798
Epoch 4/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9746 - loss: 0.1320 - val_accuracy: 0.9399 - val_loss: 0.1889
Epoch 5/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9854 - loss: 0.0875 - val_accuracy: 0.9336 - val_loss: 0.2032
Epoch 6/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9832 - loss: 0.0869 - val_accuracy: 0.8861 - val_loss: 0.3402
Epoch 7/20
[1m140/140

<keras.src.callbacks.history.History at 0x1a584e08760>

In [102]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {accuracy*100:.2f}%')

Test Accuracy: 93.63%


In [103]:
# Save model
model.save("spam_rnn_model.h5")
print("✅ Model saved as spam_rnn_model.h5")




✅ Model saved as spam_rnn_model.h5


In [104]:
import pickle
with open("max_len.pkl", "wb") as f:
    pickle.dump(max_len, f)


In [105]:
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
print("Tokenizer saved!")


Tokenizer saved!
