In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [6]:
mail = pd.read_csv('/content/Spam-Classification.csv')

In [7]:
mail.head()

Unnamed: 0,CLASS,SMS
0,ham,"said kiss, kiss, i can't do the sound effects..."
1,ham,&lt;#&gt; ISH MINUTES WAS 5 MINUTES AGO. WTF.
2,spam,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3,spam,* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
4,spam,**FREE MESSAGE**Thanks for using the Auction S...


In [8]:
mail.dtypes

Unnamed: 0,0
CLASS,object
SMS,object


In [9]:
mail.count()

Unnamed: 0,0
CLASS,1500
SMS,1500


In [37]:
mail['CLASS'] = mail['CLASS'].map({'ham': 0, 'spam': 1})


##Data Preprocessing

In [39]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [41]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(mail['SMS'])
sequences = tokenizer.texts_to_sequences(mail['SMS'])


In [43]:

from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 100  # Maximum sequence length
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [46]:
from tensorflow.keras.utils import to_categorical
labels = mail['CLASS'].values
labels = to_categorical(labels)


In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax')  # 2 classes: spam and ham
])



In [52]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))


loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

Epoch 1/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 79ms/step - accuracy: 0.4816 - loss: 0.6976 - val_accuracy: 0.4767 - val_loss: 0.6939
Epoch 2/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 68ms/step - accuracy: 0.5187 - loss: 0.6930 - val_accuracy: 0.5233 - val_loss: 0.6925
Epoch 3/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 110ms/step - accuracy: 0.4855 - loss: 0.6938 - val_accuracy: 0.4767 - val_loss: 0.6942
Epoch 4/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 68ms/step - accuracy: 0.4604 - loss: 0.6940 - val_accuracy: 0.4767 - val_loss: 0.6945
Epoch 5/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 69ms/step - accuracy: 0.4892 - loss: 0.6937 - val_accuracy: 0.4767 - val_loss: 0.6955
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.4523 - loss: 0.6973
Test Accuracy: 0.48


In [54]:
sample_texts = ["Free entry in a weekly contest!", "Hello, how are you?"]
sample_sequences = tokenizer.texts_to_sequences(sample_texts)
sample_padded = pad_sequences(sample_sequences, maxlen=max_length, padding='post')
predictions = model.predict(sample_padded)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 206ms/step


In [55]:
for text, pred in zip(sample_texts, predictions):
    print(f"'{text}' is classified as {'SPAM' if pred[1] > pred[0] else 'HAM'}")


'Free entry in a weekly contest!' is classified as HAM
'Hello, how are you?' is classified as HAM
