<a href="https://colab.research.google.com/github/sonalisanjeevprabu18/Neural-Network-and-Deep-Learning/blob/main/Spam_mail_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pandas numpy scikit-learn tensorflow

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Data Loading
# Read the labelled corpus and replace missing email bodies with empty
# strings before the text is handed to the tokenizer.
df = pd.read_csv('spam_or_not_spam.csv')
df['email'] = df['email'].fillna('')

X, y = df['email'].values, df['label'].values

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split
# repeatable across runs.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Tokenization
VOCAB_SIZE = 10000  # passed to the tokenizer as num_words and to the embedding as its input size
MAX_LEN = 100       # every sequence is padded or truncated to this many tokens

# The vocabulary is fitted on the training split only; words the tokenizer
# does not know are mapped to the '<oov>' token.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<oov>')
tokenizer.fit_on_texts(X_train)

# Encode both splits with the same fitted tokenizer.
training_sequences, testing_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad short sequences and cut long ones at the end ('post') so that both
# splits become fixed-width arrays of MAX_LEN columns.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    for sequences in (training_sequences, testing_sequences)
)

# Model
# Binary classifier: token embedding -> LSTM -> dropout -> one sigmoid unit
# whose output is read as the probability that the email is spam.
#
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
# the argument and ignores it with a warning. The input width is supplied
# through model.build() below instead.
model = Sequential([
    Embedding(VOCAB_SIZE, 16),
    LSTM(32),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Build with the padded sequence width (batch dimension left open) so that
# summary() can report concrete output shapes and parameter counts before
# the first call to fit().
model.build(input_shape=(None, MAX_LEN))
model.summary()

# Training
# NOTE(review): the held-out test split is also passed as validation data,
# so the per-epoch val_* metrics and the final test metrics below are
# computed on the same rows.
held_out = (X_test_padded, y_test)
fit_options = {
    'epochs': 10,
    'batch_size': 32,
    'validation_data': held_out,
    'verbose': 1,
}
history = model.fit(X_train_padded, y_train, **fit_options)

# Evaluation
# evaluate() returns the loss followed by the compiled metrics (accuracy).
loss, accuracy = model.evaluate(*held_out, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Spam Score + Prediction
def get_spam_score_and_prediction(emails, threshold=0.5):
    """Print the spam score and SPAM / NOT SPAM verdict for each email.

    Each email is encoded with the module-level ``tokenizer``, padded to
    ``MAX_LEN`` and scored by the module-level ``model``.

    Args:
        emails: A sequence of raw email strings. A single string is treated
            as one email instead of being iterated character by character.
        threshold: Scores greater than or equal to this value are reported
            as SPAM. Defaults to 0.5.

    Returns:
        None. Results are written to standard output.
    """
    if isinstance(emails, str):
        emails = [emails]

    # len() rather than truthiness so NumPy arrays of emails are accepted.
    # Nothing to score: return before handing an empty batch to the model.
    if len(emails) == 0:
        return

    seq = tokenizer.texts_to_sequences(emails)
    pad = pad_sequences(seq, maxlen=MAX_LEN, padding='post', truncating='post')
    probs = model.predict(pad)
    preds = (probs >= threshold).astype(int)

    # The model has a single output unit, so column 0 holds the score.
    for email, score, label in zip(emails, probs[:, 0], preds[:, 0]):
        print(f"Email: {email}")
        print(f"Spam score: {score:.4f}")
        print(f"Prediction: {'SPAM' if label == 1 else 'NOT SPAM'}")
        print("-" * 50)

# Test with custom emails
# Two hand-written samples: one written like a prize scam, one like an
# ordinary work message.
new_emails = [
    "Congratulations! You have won a free lottery. Click here to claim your prize.",
    "Hi, can we reschedule our meeting to tomorrow afternoon?",
]

# Prints the score and verdict for each sample.
get_spam_score_and_prediction(emails=new_emails)




Epoch 1/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 7s 54ms/step - accuracy: 0.7976 - loss: 0.5761 - val_accuracy: 0.8333 - val_loss: 0.4285
Epoch 2/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 4s 37ms/step - accuracy: 0.8658 - loss: 0.3613 - val_accuracy: 0.9433 - val_loss: 0.2008
Epoch 3/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 3s 36ms/step - accuracy: 0.9363 - loss: 0.1856 - val_accuracy: 0.8400 - val_loss: 0.2740
Epoch 4/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 3s 36ms/step - accuracy: 0.8931 - loss: 0.2355 - val_accuracy: 0.9467 - val_loss: 0.1741
Epoch 5/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 4s 53ms/step - accuracy: 0.9420 - loss: 0.2059 - val_accuracy: 0.9583 - val_loss: 0.1708
Epoch 6/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 3s 37ms/step - accuracy: 0.9726 - loss: 0.1556 - val_accuracy: 0.9550 - val_loss: 0.1601
Epoch 7/10
[1m75/75[0m [32m━━━━