<a href="https://colab.research.google.com/github/sonalisanjeevprabu18/Neural-Network-and-Deep-Learning/blob/main/Spam_mail_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Data Loading
# Read the labelled corpus. A missing email body is replaced by an empty
# string so that no NaN reaches the text-processing steps further down.
df = pd.read_csv('spam_or_not_spam.csv').assign(
    email=lambda frame: frame['email'].fillna('')
)

# Inputs are the values of the 'email' column, targets those of 'label'.
X, y = df['email'].values, df['label'].values

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Tokenization
VOCAB_SIZE = 10000  # handed to the tokenizer as num_words
MAX_LEN = 100       # every sequence is padded or cut to this many tokens

# The vocabulary is fitted on the training texts only; the test texts are
# merely converted with it. Words outside the vocabulary map to '<oov>'.
tokenizer = Tokenizer(oov_token='<oov>', num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

# Turn each text into a list of integer token ids.
training_sequences, testing_sequences = map(
    tokenizer.texts_to_sequences, (X_train, X_test)
)

# Pad short sequences and cut long ones at the end ('post') so that both
# matrices are exactly MAX_LEN columns wide.
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=MAX_LEN, padding='post', truncating='post')
    for seqs in (training_sequences, testing_sequences)
)

# Model (Improved)
# Token ids -> 32-dimensional embeddings -> bidirectional LSTM (32 units per
# direction) -> dropout -> a single sigmoid unit. The sigmoid output is what
# the rest of the script treats as the spam score.
model = Sequential([
    # `input_length` is deliberately not passed: Keras 3 deprecates that
    # argument and emits a warning when it is supplied. The layer only needs
    # the vocabulary size and the embedding width; the sequence length is
    # picked up from the data on the first call.
    Embedding(VOCAB_SIZE, 32),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training (Silent)
print("Training model... (this may take a moment)")
history = model.fit(
    X_train_padded, y_train,
    epochs=20,
    batch_size=32,
    # Validate on a slice carved out of the training data instead of on the
    # test set. That keeps the test set untouched until the final evaluate()
    # call below, so the reported test accuracy is measured on data that
    # played no part in fitting. `history` still records val_loss/val_accuracy.
    validation_split=0.1,
    verbose=0 # Hides the output table
)

# Evaluation
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
loss, accuracy = model.evaluate(X_test_padded, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# Spam Score + Prediction
def get_spam_score_and_prediction(emails):
    """Print the spam score and the SPAM / NOT SPAM verdict for each email.

    Args:
        emails: An iterable of email texts. A single string is treated as one
            email rather than being iterated character by character.

    Returns:
        None. One report per email is written to stdout; nothing is printed
        for an empty input.
    """
    # Normalise the argument first: wrap a lone string, and materialise
    # one-shot iterables because the texts are needed twice below.
    emails = [emails] if isinstance(emails, str) else list(emails)
    if not emails:
        # Nothing to score, so skip the model call: there is no batch to run.
        return

    # No manual lower-casing: the module-level tokenizer is built without
    # overriding `lower`, and the Keras Tokenizer lower-cases text by default.
    seq = tokenizer.texts_to_sequences(emails)
    # Same padding/truncation settings as the training matrices.
    pad = pad_sequences(seq, maxlen=MAX_LEN, padding='post', truncating='post')
    probs = model.predict(pad, verbose=0)

    for email, row in zip(emails, probs):
        score = row[0]  # the model has a single output unit per email
        verdict = 'SPAM' if score >= 0.5 else 'NOT SPAM'
        print(f"Email: {email}")
        print(f"Spam score: {score:.4f}")
        print(f"Prediction: {verdict}")
        print("-" * 50)

# Test
# Two hand-written samples: a prize-style message and a routine work message.
prize_sample = "Congratulations! You have won a free lottery. Click here to claim your prize."
work_sample = "Hi, this is your manager can we have a meeting at 6pm?"
new_emails = [prize_sample, work_sample]

# Prints score and verdict for each sample.
get_spam_score_and_prediction(new_emails)



Training model... (this may take a moment)
Test Accuracy: 0.9817
Email: Congratulations! You have won a free lottery. Click here to claim your prize.
Spam score: 0.9397
Prediction: SPAM
--------------------------------------------------
Email: Hi, this is your manager can we have a meeting at 6pm?
Spam score: 0.9443
Prediction: SPAM
--------------------------------------------------
