In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
# Location of the mail dataset CSV (the path sits under /content/drive/MyDrive,
# presumably a mounted Google Drive — confirm for your environment).
file_path = "/content/drive/MyDrive/mail_data.csv"
# Parse the CSV into a DataFrame; later cells read its 'Category' and 'Message' columns.
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preprocessing
# The dataset's label column is 'Category' and its text column is 'Message'.
# Encode the labels as integers and overwrite 'Category' with the encoded values.
# LabelEncoder numbers classes in sorted order (presumably ham -> 0, spam -> 1;
# confirm with label_encoder.classes_).
label_encoder = LabelEncoder()
encoded_categories = label_encoder.fit_transform(data['Category'])
data['Category'] = encoded_categories

In [4]:
# Features are the message texts; targets are the integer-encoded categories.
X, y = data['Message'], data['Category']

In [5]:
# Split data
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Tokenize and pad sequences
# The vocabulary is fitted on the training messages only, so the test set
# does not influence the word index. num_words=5000 caps the vocabulary used
# when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

In [6]:
# Convert each message into a list of integer word indices using the fitted
# tokenizer (training split first, then test split).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [7]:
# Bring every sequence to a fixed length of 100 so both splits form
# rectangular arrays the model can consume.
X_train_padded, X_test_padded = [
    pad_sequences(seqs, maxlen=100) for seqs in (X_train_seq, X_test_seq)
]

In [8]:
# Build RNN model
# Embedding -> two stacked LSTMs -> one sigmoid unit that outputs the
# probability of the class encoded as 1.
model = Sequential([
    # input_dim matches Tokenizer(num_words=5000). The deprecated
    # `input_length` argument is intentionally omitted: Keras 3 ignores it and
    # emits a warning, and the sequence length is taken from the padded input
    # the first time the model is called.
    Embedding(input_dim=5000, output_dim=128),
    # return_sequences=True so the second LSTM receives the whole sequence
    # rather than only the final state.
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    # Single sigmoid output for binary classification.
    Dense(1, activation='sigmoid')
])



In [9]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the tracked metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [10]:
# Train the model
# 5 epochs with mini-batches of 32; 20% of the training data is held out for
# validation. The call stays the last expression so the returned History
# object is displayed as the cell output.
fit_options = dict(epochs=5, batch_size=32, validation_split=0.2)
model.fit(X_train_padded, y_train, **fit_options)

Epoch 1/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 129ms/step - accuracy: 0.8811 - loss: 0.3114 - val_accuracy: 0.9776 - val_loss: 0.0837
Epoch 2/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 125ms/step - accuracy: 0.9895 - loss: 0.0407 - val_accuracy: 0.9821 - val_loss: 0.0715
Epoch 3/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 125ms/step - accuracy: 0.9955 - loss: 0.0182 - val_accuracy: 0.9821 - val_loss: 0.0811
Epoch 4/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 120ms/step - accuracy: 0.9986 - loss: 0.0079 - val_accuracy: 0.9832 - val_loss: 0.0794
Epoch 5/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 124ms/step - accuracy: 0.9998 - loss: 0.0023 - val_accuracy: 0.9832 - val_loss: 0.0834


<keras.src.callbacks.history.History at 0x7c9f8f084f10>

In [11]:
# Evaluate the model
# evaluate() returns the loss followed by the compiled metric (accuracy).
test_results = model.evaluate(X_test_padded, y_test)
loss, accuracy = test_results
print(f"Test Accuracy: {accuracy}")

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.9871 - loss: 0.0633
Test Accuracy: 0.9883407950401306


In [12]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np

# Evaluate the model
# verbose=0 suppresses the progress bar so only the two summary lines print.
loss, accuracy = model.evaluate(x=X_test_padded, y=y_test, verbose=0)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

Test Loss: 0.058238185942173004
Test Accuracy: 0.9883407950401306


In [13]:
# Predict on test data
# The model emits one sigmoid score per test message.
y_pred_probs = model.predict(X_test_padded)
# Threshold at 0.5, cast the boolean mask to ints, and flatten to a 1-D vector
# of predicted labels.
is_positive = y_pred_probs > 0.5
y_pred = is_positive.astype(int).flatten()

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step


In [14]:
# Calculate precision, recall, and F1-score
# All three scorers compare the true test labels with the thresholded
# predictions; they are evaluated in the order precision, recall, F1.
precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

In [15]:
# Report the three scores, one per line.
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Precision: 0.9857142857142858
Recall: 0.9261744966442953
F1 Score: 0.9550173010380623


In [16]:
# Classification report for a detailed breakdown
# target_names supplies display names for the encoded labels 0 and 1, in that
# order.
print("\nClassification Report:")
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Ham', 'Spam'],
)
print(report_text)


Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      1.00      0.99       966
        Spam       0.99      0.93      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [19]:
# Function to preprocess and predict user input
def predict_email(email_text):
    """Classify a single message as "Spam" or "Ham".

    The text is converted to word indices with the notebook-level fitted
    ``tokenizer``, padded to 100 tokens, and scored by the notebook-level
    trained ``model``.

    Args:
        email_text: The message text to classify.

    Returns:
        "Spam" when the model's output exceeds 0.5, otherwise "Ham".
    """
    # Wrap the text in a list: the tokenizer and the model operate on batches.
    padded_batch = pad_sequences(
        tokenizer.texts_to_sequences([email_text]),
        maxlen=100,
    )

    # The batch holds one message, so the prediction contains a single score.
    prediction = model.predict(padded_batch)
    if prediction > 0.5:
        return "Spam"
    return "Ham"

# Take user input
# Interactive loop: classify each entered message until the user types 'exit'.
# NOTE: input() blocks, so this cell needs an interactive frontend.
print("\n=== Email Spam Detection ===")
while True:
    try:
        user_input = input("Enter an email to classify (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session cleanly
        # instead of surfacing a traceback.
        break
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing to classify: a blank message would be scored as an
        # all-padding sequence, which is not a meaningful prediction.
        continue
    result = predict_email(user_input)
    print(f"Prediction: {result}\n")


=== Email Spam Detection ===
Enter an email to classify (or type 'exit' to quit): Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Prediction: Ham

Enter an email to classify (or type 'exit' to quit): Upgrade to our premium plan for exclusive access to premium content and features.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Prediction: Spam

Enter an email to classify (or type 'exit' to quit): You're a winner! Click here to claim your exclusive prize.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Prediction: Spam

Enter an email to classify (or type 'exit' to quit): Thank you for your feedback. We're always striving to improve our services.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Prediction: Ham

Enter an email to classify (or type 'exit' to quit): We're sorry for th