<a href="https://colab.research.google.com/github/sayandas96476/ML/blob/main/email_spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bootstrap cell for running outside Kaggle: fetch the Kaggle dataset this
# notebook reads. This environment is not Kaggle's Python image, so libraries
# the notebook relies on may be missing here.
import kagglehub

# Keep whatever location kagglehub reports for the downloaded dataset.
dataset_handle = 'saksham177/spam-mail-detection'
saksham177_spam_mail_detection_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print them one per line in os.walk order.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q transformers
!pip install -q sentence-transformers



In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import re

from sentence_transformers import SentenceTransformer

# Hugging Face model id of the sentence encoder used to embed each message.
model_name = "intfloat/e5-large"

# trust_remote_code is left at its default (off): enabling it allows Python code
# shipped in the Hub repository to execute locally.
# NOTE(review): re-enable only if loading this model turns out to require it.
model = SentenceTransformer(model_name)



In [None]:

def process_text(sentences):
    """Embed `sentences` with the notebook's global sentence encoder.

    Parameters:
    - sentences: texts handed unchanged to `model.encode`

    Returns:
    - (sentences, vectors): the input as given, paired with whatever
      `model.encode` produced for it.

    NOTE: reads the module-level `model`. This notebook later rebinds that name
    to the Keras classifier, so this function must be called before that cell.
    NOTE(review): the E5 model card recommends prefixing inputs with
    "query: " / "passage: "; no prefix is added here - confirm this is intended.
    """
    vectors = model.encode(sentences, show_progress_bar=True)
    return sentences, vectors


In [None]:
import pandas as pd
# Prefer the Kaggle-mounted copy of the CSV. When that path does not exist
# (e.g. the notebook runs outside Kaggle), fall back to the location returned
# by the kagglehub download cell.
# assumes that location is the directory holding the CSV - TODO confirm
csv_name = 'Day17_Mail_Data.csv'
csv_path = os.path.join('/kaggle/input/spam-mail-detection', csv_name)
if not os.path.exists(csv_path):
    csv_path = os.path.join(saksham177_spam_mail_detection_path, csv_name)
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

In [None]:
# Split the raw frame by label.
df_spam = df[df['Category'] == 'spam']
df_ham = df[df['Category'] == 'ham']

# Down-sample ham to the spam row count so both classes are equally represented.
# The fixed random_state makes the sampled subset (and everything trained on it)
# reproducible across kernel restarts.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_balanced = pd.concat([df_ham_downsampled, df_spam])

# Binary target column: 1 for 'spam', 0 for every other category.
df_balanced['spam'] = (df_balanced['Category'] == 'spam').astype(int)

# Message texts as a plain Python list, in df_balanced row order.
msg = df_balanced['Message'].tolist()


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


In [None]:
# Embed every message of the balanced set; process_text returns its input
# alongside the embeddings, so `sentence` is `msg` handed back.
sentence, embeddings = process_text(msg)

In [None]:
# Replace the raw text in 'Message' with one embedding (a plain list) per row.
# np.asarray keeps this cell re-runnable: after a first execution `embeddings`
# is already a list, and lists have no .tolist() method.
embeddings = np.asarray(embeddings).tolist()
df_balanced['Message'] = embeddings

In [None]:
!pip install tensorflow --upgrade

In [None]:
pip show transformers

In [None]:
!pip install -q scikit-learn

In [None]:

from sklearn.model_selection import train_test_split

# Stratified split of embeddings vs. labels (scikit-learn's default test share
# is used). The fixed random_state keeps train/test membership stable between
# runs so metrics are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Learning-rate schedule: start at 0.001 and decay by a factor of 0.9 every
# 100 steps; staircase=True applies the decay in discrete jumps.
initial_learning_rate = 0.001
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=100,
    decay_rate=0.9,
    staircase=True,
)

# Classifier head: light dropout followed by a single sigmoid unit.
# assumes every input embedding has 1024 components - TODO confirm against the encoder
inputs = keras.Input(shape=(1024,))
dropped = keras.layers.Dropout(0.1, name="dropout")(inputs)
l = keras.layers.Dense(1, activation='sigmoid', name="output")(dropped)

# NOTE: from this point on `model` names the Keras classifier, not the
# SentenceTransformer that was bound to `model` earlier in the notebook.
model = keras.Model(inputs=inputs, outputs=l)

# Binary cross-entropy with Adam driven by the decay schedule above.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=lr_schedule),
    metrics=['accuracy'],
)

# Stop once val_loss has not improved by at least 0.001 for 10 consecutive
# epochs, and roll back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)

model.summary()



In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
# Stack the per-row embedding lists into arrays, and the label Series into
# 1-D arrays, so Keras receives plain NumPy inputs.
X_train_array = np.array(list(X_train))
X_test_array = np.array(list(X_test))
y_train_array = np.array(y_train)
y_test_array = np.array(y_test)

# Train with early stopping. Exceptions are deliberately NOT caught here: a
# swallowed failure would leave `history` undefined and surface later as an
# unrelated NameError in the plotting cell.
# NOTE(review): the test split doubles as validation data, so early stopping is
# tuned on the same rows that are later used for evaluation.
history = model.fit(
    X_train_array,
    y_train_array,
    epochs=150,
    batch_size=32,
    validation_data=(X_test_array, y_test_array),
    callbacks=[early_stopping],
)

In [None]:
# Plot training history
import matplotlib.pyplot as plt

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training vs. validation loss per epoch.
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Right panel: training vs. validation accuracy per epoch.
acc_ax.plot(history.history['accuracy'], label='Training Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('Model Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(model, X_test, y_test):
    """
    Generate and plot a confusion matrix for a binary classification model,
    then print the raw counts plus precision, recall and F1.

    Parameters:
    - model: object whose `predict(X_test)` returns one probability per sample
    - X_test: Test input features, passed to `model.predict` unchanged
    - y_test: True 0/1 test labels (NumPy array, pandas Series or list)

    Returns:
    - None; the heatmap is shown and the metrics are printed.
    """
    # Predict probabilities and threshold at 0.5 to get hard 0/1 predictions.
    y_pred_proba = model.predict(X_test)
    y_pred = (np.asarray(y_pred_proba) > 0.5).astype(int).ravel()

    # np.asarray accepts Series and lists as well as arrays (a Series has no
    # .flatten()); ravel gives the same 1-D shape as y_pred.
    y_true = np.asarray(y_test).ravel()

    # Pin both labels so the matrix is always 2x2, even when only one class
    # appears in y_true/y_pred; otherwise the 4-way unpack below would fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Visualize confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted Negative', 'Predicted Positive'],
                yticklabels=['Actual Negative', 'Actual Positive'])

    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.show()

    # Row-major order of a 2x2 matrix with labels [0, 1]: TN, FP, FN, TP.
    tn, fp, fn, tp = cm.ravel()
    print("Confusion Matrix Metrics:")
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")

    # Guard each ratio against a zero denominator. The local is named `f1` so it
    # cannot shadow sklearn's `f1_score` if that is imported elsewhere.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"\nPrecision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

# Evaluate the trained Keras classifier (`model`) on the held-out arrays.
# `model2` is never defined in this notebook, so referencing it raised NameError.
plot_confusion_matrix(model, X_test_array, y_test_array)