In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, SimpleRNN, LSTM, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [4]:
df = pd.read_csv("emails.csv").dropna()
df.sample(5)

Unnamed: 0,text,spam
5209,Subject: term paper please respond to vince ...,0
342,Subject: 9 % commission on myg annuities call...,1
3196,"Subject: re : fw : eprm article chris , i ha...",0
4156,Subject: re : requests for help thanks vince .,0
5505,"Subject: aram ' s visit jesus , friday , apr...",0


In [6]:
print(df.duplicated().sum())

33


In [8]:
df.drop_duplicates(inplace=True)

In [10]:
def process(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean

In [12]:
max_words = 5000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['text'])
X_seq = tokenizer.texts_to_sequences(df['text'])
X_pad = pad_sequences(X_seq, maxlen=max_len)
y = df['spam']

In [20]:
X_temp, X_test, y_temp, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Second split: train and val
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

In [22]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.array([0, 1]),
                                     y=y_train)
print("Class Weights:", class_weights)

Class Weights: [0.65357015 2.12792056]


In [24]:
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)

print("\nX_train example:\n", X_train[:5])
print("\nX_val example:\n", X_val[:5])

X_train shape: (3643, 100)
X_val shape: (911, 100)
y_train shape: (3643,)
y_val shape: (911,)

X_train example:
 [[   0    0    0    0    0    0    0    0    0   15   45 1174  569   22
   173  130  744  139  913  120   32  913 1750   10  150   11  138  148
    52  148 1216   36  637  138  216   52   96  138   51    2  913 1750
    10  150   10  150   46 1327   10  150   10  150   15   45 1174  569
   913  130    2   82    6   62    8   71   25    5 1064    4  971    7
   238   20 3863  390 2367  225   65   17  224 2068   74    6 1086  347
     5 2516  771    3   19    6 1086  421    5  266  497  139    2    1
   989  637]
 [ 120   32  173  296   31   13   11   96  174   37   96  300   36  365
   570   96  283   37  121  985   51    2  173  296   31   13   13   46
    22   47   39   31   13   13   15 1606 2193 4308 1124  135  173    8
   955    1  520  202  328  453    2  569  158    1  203    4 2065   12
   910    3    1 1121   12   40  534  926    1  135    5  605    4  244
    32  39

In [26]:
# First, split the original text data using the same indices as X_combined
X_train_indices, X_val_indices = train_test_split(df.index, test_size=0.2, random_state=42)

# Use these indices to get the original text data
X_train_texts = df.loc[X_train_indices, 'text'].tolist()
X_val_texts = df.loc[X_val_indices, 'text'].tolist()

print("Text data processed for Deep Learning models!")

Text data processed for Deep Learning models!


In [28]:
def create_model(model_type):
    model = Sequential()
    model.add(Embedding(max_words, 50, input_length=max_len))

    if model_type == "CNN":
        model.add(Conv1D(64, 5, activation='relu'))
        model.add(GlobalMaxPooling1D())
    elif model_type == "RNN":
        model.add(SimpleRNN(64))
    elif model_type == "LSTM":
        model.add(LSTM(64))
    elif model_type == "GRU":
      model.add(GRU(64))
    else:
        raise ValueError("Invalid model type")

    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [30]:
# Training and Evaluation
results = []
model_types = ["CNN", "RNN", "LSTM", "GRU"]

for model_type in model_types:
    print(f"Training {model_type} model...")
    model = create_model(model_type)

    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0,
              validation_data=(X_test, y_test),  # Use test set for validation
              callbacks=[EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)])

    y_pred = (model.predict(X_test, verbose=0) > 0.5).astype("int32")
    y_prob = model.predict(X_test, verbose=0).flatten()

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    results.append([model_type, accuracy, precision, recall, f1, auc])

Training CNN model...




Training RNN model...




Training LSTM model...




Training GRU model...




In [32]:
# Display results in a table
df_results = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC'])
print(df_results)

  Model  Accuracy  Precision    Recall  F1-Score       AUC
0   CNN  0.977173   0.968750  0.942568  0.955479  0.997936
1   RNN  0.970149   0.978102  0.905405  0.940351  0.996289
2  LSTM  0.972783   0.940199  0.956081  0.948074  0.996201
3   GRU  0.970149   0.948630  0.935811  0.942177  0.994433
