In [8]:
# import resources
%matplotlib inline
# note sure if we need these all:
#from PIL import Image
#from io import BytesIO
#import matplotlib.pyplot as plt

#import torch
#import torch.optim as optim
#import requests
#from torchvision import transforms, models
import numpy as np
import re
import pandas as pd


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [2]:

# load the dataset and check it
df = pd.read_csv('data/Enron.csv')
df.head()
# 1 means marked as spam/malicious
row = df[df['label'] == 1].iloc[0]

print("Subject:\n", row['subject'])
print("\nBody:\n", row['body'])
print("\nLabel:", row['label'])


Subject:
 d - link dwl - g 510 802 . 11 g wireless pci lan adapter @ $ 39 . 85

Body:
 $ 39 . 85 dwl - g 510
high speed
2 . 4
ghz ( 802 . 11 g ) wireless pci lan
adapter
ieee 802 . 11 g standardupto
54 mbpsoperating frequency range - 2 . 4
ghz " is an ideal solution enabling wireless networking
capabilities on desktops pcs for the home or
office . "
dwl - g 510
visit : http : / / www . computron - me . com for deals !
d - link dwl - g 510
802 . 11 g wireless pci lan adapter
the d - link g 510 is a wireless pci
adapter featuring the latest ieee 802 . 11 g wireless
technology to deliverincredibly fast data transfer
in the 2 . 4 ghz frequency . the g 510 features the 802 . 11 g
standard and is backwards compatible with all the
existing 802 . 11 / 11 b products already out there . it also
offers a 64 / 128 - bit wep encryption and includes a
removable antenna and driver cd . get your d - link g 510
wireless pci lan adapter today !
general features :
- pci interface -
removable antenna - 64

In [3]:

# get rid of puncation and other things
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [4]:
# combine the subject and mode / prep for tokens -> CNN

df['text'] = df['subject'].fillna('') + ' ' + df['body'].fillna('')
df['text'] = df['text'].apply(clean_text)

In [5]:
# tokenization for CNN

vocab_size = 10000
max_length = 100
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(df['text'])

sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

labels = df['label'].values 
# check if tokenization worked?
i = 0 
print("triginal text:\n", df['text'].iloc[i])
print("\ntokenized sequence:\n", sequences[i])
print("\npadded sequence:\n", padded_sequences[i])


2025-07-10 20:12:08.779861: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752203528.815486  141302 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752203528.825789  141302 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752203528.853071  141302 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752203528.853100  141302 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752203528.853104  141302 computation_placer.cc:177] computation placer alr

Original text:
 hpl nom for may 25  2001  see attached file  hplno 525  xls 
 hplno 525  xls

Tokenized sequence:
 [360, 1156, 9, 61, 237, 55, 151, 207, 319, 3906, 1, 969, 2, 3906, 1, 969]

Padded sequence:
 [ 360 1156    9   61  237   55  151  207  319 3906    1  969    2 3906
    1  969    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [7]:
# split the data for training

# test_size = 0.2 means set aside 20% of the data for validation
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)

print("total samples:", len(padded_sequences))
print("training samples:", len(X_train))
print("validation samples:", len(X_val))


Total samples: 29767
Training samples: 23813
Validation samples: 5954
Training labels distribution:
  Spam (1): 11207, Not spam (0): 12606
Validation labels distribution:
  Spam (1): 2769, Not spam (0): 3185
