In [1]:
# Install gensim library
!pip install gensim



In [2]:
# Import necessary modules from gensim library
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [3]:
# Import pandas for data manipulation
import pandas as pd

# Step 1: Read the CSV file and handle headers automatically
messages = pd.read_csv('final_email_dataset.csv')

# Check if the first row is mistakenly inside the data and remove it
if messages.iloc[0, 0].lower() == 'label':
    messages = messages.drop(index=0).reset_index(drop=True)

# Keep only rows with labels 'ham' or 'spam'
messages = messages[messages['label'].isin(['ham', 'spam'])].reset_index(drop=True)

In [4]:
# Display the first few rows of the dataframe
messages.head()


Unnamed: 0,text,label
0,"Congratulations, you've won a free vacation! C...",spam
1,"Hi there, just wanted to follow up on our meet...",ham
2,100% Guaranteed: You'll love this product! Buy...,spam
3,Reminder: Your subscription to our newsletter ...,spam
4,"Dear John, I hope this email finds you well.",ham


In [5]:
# Display the first few rows of the dataframe
messages.head()

Unnamed: 0,text,label
0,"Congratulations, you've won a free vacation! C...",spam
1,"Hi there, just wanted to follow up on our meet...",ham
2,100% Guaranteed: You'll love this product! Buy...,spam
3,Reminder: Your subscription to our newsletter ...,spam
4,"Dear John, I hope this email finds you well.",ham


In [6]:
# Import necessary modules for text processing
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import re
import nltk

In [7]:
# Download stopwords for text processing
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

## Data Cleaning

In [8]:
# Initialize an empty list to store processed text data
corpus = []

# Loop over each message to clean and lemmatize the text
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['text'][i])  # Remove non-letter characters
    review = review.lower()  # Convert to lowercase
    review = review.split()  # Split into words
    review = [lemmatizer.lemmatize(word) for word in review]  # Lemmatize each word
    review = ' '.join(review)  # Join the words back into a single string
    corpus.append(review)  # Append to corpus

In [9]:
# Display the processed corpus
corpus

['congratulation you ve won a free vacation click here to claim your prize',
 'hi there just wanted to follow up on our meeting last week',
 'guaranteed you ll love this product buy now and get a discount',
 'reminder your subscription to our newsletter is about to expire',
 'dear john i hope this email find you well',
 'you re pre approved for a new credit card apply now',
 'important your account will be suspended in day if you don t act',
 'hi mom how s it going',
 'special offer for deal on pizza order now and save',
 'meeting rescheduled for next tuesday at pm',
 'hi mark i d like to discus the project timeline',
 'limited time offer buy one get one free on all item',
 'just checked your bank balance everything look good',
 'you ve been selected to receive a special offer',
 'please find attached your monthly invoice',
 'act now and get a free gift with your purchase',
 'hi sarah i d be happy to help you with your issue',
 'congratulation you ve won a free vacation click here to c

In [10]:
# Get the length of the corpus
len(corpus)

90

In [11]:
# Import additional necessary modules
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

# Download additional resources for tokenization and stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [12]:
# Initialize the set of stop words 
stop_words = set(stopwords.words('english'))

# Tokenize the sentences and remove stopwords
words = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for s in sent_token:
        tokens = simple_preprocess(s)
        tokens = [w for w in tokens if w not in stop_words]
        words.append(tokens)

In [13]:
# Display the list of tokenized words
words

[['congratulation', 'free', 'vacation', 'click', 'claim', 'prize'],
 ['hi', 'wanted', 'follow', 'meeting', 'last', 'week'],
 ['guaranteed', 'love', 'product', 'buy', 'get', 'discount'],
 ['reminder', 'subscription', 'newsletter', 'expire'],
 ['dear', 'john', 'hope', 'email', 'find', 'well'],
 ['pre', 'approved', 'new', 'credit', 'card', 'apply'],
 ['important', 'account', 'suspended', 'day', 'act'],
 ['hi', 'mom', 'going'],
 ['special', 'offer', 'deal', 'pizza', 'order', 'save'],
 ['meeting', 'rescheduled', 'next', 'tuesday', 'pm'],
 ['hi', 'mark', 'like', 'discus', 'project', 'timeline'],
 ['limited', 'time', 'offer', 'buy', 'one', 'get', 'one', 'free', 'item'],
 ['checked', 'bank', 'balance', 'everything', 'look', 'good'],
 ['selected', 'receive', 'special', 'offer'],
 ['please', 'find', 'attached', 'monthly', 'invoice'],
 ['act', 'get', 'free', 'gift', 'purchase'],
 ['hi', 'sarah', 'happy', 'help', 'issue'],
 ['congratulation', 'free', 'vacation', 'click', 'claim', 'prize'],
 ['hi',

In [14]:
# Train a Word2Vec model using the list of words
# Learn word embeddings (vectors) from data and represent each word as a vector in high-dimensional space
model = gensim.models.Word2Vec(
    sentences=words,
    vector_size=50,  # Dimensionality of word vectors
    window=3,       # Maximum distance between current and predicted word within a sentence
    min_count=1,    # Ignores all words with total frequency lower than this
    workers=2,      # Use these many worker threads for training
    sg=1            # Use skip-gram; 0 for CBOW model
)


In [15]:
# Get the vocabulary of every word in our words variable
model.wv.index_to_key[:20]

['hi',
 'free',
 'offer',
 'get',
 'special',
 'meeting',
 'find',
 'one',
 'buy',
 'act',
 'follow',
 'last',
 'vacation',
 'day',
 'suspended',
 'account',
 'important',
 'apply',
 'card',
 'credit']

In [16]:
# Get the total number of words processed
model.corpus_count

90

In [17]:
# Find words most similar to 'monthly'
model.wv.similar_by_word('monthly')

[('vacation', 0.36856910586357117),
 ('tuesday', 0.29542070627212524),
 ('hi', 0.27782341837882996),
 ('get', 0.270143061876297),
 ('love', 0.2383166253566742),
 ('card', 0.2358538806438446),
 ('one', 0.21403980255126953),
 ('selected', 0.20887678861618042),
 ('day', 0.20827554166316986),
 ('pizza', 0.20299981534481049)]

In [18]:
# Define a function to calculate the average Word2Vec vector for a document
def avg_word2vec(doc):
    vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    
    if vectors:  # Check if the list is not empty
        return np.mean(vectors, axis=0)
    else:
        # Return a zero vector if no valid words are found
        return np.zeros(model.vector_size)

In [19]:
# Install tqdm library for progress bar visualization
!pip install tqdm



In [21]:
# Import tqdm for progress bar visualization
from tqdm import tqdm
import numpy as np

# Apply avg_word2vec function to the entire corpus to transform documents into vectors
X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 6139.70it/s]


In [22]:
# Display the transformed data
X

[array([-0.00204717,  0.00210147,  0.00402732, -0.00576002,  0.01094132,
         0.00037016,  0.00257408, -0.00151838, -0.00055653,  0.00018737,
        -0.00098187, -0.00318377, -0.00315207,  0.00471438, -0.00159471,
        -0.00225011,  0.00171472,  0.00923826,  0.0043497 , -0.0061512 ,
         0.00304265,  0.00368663,  0.00175384,  0.00103629,  0.00777758,
         0.00144478,  0.00736454,  0.00134239,  0.00159436, -0.00083253,
        -0.00225279,  0.00596071, -0.00156169, -0.0021961 , -0.0078115 ,
         0.0024755 , -0.00239933, -0.00652225, -0.00158047, -0.00418153,
         0.00743738,  0.0033646 ,  0.0096639 , -0.00476576, -0.00014003,
         0.00398387, -0.00543086, -0.01071233, -0.00424874,  0.00371673],
       dtype=float32),
 array([ 2.5191407e-03, -5.8599673e-03, -1.5142303e-03,  4.1299253e-03,
        -9.4205337e-03, -4.6545709e-03,  7.0241555e-03,  3.8953219e-03,
        -8.5243955e-03, -9.9678570e-03,  4.3395078e-03, -4.3695751e-03,
         4.4262228e-03,  1.591

In [23]:
# Check the number of vectors created
len(X)

90

In [24]:
# Verify the shape of the original data
messages.shape

(90, 2)

In [25]:
# Re-run label encoding to transform 'ham' and 'spam' labels to numeric
from sklearn.preprocessing import LabelEncoder
y = messages['label']
le = LabelEncoder()
y = le.fit_transform(y)

In [26]:
# Display the classes and first 10 transformed labels
print(le.classes_)  # Should now show: ['ham' 'spam']
print(y[:10])       # Should now only contain 0s and 1s

['ham' 'spam']
[1 0 1 1 0 1 1 0 1 0]


In [27]:
# Display unique labels in original dataset
print(messages['label'].unique())

['spam' 'ham']


In [28]:
# Display transformed labels
y

array([1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0])

In [29]:
# Convert vectors to pandas DataFrame
df = pd.DataFrame()
for i in range(0, len(X)):
    temp_df = pd.DataFrame(X[i].reshape(1, -1))
    df = pd.concat([df, temp_df], ignore_index=True)

In [30]:
# Display the shape of the DataFrame
df.shape

(90, 50)

In [31]:
# Set the features DataFrame
X = df

In [32]:
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [33]:
# Display the first few rows of the training data
X_train.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
49,0.003184,0.002613,0.003767,-0.003444,-0.006197,0.000282,0.000455,0.009161,0.00067,-0.001348,...,0.001319,-0.00687,0.00851,-0.009887,0.000802,-0.012532,0.004447,-0.001425,0.002285,6e-05
62,-0.00487,0.000604,-0.005192,-0.002904,0.005808,-0.002846,-0.002753,0.007992,-0.001545,-0.008776,...,-0.00166,-0.006473,0.002739,-0.00301,0.001007,0.010503,-0.002673,-0.005288,-0.000583,0.001141
73,0.002519,-0.00586,-0.001514,0.00413,-0.009421,-0.004655,0.007024,0.003895,-0.008524,-0.009968,...,0.001746,0.001446,-0.005211,9.5e-05,0.00387,-0.000397,-0.005533,0.00151,0.009807,-0.001042
69,-0.003728,0.001855,-0.005548,-0.000548,0.005946,-0.00016,0.00832,-0.000729,0.002743,0.006423,...,-0.00511,0.007539,-0.00076,-0.012308,0.0015,0.003932,-0.005184,-0.005302,3e-06,0.006672
76,-0.000272,0.005499,-0.002341,-0.004271,0.000413,-0.004961,0.001034,0.003714,0.005057,-0.002273,...,0.008336,0.000498,0.005674,-8.8e-05,0.005157,-0.002548,0.001356,-0.002008,-0.000102,0.005102


In [34]:
# Display training labels
y_train

array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0])

In [35]:
# Import the RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Initialize and train the classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

In [37]:
# Display the test labels
y_test

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1])

In [38]:
# Make predictions on the test dataset
y_pred = classifier.predict(X_test)

In [39]:
# Display the predicted labels
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1])

In [40]:
# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        11

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18



In [42]:
# Perform cross-validation to evaluate the model's performance
from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, X, y, cv=5)  # 5-fold cross-validation

In [43]:
# Display cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Accuracy: 1.0
