Some important pointers

1. Vocabulary Initialization and Indexing:
You are initializing the vocabulary with a padding token, but you are not handling out-of-vocabulary (OOV) tokens properly. If a word in the test sentence is not in the vocabulary, it will raise a KeyError.

2. Handling Zero Probabilities:
In Naive Bayes, multiplying many small per-word probabilities can underflow to zero in floating point, which makes the classes indistinguishable. To avoid this, sum log probabilities instead of multiplying raw probabilities.

3. Prior Probabilities:
You are assuming equal prior probabilities for both classes (0.5 for each). However, it's better to calculate the prior probabilities based on the actual distribution of the classes in the training data.

4. Smoothing:
You are adding a very small constant (0.00000000000001) to avoid division by zero. However, it's better to use Laplace smoothing (add-one smoothing) to handle zero probabilities more robustly.

5. Efficiency:
The code can be optimized by avoiding redundant computations and using vectorized operations where possible.

6. Error Handling:
The error handling in transform_sentence and transform_test_sentence is not robust. If a sentence is empty or contains no valid tokens, it should return an empty list or handle it gracefully.

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Fetch the NLTK data packages this notebook relies on; the recorded output
# shows each call reporting "already up-to-date" when the package is cached.
# 'punkt': tokenizer models — presumably what word_tokenize loads.
# NOTE(review): newer NLTK releases may require 'punkt_tab' instead — confirm.
nltk.download('punkt')
# 'stopwords': the corpus read through stopwords.words('english').
nltk.download('stopwords')
# 'wordnet': data for WordNetLemmatizer. This is the last expression in the
# cell, so its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/vanilla_skies/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vanilla_skies/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/vanilla_skies/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the labelled e-mails and discard rows with any missing value in one
# chained expression, so re-running the cell always rebuilds `df` from the file.
df = pd.read_csv("spam_or_not_spam.csv").dropna()
df

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


In [3]:
# Shared preprocessing resources.
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
lemmatizer = WordNetLemmatizer()
# Matches any character that is neither an ASCII letter nor a hyphen; tokens
# containing such a character are filtered out during preprocessing.
pattern = r"[^a-zA-Z-]"

# Token -> integer id mapping. Ids 0 and 1 are reserved for the padding and
# unknown-word markers; `index` is the next id to hand out.
vocab = {'<pad>': 0, '<unk>': 1}
index = 2

# Function to transform sentence
def transform_sentence(sentence):
    """Encode a training sentence as vocabulary ids, growing the vocabulary.

    The text is lower-cased, stripped of ``string.punctuation`` characters and
    tokenized with ``word_tokenize``. Stop words and tokens matching the
    module-level ``pattern`` are dropped; the rest are lemmatized. Every lemma
    not yet in the global ``vocab`` is assigned the next free ``index``.

    Args:
        sentence: Raw text of one e-mail.

    Returns:
        List of integer ids, one per kept token. If any step raises, the error
        and the sentence are printed and ``[vocab['<unk>']]`` is returned.
    """
    global vocab, index
    try:
        lowered = sentence.lower()
        without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))

        # Lemmatize every surviving token before touching the vocabulary.
        lemmas = []
        for word in word_tokenize(without_punctuation):
            if word in stop_words or re.search(pattern, word):
                continue
            lemmas.append(lemmatizer.lemmatize(word))

        # Register unseen lemmas under consecutive ids.
        for lemma in lemmas:
            if lemma in vocab:
                continue
            vocab[lemma] = index
            index += 1

        encoded = [vocab.get(lemma, vocab['<unk>']) for lemma in lemmas]
    except Exception as err:
        print(err, sentence)
        return [vocab['<unk>']]
    return encoded

# Encode every e-mail (this also builds the vocabulary) and collect the labels
# in the same row order.
features = [transform_sentence(email) for email in df["email"]]
labels = df["label"].tolist()

In [4]:
# Class priors estimated from the label distribution.
# Slot 0 / "pos" corresponds to label 0, slot 1 / "neg" to every other label
# (label 1 in this dataset).
prior_pos = labels.count(0) / len(labels)
prior_neg = labels.count(1) / len(labels)

# Per-token counts: [occurrences in label-0 e-mails, occurrences in the rest].
# Every token starts at [1, 1], i.e. the add-one (Laplace) pseudo-count.
vocab_pos_neg_freq = defaultdict(lambda: [1, 1])
for tokens, label in zip(features, labels):
    class_slot = 0 if label == 0 else 1
    for token in tokens:
        vocab_pos_neg_freq[token][class_slot] += 1

# Convert counts to likelihoods P(token | class) by dividing each column by
# that class's total. Because every row already carries the +1 pseudo-count,
# each total equals (tokens observed in the class + number of distinct tokens),
# which is the add-one-smoothed multinomial estimate.
# Dividing a row by its own sum instead would give P(class | token), which
# favours whichever class has more training text on top of the explicit prior.
total_pos = sum(counts[0] for counts in vocab_pos_neg_freq.values())
total_neg = sum(counts[1] for counts in vocab_pos_neg_freq.values())
for counts in vocab_pos_neg_freq.values():
    counts[0] /= total_pos
    counts[1] /= total_neg

In [5]:
# Function to transform test sentence
def transform_test_sentence(sentence):
    """Encode an unseen sentence as vocabulary ids without changing the vocabulary.

    The text is lower-cased, stripped of ``string.punctuation`` characters and
    tokenized with ``word_tokenize``. Stop words and tokens matching the
    module-level ``pattern`` are dropped; the rest are lemmatized. Lemmas
    missing from ``vocab`` are mapped to the ``'<unk>'`` id.

    Args:
        sentence: Raw text to encode.

    Returns:
        List of integer ids, one per kept token. If any step raises, the error
        and the sentence are printed and ``[vocab['<unk>']]`` is returned.
    """
    try:
        lowered = sentence.lower()
        without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))

        lemmas = []
        for word in word_tokenize(without_punctuation):
            if word in stop_words or re.search(pattern, word):
                continue
            lemmas.append(lemmatizer.lemmatize(word))

        encoded = []
        for lemma in lemmas:
            encoded.append(vocab.get(lemma, vocab['<unk>']))
    except Exception as err:
        print(err, sentence)
        return [vocab['<unk>']]
    return encoded

# Function to classify a test sentence and return the predicted class
def classify_sentence(sentence):
    """Classify a sentence as ``"Spam"`` or ``"Not Spam"``.

    The sentence is encoded with ``transform_test_sentence``. Starting from the
    log of ``prior_pos`` / ``prior_neg``, the log of each token's two entries in
    ``vocab_pos_neg_freq`` is accumulated into the matching score.

    Tokens with no entry in ``vocab_pos_neg_freq`` are skipped, so they add
    nothing to either score.

    Args:
        sentence: Raw text to classify.

    Returns:
        ``"Not Spam"`` when the first score is strictly greater than the
        second, otherwise ``"Spam"`` (ties therefore resolve to ``"Spam"``).
    """
    transformed_sentence = transform_test_sentence(sentence)
    log_pos, log_neg = np.log(prior_pos), np.log(prior_neg)
    for token in transformed_sentence:
        # Use .get() rather than indexing: indexing a defaultdict with a
        # missing key inserts a fresh default row, which would silently grow
        # the fitted table on every prediction that contains an unseen token.
        token_probs = vocab_pos_neg_freq.get(token)
        if token_probs is None:
            continue
        pos_freq, neg_freq = token_probs
        log_pos += np.log(pos_freq)
        log_neg += np.log(neg_freq)

    # Determine the predicted class
    return "Not Spam" if log_pos > log_neg else "Spam"

In [6]:
# Test examples: one hand-written message and the e-mail stored under row
# label 2996 of the dataset.
test_examples = [
    "give me money i need money this is a ad money chinese fake spam buy this buy this viagra viagra fuck me suck me hot girls online ad ad ad hyperlink hyperlink hyperlink",
    df.loc[2996, "email"],
]
for test_example in test_examples:
    prediction = classify_sentence(test_example)
    print(f"Prediction: {prediction}")

Prediction: Spam
Prediction: Spam
