# **Naive Bayes Spam Filter**
### CMSC 197 (Machine Learning), Machine Problem 3
*Submitted by: Sharah Michelle Tuando*

This is a separate notebook that keeps stop words in the messages, so that the effect of removing stop words on precision, recall, and accuracy can be shown without affecting the cleaned data.

In [None]:
##### Standard Libraries #####
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re
import email
import os
import chardet

##### For Validation of the Model #####
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

### **Preprocessing**

In [None]:
# Mount Google Drive so that the dataset files below are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Locations on Drive of the email files, the label file, and the stop-word list.
dir_data = '/content/drive/My Drive/197 (ML)/trec06p-cs280/unzipped/data/'
dir_label = '/content/drive/My Drive/197 (ML)/trec06p-cs280/labels.txt'
dir_stop_words = '/content/drive/My Drive/197 (ML)/stop_words.txt'

Mounted at /content/drive


In [None]:
def load_stop_words(file_path):
    """Read a stop-word list from a text file.

    Args:
        file_path: Path to a text file that is split into lines.

    Returns:
        set: The distinct lines of the file, without their line endings.
    """
    # Decode explicitly as UTF-8 so that the result does not depend on the
    # default encoding of the machine the notebook runs on.
    with open(file_path, 'r', encoding='utf-8') as f:
        return set(f.read().splitlines())

# Read the stop-word list from Drive and show it as a one-column table.
stop_words = load_stop_words(dir_stop_words)
stop_words_df = pd.DataFrame(list(stop_words), columns=['stop_word'])
stop_words_df

Unnamed: 0,stop_word
0,shown
1,later
2,they'll
3,ninety
4,happens
...,...
666,was
667,mostly
668,ca
669,don't


In [None]:
# Empty frame with the final column layout; the email-processing loop fills it.
df_with_stop_words_emails = pd.DataFrame(columns=['folder', 'file', 'message', 'classification'])

# The label file is parsed as two whitespace-separated columns: label, then path.
# The separator is a raw string so that "\s" reaches the regex engine as written
# instead of relying on Python tolerating an invalid string escape sequence.
df_labels = pd.read_csv(dir_label, sep=r'\s+', header=None, names=['classification', 'folder_file'], engine='python')
# Encode the label numerically: 'ham' -> 0, anything else -> 1.
df_labels['classification'] = df_labels['classification'].apply(lambda x: 0 if x == 'ham' else 1)

def preprocess_text(text):
    """Normalise raw message text into lowercase, space-separated tokens.

    Steps, in order: lowercase the text, drop HTML tags, drop URLs, delete
    ASCII punctuation and digits, then collapse every whitespace run to a
    single space and trim both ends.

    Args:
        text: The raw message string.

    Returns:
        The cleaned string.
    """
    punctuations = '!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # Characters listed in the third argument are deleted by translate().
    deletion_table = str.maketrans('', '', punctuations + '0123456789')

    lowered = text.lower()
    without_tags = re.sub(r'<[^>]+>', '', lowered)
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', without_tags)
    cleaned = without_urls.translate(deletion_table)
    return re.sub(r'\s+', ' ', cleaned).strip()

def _decode_payload(payload, charset):
    """Decode payload bytes to stripped text, falling back to Latin-1 for an unusable charset."""
    try:
        return payload.decode(charset, errors='ignore').strip()
    except (LookupError, ValueError):
        # Unknown or malformed charset label. Latin-1 maps every byte to a
        # character, so this fallback cannot fail and needs no further guard.
        return payload.decode('latin-1', errors='ignore').strip()


def get_messages(parsed_email, charset, email_path):
    """Extract the plain-text body of a parsed email as a stripped string.

    For a multipart email the first 'text/plain' part found by walk() decides
    the result. Otherwise (or when a multipart email has no such part) the
    top-level payload is used.

    Args:
        parsed_email: An email.message.Message-like object.
        charset: Fallback encoding name used when the part declares none;
            may be None, in which case 'utf-8' is used.
        email_path: Not used; kept so existing callers keep working.

    Returns:
        The decoded, stripped text, or '' when there is no decodable payload.
    """
    if parsed_email.is_multipart():
        for part in parsed_email.walk():
            if part.get_content_type() != 'text/plain':
                continue
            payload = part.get_payload(decode=True)
            if payload is None:
                return ''
            # The part's own declared charset wins over the caller's guess.
            return _decode_payload(payload, part.get_content_charset() or charset or 'utf-8')

    payload = parsed_email.get_payload(decode=True)
    if payload is None:
        return ''
    return _decode_payload(payload, parsed_email.get_content_charset() or charset or 'utf-8')

def get_email_charset(email_path):
    """Guess the character encoding of an email file from its raw bytes.

    Args:
        email_path: Path of the file to inspect.

    Returns:
        The 'encoding' entry of chardet's detection result.
    """
    with open(email_path, 'rb') as raw_file:
        detection = chardet.detect(raw_file.read())
    return detection['encoding']

def process_email(email_path):
    """Load one email file and return its cleaned message text.

    The file's encoding is guessed with get_email_charset, the file is parsed
    as an email, its text is extracted with get_messages, and the result is
    normalised with preprocess_text.

    Args:
        email_path: Path of the email file.

    Returns:
        The preprocessed message string.
    """
    detected_charset = get_email_charset(email_path)

    with open(email_path, 'rb') as handle:
        raw_bytes = handle.read()
    parsed = email.message_from_bytes(raw_bytes)

    return preprocess_text(get_messages(parsed, detected_charset, email_path))

# Process every labelled email. Rows are collected in a plain list and the
# DataFrame is built once at the end: growing a DataFrame with pd.concat inside
# the loop copies all rows gathered so far on every iteration.
processed_emails = {}
email_rows = []

for _, row in df_labels.iterrows():
    # The last two components of the labelled path are the folder and file name.
    folder, file = row['folder_file'].split('/')[-2:]
    email_path = os.path.join(dir_data, folder, file)

    record = {
        'folder': folder,
        'file': file,
        'message': process_email(email_path),
        'classification': row['classification'],
    }
    # Same data keyed by path; stored as a copy so the two containers do not
    # share one dict object.
    processed_emails[email_path] = dict(record)
    email_rows.append(record)

df_with_stop_words_emails = pd.DataFrame(
    email_rows, columns=['folder', 'file', 'message', 'classification']
)

df_with_stop_words_emails

Unnamed: 0,folder,file,message,classification
0,000,000,the mailing list i queried about a few weeks a...,0
1,000,001,luxury watches buy your own rolex for only rol...,1
2,000,002,academic qualifications available from prestig...,1
3,000,003,greetings all this is to verify your subscript...,0
4,000,004,try chauncey may conferred the luscious not co...,1
...,...,...,...,...
37817,126,017,great news expec ted infinex ventures inc infx...,1
37818,126,018,the oil sector is going crazy this is our week...,1
37819,126,019,suffering from pain depression or heartburn we...,1
37820,126,020,u n i v e r s i t y d i p l o m a s do you wan...,1


In [None]:
# Save the processed emails to Drive as CSV, without the index column and with backslash as the escape character.
df_with_stop_words_emails.to_csv('/content/drive/My Drive/197 (ML)/trec06p-cs280/with_stop_words_messages.csv', index=False, escapechar='\\')

In [None]:
# Remount Drive and point at the CSV of processed messages saved above.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
with_stop_words_message_path = '/content/drive/My Drive/197 (ML)/trec06p-cs280/with_stop_words_messages.csv'

Mounted at /content/drive


In [None]:
# Reload the processed messages from the saved CSV.
# NOTE(review): in the displayed result 'folder' and 'file' come back as numbers
# (0 rather than '000'); the train/test split compares 'folder' numerically.
# NOTE(review): presumably empty messages come back as NaN, which is why later
# cells check isinstance(message, str) - confirm.
with_stop_words_data = pd.read_csv(with_stop_words_message_path)
with_stop_words_data

Unnamed: 0,folder,file,message,classification
0,0,0,the mailing list i queried about a few weeks a...,0
1,0,1,luxury watches buy your own rolex for only rol...,1
2,0,2,academic qualifications available from prestig...,1
3,0,3,greetings all this is to verify your subscript...,0
4,0,4,try chauncey may conferred the luscious not co...,1
...,...,...,...,...
37817,126,17,great news expec ted infinex ventures inc infx...,1
37818,126,18,the oil sector is going crazy this is our week...,1
37819,126,19,suffering from pain depression or heartburn we...,1
37820,126,20,u n i v e r s i t y d i p l o m a s do you wan...,1


In [None]:
# Split by folder number: folders up to and including 70 train, the rest test.
folder_number = with_stop_words_data['folder']
df_train = with_stop_words_data[folder_number <= 70]
df_test = with_stop_words_data[folder_number > 70]

# Separate the training emails by class (0 = ham, 1 = spam).
train_labels = df_train['classification']
df_train_ham = df_train[train_labels == 0]
df_train_spam = df_train[train_labels == 1]

for caption, subset in (("Training Ham: ", df_train_ham),
                        ("Training Spam: ", df_train_spam),
                        ("Testing Set: ", df_test)):
    print(caption, len(subset))

Training Ham:  7523
Training Spam:  13777
Testing Set:  16522


In [None]:
# Gather every word of the training messages.
# Non-string messages (presumably NaN for messages that were empty in the CSV)
# are skipped: stringifying them would count the literal token "nan" as a word,
# and the feature-matrix and classification code ignore non-string messages too.
all_words = []
for message in df_train['message']:
    if isinstance(message, str):
        all_words.extend(message.split())

# Vocabulary: the 10,000 most frequent words as (word, count) pairs.
word_counts = Counter(all_words)
top_10000_words = word_counts.most_common(10000)
df_top_10000_words = pd.DataFrame(top_10000_words, columns=['Word', 'Frequency'])
df_top_10000_words

Unnamed: 0,Word,Frequency
0,the,131718
1,to,81882
2,a,68020
3,and,63244
4,of,57319
...,...,...
9995,女性会員紹介料,24
9996,お相手の検索,24
9997,連絡先の交換,24
9998,…━…‥…━【高額サポートへの第一歩は,24


### **Creating the feature matrices**

In [None]:
# Binary presence matrix for ham: row i is the i-th training ham email, column j
# is the j-th vocabulary word, and a cell is 1 when that word occurs in the email.
# A word -> column dict replaces the per-word DataFrame filter, so each lookup is
# a constant-time access instead of a scan over the whole vocabulary.
word_to_column = {word: column for column, word in enumerate(df_top_10000_words['Word'])}

ham_feature_matrix = np.zeros((len(df_train_ham), 10000))
for i, message in enumerate(df_train_ham['message']):
    if not isinstance(message, str):
        continue  # non-string messages contribute an all-zero row
    # Presence only, so each distinct word needs to be looked up just once.
    for word in set(message.split()):
        column = word_to_column.get(word)
        if column is not None:
            ham_feature_matrix[i, column] = 1
ham_feature_matrix

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [None]:
# Binary presence matrix for spam: row i is the i-th training spam email, column j
# is the j-th vocabulary word, and a cell is 1 when that word occurs in the email.
# A word -> column dict replaces the per-word DataFrame filter, so each lookup is
# a constant-time access instead of a scan over the whole vocabulary.
word_to_column = {word: column for column, word in enumerate(df_top_10000_words['Word'])}

spam_feature_matrix = np.zeros((len(df_train_spam), 10000))
for i, message in enumerate(df_train_spam['message']):
    if not isinstance(message, str):
        continue  # non-string messages contribute an all-zero row
    # Presence only, so each distinct word needs to be looked up just once.
    for word in set(message.split()):
        column = word_to_column.get(word)
        if column is not None:
            spam_feature_matrix[i, column] = 1
spam_feature_matrix

array([[1., 0., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.]])

### **Computing the Priors**

In [None]:
# Class priors: the share of training emails that belong to each class.
n_train = len(df_train)
prior_ham = len(df_train_ham) / n_train
prior_spam = len(df_train_spam) / n_train

print("Prior probability of ham:", prior_ham)
print("Prior probability of spam:", prior_spam)

Prior probability of ham: 0.3531924882629108
Prior probability of spam: 0.6468075117370892


### **Computing the likelihood of each word**

In [None]:
likelihood_ham = {}
likelihood_spam = {}
a = 1       # additive smoothing constant
V = 10000   # vocabulary size used in the smoothing term

# For each vocabulary column, the number of ham / spam training emails that
# contain the word (column sums of the binary presence matrices).
wordcount_ham = np.sum(ham_feature_matrix, axis=0)
wordcount_spam = np.sum(spam_feature_matrix, axis=0)

# Convert the top 10k words into a list
top_10000_words_list = [key for key, _ in top_10000_words]

# The denominators are the same for every word, so compute them once instead
# of re-summing the count vectors on every loop iteration.
ham_denominator = np.sum(wordcount_ham) + a * V
spam_denominator = np.sum(wordcount_spam) + a * V

# NOTE: iterating top_10000_words yields (word, count) tuples, so both dicts are
# keyed by those tuples, not by the bare word. Code that reads these dicts
# looks entries up with (word, word_counts[word]).
for i, word in enumerate(top_10000_words):
    likelihood_ham[word] = (wordcount_ham[i] + a) / ham_denominator
    likelihood_spam[word] = (wordcount_spam[i] + a) / spam_denominator

In [None]:
# Tabulate the per-word likelihoods for inspection. The likelihood dicts are
# indexed with (word, training count) tuples.
likelihood_data = [
    {
        'Word': word,
        'Likelihood (Ham)': likelihood_ham[(word, word_counts[word])],
        'Likelihood (Spam)': likelihood_spam[(word, word_counts[word])],
    }
    for word in top_10000_words_list
]

df_likelihood = pd.DataFrame(likelihood_data)
df_likelihood

Unnamed: 0,Word,Likelihood (Ham),Likelihood (Spam)
0,the,0.009902,0.007842
1,to,0.009405,0.007899
2,a,0.008754,0.007864
3,and,0.008192,0.007976
4,of,0.007749,0.006442
...,...,...,...
9995,女性会員紹介料,0.000001,0.000035
9996,お相手の検索,0.000001,0.000035
9997,連絡先の交換,0.000001,0.000035
9998,…━…‥…━【高額サポートへの第一歩は,0.000001,0.000035


### **Classifying the emails**

In [None]:
def classify_email(email_message, prior_ham, prior_spam, likelihood_ham, likelihood_spam):
    """Score one message under both classes and return the more likely label.

    Each score starts at the log prior. For every word of the message whose
    (word, word_counts[word]) key is present in a likelihood table, the log of
    that likelihood is added. Non-string messages are scored on priors alone.

    Args:
        email_message: The preprocessed message; anything that is not a str is
            treated as having no words.
        prior_ham: Prior probability of ham.
        prior_spam: Prior probability of spam.
        likelihood_ham: Ham likelihoods keyed by (word, count) tuples.
        likelihood_spam: Spam likelihoods keyed by (word, count) tuples.

    Returns:
        (label, ham_score, spam_score), where label is 0 only when the ham
        score is strictly greater and 1 otherwise (ties included).
    """
    ham_score = np.log(prior_ham)
    spam_score = np.log(prior_spam)

    tokens = email_message.split() if isinstance(email_message, str) else []
    for token in tokens:
        # word_counts is the notebook-level Counter of training word frequencies.
        key = (token, word_counts[token])
        if key in likelihood_ham:
            ham_score += np.log(likelihood_ham[key])
        if key in likelihood_spam:
            spam_score += np.log(likelihood_spam[key])

    label = 0 if ham_score > spam_score else 1
    return label, ham_score, spam_score

### **Testing the Classifier**

In [None]:
# Classify every test email, keeping the true label beside the prediction and
# both log scores.
results = []
for _, row in df_test.iterrows():
    outcome = classify_email(row['message'], prior_ham, prior_spam, likelihood_ham, likelihood_spam)
    results.append((row['classification'],) + outcome)

df_results = pd.DataFrame(results, columns=['True Label', 'Predicted Label', 'Log Likelihood Ham', 'Log Likelihood Spam'])

# Partition the results into hits and misses with one boolean mask.
is_correct = df_results['True Label'] == df_results['Predicted Label']
correctly_classified = df_results[is_correct]
incorrectly_classified = df_results[~is_correct]

print(f"Correctly Classified Emails: {len(correctly_classified)}")
print(f"Incorrectly Classified Emails: {len(incorrectly_classified)}")

Correctly Classified Emails: 15003
Incorrectly Classified Emails: 1519


In [None]:
# Express the hit and miss counts as percentages of all classified test emails.
total_classified = len(df_results)
correctly_classified_percentage = len(correctly_classified) / total_classified * 100
incorrectly_classified_percentage = len(incorrectly_classified) / total_classified * 100

print(f"Percentage of Correctly Classified Emails: {correctly_classified_percentage:.2f}%")
print(f"Percentage of Incorrectly Classified Emails: {incorrectly_classified_percentage:.2f}%")

Percentage of Correctly Classified Emails: 90.81%
Percentage of Incorrectly Classified Emails: 9.19%


### **Performance Evaluation**

In [None]:
true_labels = df_results['True Label']
predicted_labels = df_results['Predicted Label']

# Score the predictions against the true labels; the three metrics are
# independent of one another.
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
accuracy = accuracy_score(true_labels, predicted_labels)

# Report each metric as a percentage with two decimals.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")

Accuracy: 90.81%
Recall: 88.24%
Precision: 97.91%
