In [1]:
import os
import random
import tarfile
import pandas as pd
from bs4 import BeautifulSoup
import re
from tqdm import tqdm 



  from pandas.core import (


### Dataset Creation

In [2]:
# Read labels
labels_path = 'trec07p/full/index'
emails_dir = 'trec07p/data'

labels = []
with open(labels_path, 'r') as file:
    for line in file:
        split_line = line.strip().split()
        label = split_line[0]
        file_name = split_line[1].split('/')[-1]
        labels.append((file_name, label))



In [3]:
labels

[('inmail.1', 'spam'),
 ('inmail.2', 'ham'),
 ('inmail.3', 'spam'),
 ('inmail.4', 'spam'),
 ('inmail.5', 'spam'),
 ('inmail.6', 'spam'),
 ('inmail.7', 'spam'),
 ('inmail.8', 'spam'),
 ('inmail.9', 'spam'),
 ('inmail.10', 'ham'),
 ('inmail.11', 'spam'),
 ('inmail.12', 'spam'),
 ('inmail.13', 'spam'),
 ('inmail.14', 'spam'),
 ('inmail.15', 'spam'),
 ('inmail.16', 'spam'),
 ('inmail.17', 'spam'),
 ('inmail.18', 'spam'),
 ('inmail.19', 'spam'),
 ('inmail.20', 'ham'),
 ('inmail.21', 'ham'),
 ('inmail.22', 'spam'),
 ('inmail.23', 'spam'),
 ('inmail.24', 'spam'),
 ('inmail.25', 'spam'),
 ('inmail.26', 'spam'),
 ('inmail.27', 'spam'),
 ('inmail.28', 'spam'),
 ('inmail.29', 'ham'),
 ('inmail.30', 'spam'),
 ('inmail.31', 'ham'),
 ('inmail.32', 'spam'),
 ('inmail.33', 'spam'),
 ('inmail.34', 'ham'),
 ('inmail.35', 'spam'),
 ('inmail.36', 'spam'),
 ('inmail.37', 'spam'),
 ('inmail.38', 'spam'),
 ('inmail.39', 'spam'),
 ('inmail.40', 'spam'),
 ('inmail.41', 'spam'),
 ('inmail.42', 'spam'),
 ('inmai

In [4]:
def clean_html(html):
    text = re.sub(r'<[^>]*>', '', html)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

In [5]:
def preprocess_text(text, stopwords):
    
    text = text.lower()
    
    
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    
    words = text.split()
    filtered_words = [word for word in words if word not in stopwords]
    
    
    text = ' '.join(filtered_words)
    
    return text

In [22]:
def extract_text(document_content, stopwords):
    html_pattern1 = r'<html>(.*?)</html>'
    html_pattern2 = r'<HTML>(.*?)</HTML>'
    match1 = re.search(html_pattern1, document_content, re.DOTALL | re.IGNORECASE)
    match2 = re.search(html_pattern2, document_content, re.DOTALL | re.IGNORECASE)
    if match1:
        html_content = match1.group(1)
        extracted_text = clean_html(match1.group(1).strip())     
    elif match2:
        html_content = match1.group(1)
        extracted_text = clean_html(match1.group(1).strip())     
    else:
        lines_pattern = r'Lines:\s*\d+\s*(.*)'
        match = re.search(lines_pattern, document_content, re.DOTALL | re.IGNORECASE)
        if match:
            extracted_text = match.group(1).strip()
        else:
            extracted_text = ''
    
    # Preprocess the extracted text
    preprocessed_text = preprocess_text(extracted_text, stopwords)
    
    return preprocessed_text

In [23]:
with open('stoplist.txt', 'r') as file:
        stopwords = set(file.read().strip().splitlines())

In [24]:
# Process each email
data = []
i = 0
for file_name, label in tqdm(labels, desc = "Parsing"):
    file_path = os.path.join(emails_dir, file_name)
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        content = file.read()
    content = extract_text(content, stopwords)
    data.append({'text': content, 'label': 'spam' if label == 'spam' else 'ham'})    
    

Parsing: 100%|████████████████████████████| 75419/75419 [00:29<00:00, 2519.07it/s]


In [25]:
# Convert to DataFrame
df = pd.DataFrame(data)
df.head()


Unnamed: 0,text,label
0,feel pressure perform rising occasion try viag...,spam
1,hi ive updated gulus check mirrors typo debian...,ham
2,authentic viagra mega authenticv g r discount ...,spam
3,hey billy fun going night talking felt insecur...,spam
4,system home h ave capabilities linked know wit...,spam


In [28]:
# Connecting to elastic search
from elasticsearch7 import Elasticsearch
from elasticsearch7.client import IndicesClient



es = Elasticsearch("http://localhost:9200")
ic = IndicesClient(es)
print(es.ping())

True


#### Creating Index

In [30]:
import random


index_name = "spam_data"
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)


for index, row in tqdm(df.iterrows(), desc="Creating Index"):
    
    soup = BeautifulSoup(row["text"], "html.parser")
    clean_text = soup.get_text()
    
   

    label = "yes" if row["label"] == "spam" else "no"
    
    
    split = "train" if random.random() < 0.8 else "test"
    
    
    document = {
        "text": clean_text,
        "label": label,
        "split": split
    }
    
    # Index the document in Elasticsearch
    es.index(index=index_name, body=document)

  es.index(index=index_name, body=document)
Creating Index: 75419it [06:18, 199.20it/s]


## Part 1: Manual Spam Features

In [89]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')
#Trial A
manual_ngrams = ["free", "win", "porn", "click here"]



In [90]:
#Trial B

file_path = 'ngrams.txt'


provided_ngrams = []


with open(file_path, 'r') as file:
    for line in file:
        
        stripped_line = line.strip()
        
        provided_ngrams.append(stripped_line)


print(provided_ngrams)
provided_ngrams = list(set(provided_ngrams))

['free', 'spam', 'click', 'buy', 'clearance', 'shopper', 'order', 'earn', 'cash', 'extra', 'money', 'double', 'collect', 'credit', 'check', 'affordable', 'fast', 'price', 'loans', 'profit', 'refinance', 'hidden', 'freedom', 'chance', 'miracle', 'lose', 'home', 'remove', 'success', 'virus', 'malware', 'ad', 'subscribe', 'sales', 'performance', 'viagra', 'valium', 'medicine', 'diagnostics', 'million', 'join', 'deal', 'unsolicited', 'trial', 'prize', 'now', 'legal', 'bonus', 'limited', 'instant', 'luxury', 'celebrity', 'only', 'compare', 'win', 'viagra', '$$$', '$discount', 'click here', 'meet singles', 'incredible deal', 'lose weight', 'act now', '100% free', 'fast cash', 'million dollars', 'lower interest rate', 'visit our website', 'no credit check']


In [91]:
def create_feature_matrix(ngrams):
    vectorizer = CountVectorizer(vocabulary=ngrams)
    
    query = {
        "query": {
            "match_all": {}
        }
    }
    
    results = es.search(index=index_name, body=query, scroll="1m", size=1000)
    scroll_id = results["_scroll_id"]
    hits = results["hits"]["hits"]
    
    texts = []
    labels = []
    
    while len(hits) > 0:
        texts.extend([doc["_source"]["text"] for doc in hits])
        labels.extend([doc["_source"]["label"] for doc in hits])
        
        results = es.scroll(scroll_id=scroll_id, scroll="1m")
        scroll_id = results["_scroll_id"]
        hits = results["hits"]["hits"]
    
    feature_matrix = vectorizer.transform(texts)
    
    return feature_matrix, labels

In [92]:
# Function to train and evaluate a model
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label="yes")
    recall = recall_score(y_test, y_pred, pos_label="yes")
    f1 = f1_score(y_test, y_pred, pos_label="yes")
    

    return accuracy, precision, recall, f1

In [93]:
# Trial A: Manual ngrams
print("Trial A: Manual ngrams")
X, y = create_feature_matrix(manual_ngrams)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# decision tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_accuracy, dt_precision, dt_recall, dt_f1 = train_and_evaluate(dt_model, X_train, y_train, X_test, y_test)

# logistic regression
lr_model = LogisticRegression(random_state=42)
lr_accuracy, lr_precision, lr_recall, lr_f1 = train_and_evaluate(lr_model, X_train, y_train, X_test, y_test)

#  Naive Bayes
nb_model = MultinomialNB()
nb_accuracy, nb_precision, nb_recall, nb_f1 = train_and_evaluate(nb_model, X_train, y_train, X_test, y_test)

print("Decision Tree: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    dt_accuracy, dt_precision, dt_recall, dt_f1))
print("Logistic Regression: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    lr_accuracy, lr_precision, lr_recall, lr_f1))
print("Naive Bayes: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    nb_accuracy, nb_precision, nb_recall, nb_f1))



Trial A: Manual ngrams
Decision Tree: Accuracy=0.67, Precision=0.67, Recall=1.00, F1-score=0.80
Logistic Regression: Accuracy=0.66, Precision=0.66, Recall=1.00, F1-score=0.80
Naive Bayes: Accuracy=0.66, Precision=0.66, Recall=1.00, F1-score=0.80


In [94]:
# Trial B: Provided ngrams
print("\nTrial B: Provided ngrams")
X, y = create_feature_matrix(provided_ngrams)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# decision tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_accuracy, dt_precision, dt_recall, dt_f1 = train_and_evaluate(dt_model, X_train, y_train, X_test, y_test)

#  logistic regression
lr_model = LogisticRegression(random_state=42)
lr_accuracy, lr_precision, lr_recall, lr_f1 = train_and_evaluate(lr_model, X_train, y_train, X_test, y_test)

#  Naive Bayes
nb_model = MultinomialNB()
nb_accuracy, nb_precision, nb_recall, nb_f1 = train_and_evaluate(nb_model, X_train, y_train, X_test, y_test)

print("Decision Tree: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    dt_accuracy, dt_precision, dt_recall, dt_f1))
print("Logistic Regression: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    lr_accuracy, lr_precision, lr_recall, lr_f1))
print("Naive Bayes: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    nb_accuracy, nb_precision, nb_recall, nb_f1))


Trial B: Provided ngrams
Decision Tree: Accuracy=0.70, Precision=0.70, Recall=0.98, F1-score=0.81
Logistic Regression: Accuracy=0.68, Precision=0.68, Recall=0.96, F1-score=0.80
Naive Bayes: Accuracy=0.67, Precision=0.69, Recall=0.91, F1-score=0.78


#### Top Spam Unigrams

In [111]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test, vectorizer):
    
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label="yes")
    recall = recall_score(y_test, y_pred, pos_label="yes")
    f1 = f1_score(y_test, y_pred, pos_label="yes")
    
    feature_names = vectorizer.get_feature_names_out()
    
    if isinstance(model, LogisticRegression):
        coef = model.coef_[0]
    elif isinstance(model, MultinomialNB):
        coef = model.feature_log_prob_[1]
    else:
        coef = model.feature_importances_
    
    top_spam_unigrams = [feature_names[i] for i in coef.argsort()[::-1][:10]]
    
    return accuracy, precision, recall, f1, top_spam_unigrams

# Trial A: Manual ngrams
print("Trial A: Manual ngrams")
X, y, texts, vectorizer = create_feature_matrix(manual_ngrams)
X_train, X_test, y_train, y_test, texts_train, texts_test = train_test_split(X, y, texts, test_size=0.2, random_state=42)

# Train and evaluate decision tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_accuracy, dt_precision, dt_recall, dt_f1, dt_top_unigrams = train_and_evaluate(dt_model, X_train, y_train, X_test, y_test, vectorizer)

# Train and evaluate logistic regression
lr_model = LogisticRegression(random_state=42)
lr_accuracy, lr_precision, lr_recall, lr_f1, lr_top_unigrams = train_and_evaluate(lr_model, X_train, y_train, X_test, y_test, vectorizer)

# Train and evaluate Naive Bayes
nb_model = MultinomialNB()
nb_accuracy, nb_precision, nb_recall, nb_f1, nb_top_unigrams = train_and_evaluate(nb_model, X_train, y_train, X_test, y_test, vectorizer)

print("Decision Tree: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    dt_accuracy, dt_precision, dt_recall, dt_f1))
print("Top Spam Unigrams (Decision Tree):", dt_top_unigrams)

print("Logistic Regression: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    lr_accuracy, lr_precision, lr_recall, lr_f1))
print("Top Spam Unigrams (Logistic Regression):", lr_top_unigrams)

print("Naive Bayes: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    nb_accuracy, nb_precision, nb_recall, nb_f1))
print("Top Spam Unigrams (Naive Bayes):", nb_top_unigrams)


Trial A: Manual ngrams
Decision Tree: Accuracy=0.67, Precision=0.67, Recall=1.00, F1-score=0.80
Top Spam Unigrams (Decision Tree): ['free', 'win', 'porn', 'click here']
Logistic Regression: Accuracy=0.66, Precision=0.66, Recall=1.00, F1-score=0.80
Top Spam Unigrams (Logistic Regression): ['porn', 'click here', 'free', 'win']
Naive Bayes: Accuracy=0.66, Precision=0.66, Recall=1.00, F1-score=0.80
Top Spam Unigrams (Naive Bayes): ['free', 'win', 'porn', 'click here']


In [112]:

# Trial B: Provided ngrams
print("\nTrial B: Provided ngrams")
X, y, texts, vectorizer = create_feature_matrix(provided_ngrams)
X_train, X_test, y_train, y_test, texts_train, texts_test = train_test_split(X, y, texts, test_size=0.2, random_state=42)

dt_model = DecisionTreeClassifier(random_state=42)
dt_accuracy, dt_precision, dt_recall, dt_f1, dt_top_unigrams = train_and_evaluate(dt_model, X_train, y_train, X_test, y_test, vectorizer)


lr_model = LogisticRegression(random_state=42)
lr_accuracy, lr_precision, lr_recall, lr_f1, lr_top_unigrams = train_and_evaluate(lr_model, X_train, y_train, X_test, y_test, vectorizer)


nb_model = MultinomialNB()
nb_accuracy, nb_precision, nb_recall, nb_f1, nb_top_unigrams = train_and_evaluate(nb_model, X_train, y_train, X_test, y_test, vectorizer)

print("Decision Tree: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    dt_accuracy, dt_precision, dt_recall, dt_f1))
print("Top Spam Unigrams (Decision Tree):", dt_top_unigrams)

print("Logistic Regression: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    lr_accuracy, lr_precision, lr_recall, lr_f1))
print("Top Spam Unigrams (Logistic Regression):", lr_top_unigrams)

print("Naive Bayes: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-score={:.2f}".format(
    nb_accuracy, nb_precision, nb_recall, nb_f1))
print("Top Spam Unigrams (Naive Bayes):", nb_top_unigrams)


Trial B: Provided ngrams
Decision Tree: Accuracy=0.70, Precision=0.70, Recall=0.98, F1-score=0.81
Top Spam Unigrams (Decision Tree): ['money', 'click', 'price', 'buy', 'viagra', 'subscribe', 'check', 'free', 'home', 'fast']
Logistic Regression: Accuracy=0.68, Precision=0.68, Recall=0.96, F1-score=0.80
Top Spam Unigrams (Logistic Regression): ['refinance', 'viagra', 'money', 'bonus', 'buy', 'instant', 'shopper', 'earn', 'lose', 'fast']
Naive Bayes: Accuracy=0.67, Precision=0.69, Recall=0.91, F1-score=0.78
Top Spam Unigrams (Naive Bayes): ['price', 'money', 'viagra', 'buy', 'click', 'free', 'fast', 'check', 'order', 'home']


## Part 2: All unigrams as features 

In [95]:


# Function to extract all unigrams from documents
def extract_unigrams(text):
    
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    
    unigrams = text.split()
    return unigrams


In [96]:
from scipy.sparse import save_npz, load_npz
import numpy as np
# Function to create sparse feature matrices using Elasticsearch
def create_feature_matrices():
    
    query = {
        "query": {
            "match_all": {}
        }
    }
    results = es.search(index=index_name, body=query, scroll="1m", size=1000)
    scroll_id = results["_scroll_id"]
    hits = results["hits"]["hits"]
    
    texts = []
    labels = []
    
    while len(hits) > 0:
        texts.extend([doc["_source"]["text"] for doc in hits])
        labels.extend([doc["_source"]["label"] for doc in hits])
        
        results = es.scroll(scroll_id=scroll_id, scroll="1m")
        scroll_id = results["_scroll_id"]
        hits = results["hits"]["hits"]
    
    
    vectorizer = CountVectorizer(analyzer=extract_unigrams)
    
    
    vectorizer.fit(texts)
    
    
    X = vectorizer.transform(texts)
    y = labels  
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    
    save_npz("X_train.npz", X_train)
    save_npz("X_test.npz", X_test)
    np.save("y_train.npy", y_train)
    np.save("y_test.npy", y_test)
    
    return vectorizer



In [102]:
# Function to train and evaluate the model
def train_and_evaluate(vectorizer):
    
    X_train = load_npz("X_train.npz")
    X_test = load_npz("X_test.npz")
    y_train = np.load("y_train.npy")
    y_test = np.load("y_test.npy")
    
    
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    precision = precision_score(y_test, y_pred, pos_label="yes")
    recall = recall_score(y_test, y_pred, pos_label="yes")
    f1 = f1_score(y_test, y_pred, pos_label="yes")
    
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    
    feature_names = vectorizer.get_feature_names_out()
    coef = model.coef_[0]
    top_spam_unigrams = [feature_names[i] for i in coef.argsort()[::-1][:10]]
    
    return top_spam_unigrams



In [98]:
# Create sparse feature matrices
vectorizer = create_feature_matrices()

In [103]:
# Train and evaluate the model
top_spam_unigrams = train_and_evaluate(vectorizer)

print("Top Spam Unigrams:", top_spam_unigrams)

Accuracy: 0.8121188013789445
Precision: 0.9928688974218322
Recall: 0.7224106964677709
F1-score: 0.8363174309807092
Top Spam Unigrams: ['click', 'viagra', 'symbol', 'medication', 'girl', 'adf', 'vbscom', 'httpcsmonet', 'hot', 'producttestpanelspeedyuwaterlooca']
