In [52]:
from pathlib import Path
import tarfile
import pandas as pd

def load_data():
    tar_files = ["./20050311_spam_2.tar.bz2",
                 "./20021010_easy_ham.tar.bz2",
                 "./20021010_hard_ham.tar.bz2",
                 "./20030228_easy_ham_2.tar.bz2",
                 "./20030228_spam.tar.bz2",
                 "./20030228_spam_2.tar.bz2",]
    for tar_file in tar_files:
        with tarfile.open(tar_file) as tarball:
            tarball.extractall(path="./", filter = "data")
    

load_data()

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
import numpy as np
folder_paths = ["./easy_ham",
           "./spam_2",
           "./easy_ham_2",
           "./hard_ham",
           "./spam"]
mails = []
labels = []
vectorizer = TfidfVectorizer(stop_words = "english")
for folder_path in folder_paths:
    folder = Path(folder_path)
    for file in list(folder.iterdir()):
        with open(file, "r", encoding = "utf-8", errors = "ignore") as f:
            mails.append(f.read())
            if "spam" in folder_path:
                labels.append(1)
            else:
                labels.append(0)
vector_mail = vectorizer.fit_transform(mails)
vector_mail, labels = shuffle(vector_mail, np.array(labels), random_state = 42)

In [62]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_score, recall_score, accuracy_score
mail_test, mail_train, label_test, label_train = vector_mail[:1000], vector_mail[1000:], labels[:1000], labels[1000:]
knn_clf = KNeighborsClassifier()
predictions = cross_val_predict(knn_clf, mail_train, label_train, cv = 3, n_jobs = -1)
p_score = precision_score(label_train, predictions)
r_score = recall_score(label_train, predictions)
a_score = accuracy_score(label_train, predictions)
print(f"precision = {p_score}, recall = {r_score}, accuracy = {a_score}")

precision = 0.9667571234735414, recall = 0.8962264150943396, accuracy = 0.9580474416781023


In [63]:
knn_clf.fit(mail_test, label_test)
y_pred = knn_clf.predict(mail_test)

acc = accuracy_score(label_test, y_pred)        # accuracy on test set
prec = precision_score(label_test, y_pred)
rec = recall_score(label_test, y_pred)

print(f"Accuracy: {acc:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}")

Accuracy: 0.954, Precision: 0.978, Recall: 0.871
