# SpamAssassin Public Mail Corpus

https://spamassassin.apache.org/old/publiccorpus/readme.html

# Imports

In [1]:
import email
import os
import tarfile

import numpy as np
import pandas as pd

from IPython.display import display

# Parameters

In [2]:
data_dirname = "../SpamAssassin/original"
results_dirname = "../SpamAssassin"
results_basename = "data.csv.gz"

# Data Cleaning

In [3]:
def extract_email_body(fp, name=None):
    message = email.message_from_binary_file(fp)
    for part in message.walk():
        content_type = part.get_content_type()
        content_disposition = str(part.get("Content-Disposition"))
        if content_type == "text/plain" and "attachment" not in content_disposition:
            return part.get_payload()

df_list = list()
for basename in os.listdir(data_dirname):
    if basename.endswith(".tar.bz2"):
        filename = os.path.join(data_dirname, basename)
        print(filename)
        with tarfile.open(filename, "r:bz2") as tar:
            data = list()
            index = list()
            for member in tar:
                fp = tar.extractfile(member)
                if fp is not None:
                    index.append(member.name)
                    data.append( extract_email_body(fp) )
            df_list.append( pd.DataFrame(data, columns=["text"], index=index) )
data_df = pd.concat(df_list).fillna(value=np.nan)
data_df["spam"] = data_df.index.map(
    lambda index: "spam" in index
)

with pd.option_context("display.max_rows", 10, "display.max_columns", None):
    display(data_df)

../SpamAssassin/original/20030228_easy_ham.tar.bz2
../SpamAssassin/original/20030228_easy_ham_2.tar.bz2
../SpamAssassin/original/20030228_spam.tar.bz2
../SpamAssassin/original/20050311_spam_2.tar.bz2
../SpamAssassin/original/20030228_hard_ham.tar.bz2


Unnamed: 0,text,spam
easy_ham/00001.7c53336b37003a9286aba55d2945844c,"Date: Wed, 21 Aug 2002 10:54:46 -05...",False
easy_ham/00002.9c4069e25e1ef370c078db7ee85ff9ac,"Martin A posted:\nTassos Papadopoulos, the Gre...",False
easy_ham/00003.860e3c3cee1b42ead714c5c874fe25f7,Man Threatens Explosion In Moscow \n\nThursday...,False
easy_ham/00004.864220c5b6930b209cc287c361c99af1,Klez: The Virus That Won't Die\n \nAlready the...,False
easy_ham/00005.bf27cdeaf0b8c4647ecd61b1d09da613,"> in adding cream to spaghetti carbonara, whi...",False
...,...,...
hard_ham/00247.42534d5df0700cb2adf240556c539947,___ ___ ...,False
hard_ham/00248.9599b06d2d2c08b57ff1de06316d66c0,IN THIS ISSUE:\n\n01. Readers write\n02. Exten...,False
hard_ham/00249.b9183324a9726e8b6c8779045a921243,\nSay goodbye to paper.\n\nDear GlobalSCAPE Cu...,False
hard_ham/00250.c7603b27a45284d12b49adf767b2b6fa,<head>\n<title>FC Sporadic</title>\n<style>\nT...,False


# Save Data

In [4]:
filename = os.path.join(results_dirname, results_basename)
print(filename)
data_df.to_csv(filename, index=False, compression="gzip")

../SpamAssassin/data.csv.gz
