In [9]:
import tarfile
import urllib
from pathlib import Path

In [81]:
import numpy as np
import pandas as pd
import bs4
from sklearn.model_selection import train_test_split

In [10]:
def fetch_spam_data():
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "dataset" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            tar_bz2_file = tarfile.open(path)
            tar_bz2_file.extractall(path=spam_path)
            tar_bz2_file.close()
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [12]:
ham_dir, spam_dir = fetch_spam_data()

In [34]:
ham_filenames = [f for f in sorted(ham_dir.iterdir()) if len(f.name) > 20]
spam_filenames = [f for f in sorted(spam_dir.iterdir()) if len(f.name) > 20]

In [35]:
import email
import email.policy

In [36]:
def load_email(filepath):
    with open(filepath, 'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [37]:
ham_emails = [load_email(filepaths) for filepaths in ham_filenames]
spam_emails = [load_email(filepaths) for filepaths in spam_filenames]
        

In [38]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [39]:
print(ham_emails[100].get_content().strip())

Vernon,

I'm changing the instructions in the SpamAssassin INSTALL file 
right now to:

tar xfvz dcc-dccproc.tar.Z
cd dcc-dccproc-X.X.X
./configure && make && make install
cdcc 'info'


Let me know ASAP if that's innapropriate, since we're shipping 
2.40 today!

C

On Monday, September 2, 2002, at 10:02  AM, Vernon Schryver wrote:

>> Here are the instructions in the spamassassin README:
>>
>>     # tar xfvz dcc-dccproc.tar.Z
>>     # cd dcc-dccproc-X.X.X
>>     # ./configure && make && make install
>>     # cdcc 'new map'
>>     # cdcc 'add dcc.rhyolite.com'
>>     # cdcc 'info'
>
> That's ok, except that the 'new map' and "add dcc.rhyolite.com'
> are respectively unnecessary and wrong.  The map file that comes
> with the source points to localhost and dcc.dcc-servers.net.  Those
> two shipped entries usually do the right thing if there is a local
> server.  If there is no local server or if the local server fails,
> requests are instantly sent to one of the public server names listed

some emails are actually multi-part, with images and attachments. Let us look at various types of structures we have

In [45]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        multipart = ", ".join([get_email_structure(sub_email) for sub_email in payload])
        return f"multipart {multipart}"
    else:
        return email.get_content_type()

In [46]:
from collections import Counter

In [47]:
def structure_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1

    return structures

In [48]:
structure_counter(ham_emails)

Counter({'text/plain': 2408,
         'multipart text/plain, application/pgp-signature': 66,
         'multipart text/plain, text/html': 8,
         'multipart text/plain, text/plain': 4,
         'multipart text/plain': 3,
         'multipart text/plain, application/octet-stream': 2,
         'multipart text/plain, text/enriched': 1,
         'multipart text/plain, application/ms-tnef, text/plain': 1,
         'multipart multipart text/plain, text/plain, text/plain, application/pgp-signature': 1,
         'multipart text/plain, video/mng': 1,
         'multipart text/plain, multipart text/plain': 1,
         'multipart text/plain, application/x-pkcs7-signature': 1,
         'multipart text/plain, multipart text/plain, text/plain, text/rfc822-headers': 1,
         'multipart text/plain, multipart text/plain, text/plain, multipart multipart text/plain, application/x-pkcs7-signature': 1,
         'multipart text/plain, application/x-java-applet': 1})

In [44]:
structure_counter(spam_emails)

Counter({'text/plain': 218,
         'text/html': 183,
         'multiparttext/plain, text/html': 45,
         'multiparttext/html': 20,
         'multiparttext/plain': 19,
         'multipartmultiparttext/html': 5,
         'multiparttext/plain, image/jpeg': 3,
         'multiparttext/html, application/octet-stream': 2,
         'multiparttext/plain, application/octet-stream': 1,
         'multiparttext/html, text/plain': 1,
         'multipartmultiparttext/html, application/octet-stream, image/jpeg': 1,
         'multipartmultiparttext/plain, text/html, image/gif': 1,
         'multipart/alternative': 1})

It looks like the ham emails are more often plain text, while spamhas quiet lot of a HTML. Moreover, quiet a few ham emails are signed using PGP, while no spam is.

## Let us look at the email headers

In [67]:
for header, value in spam_emails[0].items():
    print(header, ":", value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [70]:
spam_emails[0]['Subject']

'Life Insurance - Why Pay More?'

## Train Test Split

In [73]:
X = np.array(ham_emails + spam_emails, dtype=object)

In [74]:
y = np.array()

array([<email.message.EmailMessage object at 0x111542470>,
       <email.message.EmailMessage object at 0x114005810>,
       <email.message.EmailMessage object at 0x114006830>, ...,
       <email.message.EmailMessage object at 0x112aa8130>,
       <email.message.EmailMessage object at 0x112aa8610>,
       <email.message.EmailMessage object at 0x112aa9a50>], dtype=object)

In [76]:
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

In [80]:
# X = X_train and y = y_train
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## lets select all the emails with html for the train set

In [82]:
html_spam_emails = [email for email in X[y == 1] if get_email_structure(email) == "text/html"]

In [98]:
def email_to_text(email):
    html = None
    for part in email.walk():
        content_type_ = part.get_content_type()
        if not content_type_ in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if content_type_ == "text/plain":
            return content
        else:
            html = content
        if html:
            return bs4.BeautifulSoup(html, 'html.parser').text
                                 
                                 
    

In [109]:
print(email_to_text(X[8]))

Once upon a time, ""Angles" wrote :

> Matthias Saou (matthias@egwn.net) wrote*:
> >You're really better off backuping all placed where you know you've hand
> >edited or installed some files. For me that's only /etc/, /root/ and
> >/home/. Then you reinstall cleanly, formating "/", put your /home/ files
> >back into place and you're ready to go.
> 
> Matthias I gotta believe you, I've been using your RPMs for some time now
> :) That's the way I'll do it.

I'm no "messiah", just do what you think suits you the best :-)

Matthias

-- 
Clean custom Red Hat Linux rpm packages : http://freshrpms.net/
Red Hat Linux release 7.3 (Valhalla) running Linux kernel 2.4.18-10acpi
Load : 0.05 0.06 0.03

_______________________________________________
RPM-List mailing list <RPM-List@freshrpms.net>
http://lists.freshrpms.net/mailman/listinfo/rpm-list





In [124]:
import nltk

stemmer = nltk.PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute",
             "Compulsive"):
    print(word, "=>", stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


# let us build a pipeline


In [117]:
from sklearn.base import BaseEstimator, TransformerMixin
import re
import urlextract

In [125]:
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True, replace_urls=True, replace_numbers=True, stemming=True ):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email_ in X:
            text_ = email_to_text(email_) or ""
            if self.lower_case:
                text_ = text_.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text_)))
                urls.sort(key= lambda url: len(url), reverse=True)
                for url in urls:
                    text_ = text_.replace(url, " URL ")
            if self.replace_numbers:
                text_ = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text_)
            if self.remove_punctuation:
                text_ = re.sub(r'\W+', ' ', text_, flags=re.M)
            word_counts = Counter(text_.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [126]:
url_extractor = urlextract.URLExtract()

X_few = X[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X)
X_few_wordcounts

array([Counter({'the': 14, 'i': 10, 'number': 9, 'server': 5, 'and': 5, 'a': 4, 'of': 4, 'is': 4, 'to': 3, 'it': 3, 'on': 3, 'when': 3, 'get': 3, 'as': 3, 'url': 3, 'numbermbp': 3, 'list': 3, 'but': 2, 'your': 2, 'you': 2, 'don': 2, 't': 2, 'that': 2, 'too': 2, 'mirror': 2, 'ftp': 2, 'which': 2, 'use': 2, 'system': 2, 'matthia': 2, 'rpm': 2, 'onc': 1, 'upon': 1, 'time': 1, 'brian': 1, 'wrote': 1, 'yeah': 1, 'tri': 1, 'take': 1, 'easi': 1, 'golden': 1, 'rule': 1, 'internet': 1, 'find': 1, 'free': 1, 'resourc': 1, 'piss': 1, 'em': 1, 'off': 1, 'realli': 1, 'appreci': 1, 'work': 1, 'done': 1, 'thing': 1, 'wish': 1, 'could': 1, 'respect': 1, 'worri': 1, 'much': 1, 'day': 1, 'busi': 1, 'll': 1, 'cleanup': 1, 'publish': 1, 'address': 1, 'some': 1, 'mani': 1, 'exist': 1, 'm': 1, 'awar': 1, 'at': 1, 'least': 1, 'even': 1, 'an': 1, 'apt': 1, 'one': 1, 'for': 1, 'now': 1, 'limit': 1, 'far': 1, 'enough': 1, 'unlock': 1, 'psych': 1, 'iso': 1, 'imag': 1, 'monday': 1, 'bandwidth': 1, 'usag': 1, 'cur

In [132]:
X_few_wordcounts[0]

Counter({'the': 14,
         'i': 10,
         'number': 9,
         'server': 5,
         'and': 5,
         'a': 4,
         'of': 4,
         'is': 4,
         'to': 3,
         'it': 3,
         'on': 3,
         'when': 3,
         'get': 3,
         'as': 3,
         'url': 3,
         'numbermbp': 3,
         'list': 3,
         'but': 2,
         'your': 2,
         'you': 2,
         'don': 2,
         't': 2,
         'that': 2,
         'too': 2,
         'mirror': 2,
         'ftp': 2,
         'which': 2,
         'use': 2,
         'system': 2,
         'matthia': 2,
         'rpm': 2,
         'onc': 1,
         'upon': 1,
         'time': 1,
         'brian': 1,
         'wrote': 1,
         'yeah': 1,
         'tri': 1,
         'take': 1,
         'easi': 1,
         'golden': 1,
         'rule': 1,
         'internet': 1,
         'find': 1,
         'free': 1,
         'resourc': 1,
         'piss': 1,
         'em': 1,
         'off': 1,
         'realli': 1,
     

In [128]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1
                            for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)),
                          shape=(len(X), self.vocabulary_size + 1))

In [129]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 16204 stored elements and shape (1920, 11)>

In [130]:
X_few_vectors.toarray()

array([[171,   9,  14, ...,   1,   3,   4],
       [ 58,   0,   3, ...,   0,   1,   1],
       [145,  10,   9, ...,   3,   3,   1],
       ...,
       [ 17,   6,   1, ...,   0,   2,   0],
       [166,   4,   6, ...,   0,   3,   0],
       [282,   7,   9, ...,   5,   2,   1]])

In [131]:
vocab_transformer.vocabulary_

{'number': 1,
 'the': 2,
 'to': 3,
 'a': 4,
 'and': 5,
 'of': 6,
 'i': 7,
 'in': 8,
 'url': 9,
 'is': 10}

In [134]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X)

In [135]:
X_train_transformed

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 149590 stored elements and shape (1920, 1001)>

In [137]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y, cv=3)
score.mean()

0.9812500000000001

In [139]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y)

y_pred = log_clf.predict(X_test_transformed)

print(f"Precision: {precision_score(y_test, y_pred):.2%}")
print(f"Recall: {recall_score(y_test, y_pred):.2%}")

Precision: 98.73%
Recall: 95.12%
