In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split
import os
import urllib
import bs4
import tarfile
import email
import email.policy
import nltk
import urlextract
import re

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score

In [2]:
DOWNLOAD_ROOT = 'http://spamassassin.apache.org/old/publiccorpus/'
HAM_URL = DOWNLOAD_ROOT + '20030228_easy_ham.tar.bz2'
SPAM_URL = DOWNLOAD_ROOT + '20030228_spam.tar.bz2'
SPAM_PATH = os.path.join('datasets', 'spam')

def fetch_spam_data(spam_url = SPAM_URL, spam_path = SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (('ham.tar.bz2', HAM_URL), ('spam.tar.bz2', SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path = SPAM_PATH)
        tar_bz2_file.close()

In [3]:
fetch_spam_data()

In [4]:
# Load all the emails
HAM_DIR = os.path.join(SPAM_PATH, 'easy_ham')
SPAM_DIR = os.path.join(SPAM_PATH, 'spam')
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [5]:
len(ham_filenames)

2500

In [6]:
len(spam_filenames)

500

In [7]:
ham_filenames[0]

'00001.7c53336b37003a9286aba55d2945844c'

In [8]:
with open(os.path.join(HAM_DIR, ham_filenames[0]), 'rb') as f:
    print(email.parser.BytesParser(policy = email.policy.default).parse(f))

Return-Path: <exmh-workers-admin@spamassassin.taint.org>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36
	for <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)
Received: from listman.spamassassin.taint.org (listman.spamassassin.taint.org
 [66.187.233.211]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id
 g7MBYrZ04811 for    <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002
 12:34:53 +0100
Received: from listman.spamassassin.taint.org (localhost.localdomain
 [127.0.0.1]) by    listman.redhat.com (Postfix) with ESMTP id 8386540858;
 Thu, 22 Aug 2002    07:35:02 -0400 (EDT)
Delivered-To: exmh-workers@listman.spamassassin.taint.org
Received: from int-mx1.corp.spamassassin.taint.org
 (int-mx1.corp.spamassassin.taint.or

In [9]:
with open(os.path.join(SPAM_DIR, spam_filenames[0]), 'rb') as f:
    print(email.parser.BytesParser(policy = email.policy.default).parse(f))

Return-Path: <12a1mailbot1@web.de>
Delivered-To: zzzz@localhost.spamassassin.taint.org
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32
	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received: from mail.webnote.net [193.120.211.219]
	by localhost with POP3 (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received: from dd_it7 ([210.97.77.167])
	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623
	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From: 12a1mailbot1@web.de
Received: from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft
 SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To: <dcek1a1@netsgo.com>
Subject: Life Insurance - Why Pay More?
Date: Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version: 1.0
Message-ID: <0103c1042001882DD_IT7@dd_it7>
Content-Type: text/html; charset="iso-8859-1"
Content-Transfer-Encoding: quoted-

In [10]:
def load_email(is_spam, filename):
    directory = SPAM_DIR if is_spam else HAM_DIR
    with open(os.path.join(directory, filename), 'rb') as f:
        return email.parser.BytesParser(policy = email.policy.default).parse(f)
            

In [11]:
ham_emails = [load_email(is_spam = False, filename = name) for name in ham_filenames]
spam_emails = [load_email(is_spam = True, filename = name) for name in spam_filenames]

In [12]:
print(ham_emails[0].get_content().strip())

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

In [13]:
print(spam_emails[0].get_content().strip())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META content="text/html; charset=windows-1252" http-equiv=Content-Type>
<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>
<BODY><!-- Inserted by Calypso -->
<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none 
style="COLOR: black; DISPLAY: none" width="100%">
  <TBODY>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TD></TR>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT 
color=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Why Spend More Than You Have To?
<CENTER><FONT color=#ff0000 face="Copp

In [14]:
ham_multipart = [email.is_multipart() for email in ham_emails]
ham_multipart_count = np.sum(ham_multipart)
spam_multipart = [email.is_multipart() for email in spam_emails]
spam_multipart_count = np.sum(spam_multipart)
print("Ham has {} multipart emails from {}".format(ham_multipart_count, len(ham_emails)))
print("Spam has {} multipart emails from {}".format(spam_multipart_count, len(spam_emails)))

Ham has 92 multipart emails from 2500
Spam has 98 multipart emails from 500


In [15]:
# Ham Payload
ham_payload = ham_emails[0].get_payload()
ham_payload

'    Date:        Wed, 21 Aug 2002 10:54:46 -0500\n    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>\n    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>\n\n\n  | I can\'t reproduce this error.\n\nFor me it is very repeatable... (like every time, without fail).\n\nThis is the debug log of the pick happening ...\n\n18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}\n18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury\n18:19:04 Ftoc_PickMsgs {{1 hit}}\n18:19:04 Marking 1 hits\n18:19:04 tkerror: syntax error in expression "int ...\n\nNote, if I run the pick command by hand ...\n\ndelta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury\n1 hit\n\nThat\'s where the "1 hit" comes from (obviously).  The version of nmh I\'m\nusing is ...\n\ndelta$ pick -version\npick -- nmh-1.0.4 [compiled on fuchsia.c

In [16]:
# Spam Payload
spam_payload = spam_emails[0].get_payload()
spam_payload

'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n<HTML><HEAD>\n<META content=3D"text/html; charset=3Dwindows-1252" http-equiv=3DContent-T=\nype>\n<META content=3D"MSHTML 5.00.2314.1000" name=3DGENERATOR></HEAD>\n<BODY><!-- Inserted by Calypso -->\n<TABLE border=3D0 cellPadding=3D0 cellSpacing=3D2 id=3D_CalyPrintHeader_ r=\nules=3Dnone \nstyle=3D"COLOR: black; DISPLAY: none" width=3D"100%">\n  <TBODY>\n  <TR>\n    <TD colSpan=3D3>\n      <HR color=3Dblack noShade SIZE=3D1>\n    </TD></TR></TD></TR>\n  <TR>\n    <TD colSpan=3D3>\n      <HR color=3Dblack noShade SIZE=3D1>\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso=\n --><FONT \ncolor=3D#000000 face=3DVERDANA,ARIAL,HELVETICA size=3D-2><BR></FONT></TD><=\n/TR></TABLE><!-- End Calypso --><FONT color=3D#ff0000 \nface=3D"Copperplate Gothic Bold" size=3D5 PTSIZE=3D"10">\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=3D#ff=\n0000 \nface=3D"Copperplate Gothic Bold" size=3D5 P

In [17]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return("multipart({})".format(
              [get_email_structure(sub_email) for sub_email in payload]))
    else:
        return(email.get_content_type())

In [18]:
def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [19]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ("multipart(['text/plain', 'application/pgp-signature'])", 66),
 ("multipart(['text/plain', 'text/html'])", 8),
 ("multipart(['text/plain', 'text/plain'])", 4),
 ("multipart(['text/plain'])", 3),
 ("multipart(['text/plain', 'application/octet-stream'])", 2),
 ("multipart(['text/plain', 'text/enriched'])", 1),
 ("multipart(['text/plain', 'application/ms-tnef', 'text/plain'])", 1),
 ('multipart(["multipart([\'text/plain\', \'text/plain\', \'text/plain\'])", \'application/pgp-signature\'])',
  1),
 ("multipart(['text/plain', 'video/mng'])", 1),
 ('multipart([\'text/plain\', "multipart([\'text/plain\'])"])', 1),
 ("multipart(['text/plain', 'application/x-pkcs7-signature'])", 1),
 ('multipart([\'text/plain\', "multipart([\'text/plain\', \'text/plain\'])", \'text/rfc822-headers\'])',
  1),
 ('multipart([\'text/plain\', "multipart([\'text/plain\', \'text/plain\'])", \'multipart(["multipart([\\\'text/plain\\\', \\\'application/x-pkcs7-signature\\\'])"])\'])',
  1),
 ("m

In [20]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ("multipart(['text/plain', 'text/html'])", 45),
 ("multipart(['text/html'])", 20),
 ("multipart(['text/plain'])", 19),
 ('multipart(["multipart([\'text/html\'])"])', 5),
 ("multipart(['text/plain', 'image/jpeg'])", 3),
 ("multipart(['text/html', 'application/octet-stream'])", 2),
 ("multipart(['text/plain', 'application/octet-stream'])", 1),
 ("multipart(['text/html', 'text/plain'])", 1),
 ('multipart(["multipart([\'text/html\'])", \'application/octet-stream\', \'image/jpeg\'])',
  1),
 ('multipart(["multipart([\'text/plain\', \'text/html\'])", \'image/gif\'])',
  1),
 ('multipart/alternative', 1)]

In [21]:
spam_emails[0].items()

[('Return-Path', '<12a1mailbot1@web.de>'),
 ('Delivered-To', 'zzzz@localhost.spamassassin.taint.org'),
 ('Received',
  'from localhost (localhost [127.0.0.1])\tby phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)'),
 ('Received',
  'from mail.webnote.net [193.120.211.219]\tby localhost with POP3 (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)'),
 ('Received',
  'from dd_it7 ([210.97.77.167])\tby webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623\tfor <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100'),
 ('From', '12a1mailbot1@web.de'),
 ('Received',
  'from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);\t Sat, 24 Aug 2002 09:42:10 +0900'),
 ('To', 'dcek1a1@netsgo.com'),
 ('Subject', 'Life Insurance - Why Pay More?'),
 ('Date', 'Wed, 21 Aug 2002 20:31:57 -1600'),
 ('MIME-Version', '1.0'),
 ('Message-ID', '<0103c104200

In [22]:
spam_emails[0]['Subject']

'Life Insurance - Why Pay More?'

In [23]:
# Split into training and test set
X = np.array(ham_emails + spam_emails)
y = np.array([0]*len(ham_emails) + [1]*len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [24]:
def html_to_plain_text(html):
    soup = bs4.BeautifulSoup(html, 'lxml')
    text = [s.extract() for s in soup(['style', 'script', 'head'])]
    return soup.text.strip()

In [25]:
print(html_to_plain_text(spam_emails[0].get_content()))

Save up to 70% on Life Insurance.
Why Spend More Than You Have To?

Life Quote Savings










Ensuring your 
      family's financial security is very important. Life Quote Savings makes 
      buying life insurance simple and affordable. We Provide FREE Access to The 
      Very Best Companies and The Lowest Rates.





Life Quote Savings is FAST, EASY and 
            SAVES you money! Let us help you get started with the best values in 
            the country on new coverage. You can SAVE hundreds or even thousands 
            of dollars by requesting a FREE quote from Lifequote Savings. Our 
            service will take you less than 5 minutes to complete. Shop and 
            compare. SAVE up to 70% on all types of Life insurance! 



Click Here For Your 
            Free Quote!

Protecting your family is the best investment you'll ever 
          make!








If you are in receipt of this email 
      in error and/or wish to be removed from our list, PLEASE CLICK HERE AND 

In [26]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ('text/html', 'text/plain'):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [27]:
print(email_to_text(spam_emails[420]))

Generic V*AGRA -
$3 per 50mg - Generic V*AGRAOnly The Price & The Packaging is
Different Shipping & Handling - FREE NO Prescription
Required for shipping. NO Consultation
Fee.


If you do not  wish to receive mail from us please
follow this link for removal and we will ensure
that you never receive email from our system again. 
We apologise for any inconvenience we may have caused you.


In [28]:
# Stemming

stemmer = nltk.PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
    print(word, '=>', stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [29]:
url_extractor = urlextract.URLExtract()
text = 'Will William it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s Computations Computation Computing'
ur = url_extractor.find_urls(text)
print(ur)
for u in ur:
    print(u)
    text = text.replace(u, ' URL ') 
print(text)

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']
github.com
https://youtu.be/7Pq-S557XQU?t=3m32s
Will William it detect  URL  and  URL  Computations Computation Computing


In [35]:
word_counts = Counter(text.split())
stemmed_counts = Counter()
for word, count in word_counts.items():
    stemmed_word = stemmer.stem(word)
    stemmed_counts[stemmed_word] += count
stemmed_counts.most_common()

[('comput', 3),
 ('url', 2),
 ('will', 1),
 ('william', 1),
 ('it', 1),
 ('detect', 1),
 ('and', 1)]

In [31]:
# Creating EmailToWordCounterTransformer
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers = True, lower_case = True, remove_punctuation = True, replace_urls = True,
                replace_numbers = True, stemming = True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = url_extractor.find_urls(text)
                for url in urls:
                    text = text.replace(url, ' URL ')
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [32]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),
       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'ha': 2, 'half': 2, 'rogueri': 2, 'teach': 2, 'jesu': 2, 'some': 1, 'interest': 1, 'quot': 1, 'url': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'thi': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1, 'john': 1, 'e': 1, 'remsburg': 1, 'letter': 1, 'william': 1, 'short': 1, 'again': 1, 'becom

In [54]:
for row, word_count in enumerate(X_few_wordcounts):
    for word, count in word_count.items():
        print(word, count)

chuck 1
murcko 1
wrote 1
stuff 1
yawn 1
r 1
some 1
interest 1
quot 1
url 1
thoma 1
jefferson 2
i 2
have 2
examin 1
all 3
the 11
known 1
superstit 2
of 9
word 1
and 8
do 1
not 1
find 1
in 1
our 1
particular 1
christian 3
one 2
redeem 1
featur 1
they 1
are 1
alik 1
found 1
on 2
fabl 1
mytholog 1
million 1
innoc 1
men 1
women 1
children 1
sinc 1
introduct 1
been 2
burnt 1
tortur 1
fine 1
imprison 1
what 1
ha 2
effect 1
thi 1
coercion 1
to 3
make 1
half 2
world 1
fool 1
other 1
hypocrit 1
support 1
rogueri 2
error 1
over 1
earth 1
six 1
histor 1
american 1
by 3
john 1
e 1
remsburg 1
letter 1
william 1
short 1
again 1
becom 1
most 1
pervert 1
system 1
that 1
ever 1
shone 1
man 1
absurd 1
untruth 1
were 1
perpetr 1
upon 1
teach 2
jesu 2
a 1
larg 1
band 1
dupe 1
import 1
led 1
paul 1
first 1
great 1
corrupt 1
in 2
forteana 2
y 1
martin 2
adamson 1
s 3
wrote 1
for 1
an 2
altern 1
and 2
rather 1
more 1
factual 1
base 1
rundown 1
on 1
hamza 1
career 1
includ 1
hi 1
belief 1
that 1
all 1
non 1
mu

In [52]:
stemmed_counts.most_common()

[('comput', 3),
 ('url', 2),
 ('will', 1),
 ('william', 1),
 ('it', 1),
 ('detect', 1),
 ('and', 1)]

In [63]:
trial = [('aaa', 3), ('bbb', 4), ('ccc', 6), ('aaa', 3)]

voc = {word: index + 1 for index, (word, count) in enumerate(stemmed_counts.most_common())}
print(voc)
voc.get('url', 0)

{'comput': 1, 'url': 2, 'will': 3, 'william': 4, 'it': 5, 'detect': 6, 'and': 7}


2

In [74]:
# Creating WordCounterToVectorTransformer
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size = 1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None): # creates an ordered list of most common words
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        data = []
        rows = []
        cols = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape = (len(X), self.vocabulary_size + 1))

In [75]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.int32'>'
	with 20 stored elements in Compressed Sparse Row format>

In [76]:
vocab_transformer.most_common_

[('the', 10),
 ('of', 10),
 ('and', 10),
 ('url', 6),
 ('to', 6),
 ('all', 4),
 ('in', 3),
 ('christian', 3),
 ('on', 3),
 ('by', 3)]

In [77]:
vocab_transformer.vocabulary_

{'all': 6,
 'and': 3,
 'by': 10,
 'christian': 8,
 'in': 7,
 'of': 2,
 'on': 9,
 'the': 1,
 'to': 5,
 'url': 4}

In [78]:
X_few_vectors.toarray()

array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [99, 11,  9,  8,  1,  3,  3,  1,  3,  2,  3],
       [65,  0,  1,  2,  5,  3,  1,  2,  0,  1,  0]], dtype=int32)

In [87]:
# Create a pipeline
preprocess_pipeline = Pipeline([
    ('email_to_wordcount', EmailToWordCounterTransformer()),
    ('count_to_vector', WordCounterToVectorTransformer())
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [88]:
log_clf = LogisticRegression()
log_score = cross_val_score(log_clf, X_train_transformed, y_train, cv = 5)
log_score.mean()



0.9841666666666666

In [95]:
X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100*precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100*recall_score(y_test, y_pred)))



Precision: 92.08%
Recall: 97.89%
