In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import re
import email
import numpy as np
import scipy as sp
import pandas as pd
import itertools as it
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from nltk.corpus import stopwords
from tools import cat_parser

from matplotlib import pyplot as plt
import seaborn as sns

Find the list of data files from the data directory

In [3]:
data_path = './data'
email_paths = [
    os.path.join(root, filename) for root, _, filenames in os.walk(data_path)
        for filename in filenames
        if filename.endswith('.txt')
][1:] # first file found is the categories.txt file detailing the categories
category_paths = map(lambda s: str.replace(s, '.txt', '.cats'), email_paths)

Set some constants that will be used for memory allocation and parameterizing training

In [4]:
n_samples = len(email_paths)
n_categories = 8
n_features = 5000

Read in the email data and remove punctuation and stopwords from the email bodies

In [5]:
stops = set(stopwords.words('english'))
def clean_email_body(body):
    punctuation_removed = re.sub(
        "[^a-zA-Z]",  
        " ",  
        body 
    )
    all_lowercase = punctuation_removed.lower()
    words = all_lowercase.split()
    meaningful_words = [word for word in words if word not in stops]
    return " ".join(meaningful_words)

In [6]:
email_files = it.imap(open, email_paths)
email_msgs = it.imap(email.message_from_file, email_files)
email_bodies = (msg.get_payload() for msg in email_msgs)
email_bodies_clean = map(clean_email_body, email_bodies)

In [7]:
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=n_features
)

In [8]:
# vectorizer = TfidfVectorizer(
#     analyzer='word',
#     ngram_range=(1, 1),
#     stop_words=None,
#     max_features=n_features
# )

In [9]:
email_body_features = vectorizer.fit_transform(email_bodies_clean)

In [12]:
def categories_as_vector(cata_dict_list):
    return np.asarray([cata[1][0] for cata in categories])

In [13]:
categories = map(cat_parser.parse_file, category_paths)
cat_vec = categories_as_vector(categories)

In [16]:
# for now, we're only going to assign a single category to each email, this is hard enough as it is
#  hence, cata[1][0]
cat_vec = np.asarray([cata[1][0] for cata in categories])

In [43]:
target_class = 1
cat_vec_targeted = cat_vec == target_class

In [44]:
feat_train, feat_test, cat_train, cat_test = train_test_split(
    email_body_features, 
    cat_vec_targeted, 
    test_size=0.2,
    random_state=1337
)

In [45]:
model_log_reg = LogisticRegression(
    class_weight='balanced', 
    solver='lbfgs',
    max_iter=500
)
model_log_reg.fit(feat_train, cat_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=500,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [46]:
print model_log_reg.score(feat_train, cat_train)
print model_log_reg.score(feat_test, cat_test)

0.983100661278
0.791788856305


In [47]:
print roc_auc_score(cat_test, model_log_reg.predict(feat_test))

0.786001386001
