# BYO Spam Classifier (pp. 103)
## 1. Split datasets into test and training sets

In [1]:
import os
# 
os.listdir()
if os.path.exists('datasets/'):
    os.chdir("datasets")

In [2]:
import mailbox as mb
from mailbox import Message


ham = []
spam = []

for fn in os.listdir('easy_ham'):
    with open('easy_ham/' + fn,'rb') as file:
        ham.append(Message(file.read()))

for fn in os.listdir('spam'):
    with open('spam/' + fn,'rb') as file:
        spam.append(Message(file.read()))


In [9]:
print(len(ham))
print(len(spam))

2551
501


## 2. Process data
A. Turn the messages into vectors ([see docs](http://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage))

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## 3. Write a data prep pipeline to convert emails into feature vectors


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class EmailToString(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [str(email.values()) for email in X]
    
text_pipeline = Pipeline([
    ('stringify', EmailToString()),
    ('vectorizer',CountVectorizer(min_df=1))
])

y_ham = np.ndarray([1, len(ham)-1000]) # + np.ndarray([1, len(train_spam)]).fill(False)
y_ham.fill(True)
y_spam = np.ndarray([1, len(spam)-200])
y_spam.fill(True)

y_ham = np.ndarray([1, len(ham)-1000]) # + np.ndarray([1, len(train_spam)]).fill(False)
y_ham.fill(True)
y_spam = np.ndarray([1, len(spam)-200])
y_spam.fill(True)

y_train = np.concatenate((y_ham, y_spam), axis=1)

text_prepared_train = text_pipeline.fit_transform(ham[:-1000] + spam[:-200])
text_prepared_test = text_pipeline.fit_transform(ham[-1000:] + spam[-200:])

X_train = text_prepared_train


In [7]:
print(y_ham.shape)
print(y_spam.shape)
print(y.shape)

(1, 1551)
(1, 301)
(1, 1852)


In [18]:
print(X.shape)
print(y.shape)

(1852, 21997)
(1, 1852)


In [22]:
# Credit to Saullo Castro for shuffle_in_unison()

def shuffle_in_unison_scary(a, b):
    rng_state = numpy.random.get_state()
    numpy.random.shuffle(a)
    numpy.random.set_state(rng_state)
    numpy.random.shuffle(b)

X_shuf, y_shuff = shuffle_in_unison(X, y)


ValueError: setting an array element with a sequence.

## 4. Add hyperparameters to prep pipeline

In [None]:
text_pipeline = Pipeline([
    ('stringify', EmailToString()),
    ('vectorizer',CountVectorizer(min_df=1))
])

## 5. Try using different classifiers 