In [57]:
import re
from sklearn import preprocessing
from main.models import Conversation, Message, SpamCategory

def clean_up(text):
    # remove html tags or pseudo-html tags
    cleaned = re.sub(r'(<[a-z]+.*?>)|(\[http:.*?\])|(</.*?>)|([^ \n]{50,})', '{}', text)
    return cleaned

def get_classified_data():
    msgs = []
    categories = []
    conversations = Conversation.objects.filter(classified=True)
    for conversation in conversations:
        conversation_msgs = conversation.messages
        if len(conversation_msgs) > 0:
            msg = conversation.messages[0]
            msgs.append(clean_up(msg.subject + ' ' + msg.body))
            # NOTE: The targets should be integers
            # TODO: FIX THIS to make integers and create mapping dict
            categories.append(conversation.category_id)
    le = preprocessing.LabelEncoder()
    target = le.fit_transform(categories)
    result = {'data': msgs, 'target': target, 'labels': categories, 'classes': le.classes_,
              'class_names': list(map((lambda c: SpamCategory.objects.filter(id=c)[0].name), data['classes']))}
    return result



In [58]:
data = get_classified_data()

In [59]:
# msgs are the untokenized messages, labels are the categories for the messages
print("Messages: {}, Classes: {}".format(len(data['data']), len(data['classes'])))

Messages: 197, Classes: 7


In [60]:
# Better way to split training/testing data
from sklearn.model_selection import train_test_split

(x_train, x_test, y_train, y_test) = train_test_split(data['data'], data['target'], train_size=0.8)


In [61]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words="english", min_df=2,
                            token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b')

# Learn the vocabulary
vectorizer.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b',
        tokenizer=None, vocabulary=None)

In [62]:
# examine the fitted vocabulary
len(vectorizer.get_feature_names())

1629

In [63]:
# transform training data into a 'document-term matrix'
train_dtm = vectorizer.transform(x_train)
train_dtm

<157x1629 sparse matrix of type '<class 'numpy.int64'>'
	with 8311 stored elements in Compressed Sparse Row format>

In [64]:
# check the type of the document-term matrix
type(train_dtm)

scipy.sparse.csr.csr_matrix

In [65]:
# transform testing data
test_dtm = vectorizer.transform(x_test)

In [66]:
test_dtm.shape

(40, 1629)

In [67]:
train_dtm.shape

(157, 1629)

In [68]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [69]:
%time nb.fit(train_dtm, y_train)

CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 2.93 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [70]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(test_dtm)

In [71]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.72499999999999998

In [72]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[ 1,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  0,  1,  0,  0,  0],
       [ 2,  0,  1,  0,  0,  0,  0],
       [ 0,  0,  1,  3,  0,  7,  0],
       [ 0,  0,  0,  0,  2,  0,  0],
       [ 0,  0,  0,  0,  0, 18,  0],
       [ 0,  0,  0,  0,  0,  0,  3]])

In [73]:
y_pred_prob = nb.predict_proba(test_dtm)[:, 1]

In [74]:
y_test

array([5, 5, 4, 3, 5, 5, 5, 3, 5, 5, 3, 3, 5, 3, 3, 1, 3, 3, 5, 5, 5, 3, 5,
       2, 0, 6, 5, 3, 5, 5, 5, 2, 4, 6, 2, 5, 5, 6, 3, 1])

# Part 7. Examining a model

In [75]:
# store the vocabulary of X_train
X_train_tokens = vectorizer.get_feature_names()
len(X_train_tokens)

1629

In [76]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

['abandoned', 'abidjan', 'able', 'abroad', 'absolutely', 'abuja', 'abuse', 'accept', 'acceptable', 'access', 'accomplish', 'according', 'accordingly', 'account', 'accountant', 'accounts', 'accurate', 'acquaintance', 'act', 'action', 'active', 'activities', 'activity', 'actual', 'ad', 'add', 'added', 'addition', 'address', 'addressed', 'addressee', 'addresses', 'advance', 'advantage', 'adventurous', 'advertisement', 'advertisements', 'advice', 'advise', 'advised', 'affordable', 'afraid', 'africa', 'african', 'age', 'agencies', 'agency', 'agent', 'ago', 'agreed']


In [77]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

['website', 'websites', 'week', 'weekly', 'weeks', 'weight', 'west', 'western', 'whatsapp', 'wicked', 'widow', 'widows', 'wife', 'williams', 'willing', 'willingness', 'winner', 'winning', 'wire', 'wish', 'wishes', 'withdraw', 'woman', 'women', 'won', 'wordpress', 'words', 'work', 'worked', 'working', 'works', 'world', 'worldwide', 'worry', 'worth', 'write', 'writing', 'wrong', 'wrote', 'www', 'xia', 'yahoo', 'yandex', 'year', 'years', 'yes', 'york', 'youtube', 'zip', 'zone']


In [78]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_

array([[ 0.,  0.,  1., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  2.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 2.,  3.,  5., ...,  1.,  2.,  1.],
       [ 0.,  0.,  2., ...,  0.,  0.,  0.]])

In [79]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape

(7, 1629)

In [80]:
# Most informative features
import numpy as np
def print_top10(vectorizer, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))


In [81]:
print_top10(vectorizer, nb, data['class_names'])


Website: regards interested email computer services com web website design development
Medical: click offers weeks designed life blood health free weight low
Sales: registration apps app time com email information free mail mobile
Click bait: download simple report insurance http free account unsubscribe domain click
Business: investment money work account resources com business company capital email
Money scam: account payment address funds transfer email fund contact money bank
Dating: nice women like meet don man married looking hope want
