# Naive Bayes
*It is well for the heart to be naive and for the mind not to be. -Anatole France*

In [0]:
%%capture
# To suppress the output when calling other files
%run kNearestNeighbours.ipynb

### A More Sophisticated Spam Filter
In practice we want to avoid multiplying lots of probabilities together, to avoid a problem called *underflow*, in which computers don't deal well with floating point numbers that are too close to zero. Recalling from algebra that *log(ab) = log a + log b* and that *exp(log x) = x*, we usually compute the product *p1 · p2 · ... · pn* in the more floating-point-friendly form: <br/>
**exp(log(*p1*) + log(*p2*) + ... + log(*pn*))**

In [0]:
def tokenize(message):
    """Split a message into its distinct lower-cased "words".

    A word is a maximal run of the characters a-z, 0-9 and apostrophe.
    The result is a set (bag-of-words model), so every word appears at most
    once no matter how often it occurs in the message.
    """
    lowered = message.lower()
    # The set comprehension collapses repeated words into a single entry.
    return {word for word in re.findall("[a-z0-9']+", lowered)}


def count_words(training_set):
    """Tally, per word, how many spam and non-spam messages contain it.

    training_set is an iterable of (message, is_spam) pairs.  The result maps
    each word to a two-element list [spam_count, non_spam_count].  Because
    tokenize() yields each word of a message only once, these are counts of
    messages containing the word, not of total occurrences.
    """
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        # Slot 0 holds the spam tally, slot 1 the non-spam tally.
        slot = 0 if is_spam else 1
        for word in tokenize(message):
            counts[word][slot] += 1
    return counts


def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Turn per-word message counts into smoothed probability estimates.

    Smoothing pretends that, for every word, we additionally saw k messages
    containing the word and k messages not containing it, so that:
        p(word_i | spam) = (k + number of spams containing word_i) / (2k + number of spams)
    and likewise for non-spam.

    counts maps each word to a (spam_count, non_spam_count) pair.
    Returns a list of triplets (word, p(word | spam), p(word | ~spam)),
    in the iteration order of counts.
    """
    def smoothed(count, total):
        # k pseudo-observations with the word, k without it.
        return (count + k) / (2 * k + total)

    return [
        (word, smoothed(in_spam, total_spams), smoothed(in_non_spam, total_non_spams))
        for word, (in_spam, in_non_spam) in counts.items()
    ]


def spam_probability(word_probs, message):
    """Estimate P(spam | message) with a Naive Bayes model.

    word_probs is an iterable of triplets (word, p(word | spam), p(word | ~spam)).
    message is the raw text; it is split into words with tokenize().

    Every word in word_probs contributes to the likelihood: the probability of
    seeing it if it is in the message, the probability of not seeing it
    otherwise.  Equal priors p(spam) = p(not_spam) = 0.5 are assumed.

    Returns a float in [0, 1].  With an empty word_probs the result is 0.5.
    Raises ValueError (from math.log) if any probability is exactly 0 or 1.
    """
    message_words = tokenize(message)

    # Sums of the log probabilities of the individual word events, given spam / not spam.
    log_prob_message_given_spam = log_prob_message_given_not_spam = 0.0

    # Iterate through each word known from the training set.
    for word, prob_wi_given_spam, prob_wi_given_not_spam in word_probs:
        if word in message_words:
            # word appears in the message: add the log probability of seeing it
            log_prob_message_given_spam += math.log(prob_wi_given_spam)
            log_prob_message_given_not_spam += math.log(prob_wi_given_not_spam)
        else:
            # word does not appear: add the log probability of not seeing it
            log_prob_message_given_spam += math.log(1.0 - prob_wi_given_spam)
            log_prob_message_given_not_spam += math.log(1.0 - prob_wi_given_not_spam)

    # With equal priors:
    #   P(spam | msg) = e^s / (e^s + e^n) = 1 / (1 + e^(n - s))
    # where s and n are the two log-likelihoods.  Exponentiating s and n
    # separately underflows to 0.0 for large vocabularies and then divides
    # 0 by 0, so only their difference is exponentiated.
    log_ratio = log_prob_message_given_not_spam - log_prob_message_given_spam
    if log_ratio >= 0:
        # Exponentiate the non-positive value so math.exp cannot overflow.
        odds_spam = math.exp(-log_ratio)
        return odds_spam / (1.0 + odds_spam)
    return 1.0 / (1.0 + math.exp(log_ratio))


class NaiveBayesClassifier:
    """Naive Bayes spam classifier over a bag-of-words message model."""

    def __init__(self, k=0.5):
        """Create an untrained classifier; k is the smoothing pseudo-count."""
        self.k = k
        # List of (word, p(word | spam), p(word | ~spam)) triplets; filled in by train().
        self.word_probs = []

    def train(self, training_set):
        """Fit the word probabilities from labeled messages.

        training_set consists of pairs (message, is_spam).  It must support
        len() and be iterable more than once (e.g. a list), because it is
        traversed once to count spams and again to count words.
        """
        # Count spam and non-spam messages.
        num_spams = sum(1 for _, is_spam in training_set if is_spam)
        num_non_spams = len(training_set) - num_spams

        # Run the training data through the counting / smoothing pipeline.
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts, num_spams, num_non_spams, self.k)

    def classify(self, message):
        """Return the estimated probability that message is spam."""
        return spam_probability(self.word_probs, message)
    