In [1]:
'''
A spam filter based on Paul Graham's A Plan for Spam.

@author: Sameer Mall
@professor: kvanderlinden
@version March 06, 2019
'''


spam_corpus = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
ham_corpus = [["do", "i", "like", "green", "eggs", "and", "ham"], ["i", "do"]]
test1 = ["spam", "I", "do", "am", "not"]
test2 = ["green", "eggs", "ham", "and"]


good_corpus = {}
bad_corpus = {}

good_word = 0
bad_word = 0


def corpus_dictionary(corpus):
    """Returns a new corpus dictionary"""
    new_dict = {}
    for i in range(len(corpus)):
        for j in range(len(corpus[i])):
            word = corpus[i][j].lower()
            if word in new_dict:
                new_dict[word] = new_dict[word] + 1
            else:
                new_dict.update({word: 1})
    return new_dict



def probability_dict(good_corpus, bad_corpus):
    """create a new dictionary where each token is mapped to a probability"""
    for i in good_corpus:
        b = 0
        new_dict = {}
        ngood = count_num_words(good_corpus)
        nbad = count_num_words(bad_corpus)
        g = 2 * good_corpus[i]
        if i in bad_corpus:
            b = bad_corpus[i]
        if g + b > 1:
            word_value = max(.01, min(.99, min(1.0, b/nbad) / (min(1.0, g/ngood) + min(1.0, b/nbad))))
        new_dict[i] = word_value
    
    #go through the words in the bad corpus 
    for i in bad_corpus:
        if i not in new_dict:
            g = 0
            word_value = 0
            
            if i in good_corpus:
                g = 2 * good_corpus[i]
            b = bad_corpus[i]
            if g + b > 1:
                word_value = max(.01, min(.99, min(1.0, b/nbad) / (min(1.0, g/ngood) + min(1.0, b/nbad))))
            new_dict[i] = word_value
    return new_dict


def count_num_words(corpus):
    """Count the number of words in a given corpus"""
    count = 0
    for i in corpus:
        count = count + corpus[i]
    return count


def calculate_spam_probability(prob_dict, message):
    """Calculates that the incoming message is spam or not spam"""
    product = 1
    complement_product = 1

    for i in range(len(message)):
        word = message[i]
        if word in prob_dict:
            if prob_dict[word] > 0:
                product = product * prob_dict[word]
                complement_product = complement_product * (1 - prob_dict[word])
        else:
            product = product * .4
            complement_product = complement_product * 0.6
    return product / (product + complement_product)


if __name__ == '__main__':
    good_corpus = corpus_dictionary(ham_corpus)
    bad_corpus = corpus_dictionary(spam_corpus)
    probability_dictionary = probability_dict(good_corpus, bad_corpus)
    probability_spam = calculate_spam_probability(probability_dictionary, test1)

    print(good_corpus)
    print(bad_corpus)
    print(probability_dictionary)
    print("The probability of this message being a spam message is: " + str(
        calculate_spam_probability(probability_dictionary, test1)))
    print("The probability of this message being a spam message is: " + str(
        calculate_spam_probability(probability_dictionary, test2)))

{'like': 1, 'ham': 1, 'eggs': 1, 'i': 2, 'do': 2, 'green': 1, 'and': 1}
{'spam': 2, 'like': 1, 'not': 1, 'i': 3, 'do': 1, 'am': 2, 'that': 1, 'spamiam': 1}
{'spam': 0.99, 'like': 0.27272727272727276, 'not': 0, 'and': 0.01, 'i': 0.36, 'do': 0.15789473684210525, 'am': 0.99, 'that': 0, 'spamiam': 0}
The probability of this message being a spam message is: 0.999184422469161
The probability of this message being a spam message is: 0.00298396120850429


In [2]:
'''
This module implements the Bayesian network shown in the text, Figure 14.12.
It's taken from the AIMA Python code.

@author: Sameer Mall
@professor: kvanderlinden
@version Mar 06, 2019
'''

from probability import BayesNet, enumeration_ask, elimination_ask, gibbs_ask

# Utility variables
T, F = True, False

# From AIMA code (probability.py) - Fig. 14.2 - burglary example
cloudy = BayesNet([
    ('Cloudy', '', 0.5),
    ('Rain', 'Cloudy', {T: 0.80, F: 0.20}),
    ('Sprinkler', 'Cloudy', {T: 0.10, F: 0.50}),
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.90, (F, T): 0.90, (F, F): 0.00})
    ])

# d. Compute probabilities for the following
# i. P(Cloudy)
print(enumeration_ask('Cloudy', dict(), cloudy).show_approx())
# Output:
#       False: 0.5, True: 0.5

# ii. P(Sprinkler | cloudy)
print(enumeration_ask('Sprinkler', dict(Cloudy=T), cloudy).show_approx())
# Output:
#       False: 0.9, True: 0.1

# iii. P(Cloudy | the sprinkler is running and its not raining)
print(enumeration_ask('Cloudy', dict(Sprinkler=T, Rain=F), cloudy).show_approx())
# Output:
#       False: 0.952, True: 0.0476

# iv. P(Wet Grass | its cloudy, the sprinkler is running and its raining)
print(enumeration_ask('WetGrass', dict(Cloudy=T, Sprinkler=T, Rain=T), cloudy).show_approx())
# Output:
#       False: 0.01, True: 0.99

# v. P(Cloudy | the grass is not wet)
print(enumeration_ask('Cloudy', dict(WetGrass=F), cloudy).show_approx())
# Output:
#       False: 0.639, True: 0.361

# All of the hand worked solutions were done on paper which I put in your mailbox

False: 0.5, True: 0.5
False: 0.9, True: 0.1
False: 0.952, True: 0.0476
False: 0.01, True: 0.99
False: 0.639, True: 0.361
