In [27]:
'''
This module implements a spam filter using the provided
spam_corpus and ham_corpus.

@author: Sean Brouwer
@version March 10, 2019
'''


spam_corpus = [["I", "am", "spam", "spam", "I", "am"],
               ["I", "do", "not", "like", "that", "spamiam"]]
not_spam_corpus = [["do", "i", "like", "green", "eggs",
                    "and", "ham"], ["i", "do"]]


# This function calculates the probability of a word being
# spam based on spam and not_spam lists
def prob_spam(word, spam, not_spam):
    # Count not_spam word occurrences * 2
    g = 0
    for I in range(len(not_spam)):
        for J in range(len(not_spam[I])):
            if word == not_spam[I][J]:
                g += 2
    # count spam word occurrences
    b = 0
    for I in range(len(spam)):
        for J in range(len(spam[I])):
            if word == spam[I][J]:
                b += 1
    if g+b > 0:
        # Calculate probability of spam
        return max(0.01, min(0.99, min(1.0, b/len(spam)) /
                             (min(1.0, g/len(not_spam)) +
                              min(1.0, b/len(spam)))))
    else:
        return 0
    

# This function returns the combined probability of a list
# of individual probabilities
def calc_combined_prob(probs):
    prod = 1
    probs_comp = 1
    for I in range(len(probs)):
        prod *= probs[I]
        probs_comp *= (1-probs[I])
    return prod / (prod + probs_comp)


# Generate list of test words
testWords = []
for i in range(len(spam_corpus)):
    for j in range(len(spam_corpus[i])):
        if spam_corpus[i][j].lower() not in testWords:
            testWords.append(spam_corpus[i][j].lower())
for i in range(len(not_spam_corpus)):
    for j in range(len(not_spam_corpus[i])):
        if not_spam_corpus[i][j].lower() not in testWords:
            testWords.append(not_spam_corpus[i][j].lower())

# Calculate probabilities for list of words and print
print("Below are listed the probabilities of words being\n\
part of a spam message:")
for i in range(len(testWords)):
    print("    '" + testWords[i] + "': " +
          str(prob_spam(testWords[i], spam_corpus,
                        not_spam_corpus)))

# Test probability of message being spam
print("\nBelow is shown the probability of a sample message\n\
being spam:\nMessage:")
sampleMsg = ["i", "am", "like", "spam"]
msg = "' "
for item in sampleMsg:
    msg += item + " "
msg += "'"
print("Message:  " + msg)
probabilities = []
for i in range(len(sampleMsg)):
    probabilities.append(prob_spam(sampleMsg[i],
                                   spam_corpus,
                                   not_spam_corpus))
print("Probability of spam: " +
      str(calc_combined_prob(probabilities)))


Below are listed the probabilities of words being
part of a spam message:
    'i': 0.01
    'am': 0.99
    'spam': 0.99
    'do': 0.3333333333333333
    'not': 0.99
    'like': 0.3333333333333333
    'that': 0.99
    'spamiam': 0.99
    'green': 0.01
    'eggs': 0.01
    'and': 0.01
    'ham': 0.01

Below is shown the probability of a sample message
being spam:
Message:
Message:  ' i am like spam '
Probability of spam: 0.9801980198019802


In [1]:
'''
This module implements the Bayesian network provided in the
problem definition, using AIMA Python Tools.

@author: Sean Brouwer
@version March 8, 2019
'''

from probability import BayesNet, enumeration_ask

# Utility variables
T, F = True, False

# BayesNet implementation of the cloudy problem
cloud = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.1, F: 0.5}),
    ('Rain', 'Cloudy', {T: 0.8, F: 0.2}),
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.9,
                                    (F, T): 0.9, (F, F): 0})
    ])

print("Part a:\n\
  In the code above is implemented the cloudy problem\n\
  using AIMA Python tools")

print("\nPart b:\n\
  The number of independent values in the full joint\n\
  probability distribution is calculated and shown below.")
numVariables = 4  # Cloudy, Sprinkler, Rain, WetGrass
possibleOutcomesPerVariable = 2  # True, False
independentValues = possibleOutcomesPerVariable**numVariables
print("  " + str(independentValues))

print("\nPart c:\n\
  The number of independent values in the Bayesian network\n\
  for this domain is equal to the number of given\n\
  probabilities in the problem definition, which is 9.")

print("\nPart d:\n\
  P(Cloudy):\n\
    This probability is defined in the problem statement.\n\
    P(Cloudy) = <0.5, 0.5>")
  
print("\n  P(Sprinkler | cloudy):\n\
    This probability is also defined in the problem\n\
    statement.\n\
    P(Sprinkler) | cloudy) = <0.9, 0.1>")

print("\n  P(Cloudy | sprinkler ^ -rain):\n\
    Computer-generated solution:")
print("      " + enumeration_ask('Cloudy',
                                 dict(Sprinkler=T, Rain=F),
                                 cloud).show_approx())
print("    Hand calculations:\n\
      P(C|s^-r) = alpha*<P(-c)*P(s|-c)*P(-r|-c),\n\
              P(c)*P(s|c)*P(-r|c)>\n\
          = alpha*<0.5*0.5*0.8, 0.5*0.1*0.2>\n\
          = alpha*<0.2, 0.01>\n\
          = <0.952, 0.0476>")

print("\n  P(WetGrass | cloudy ^ sprinkler ^ rain):\n\
    The information about it being cloudy does not matter\n\
    as the sprinkler and rain are already given. When the\n\
    cloudy component is removed from the formulation, this\n\
    becomes another probability defined by the problem\n\
    statement.\n\
    P(WetGrass | cloudy ^ sprinkler ^ rain) = <0.01, 0.99>")

print("\n  P(Cloudy | -wet_grass):\n\
    Computer-generated solution:")
print("      " + enumeration_ask('Cloudy', dict(WetGrass=F),
                                 cloud).show_approx())
print("    Hand calculations:\n\
      P(C|-wg) = alpha*<P(-c)*P(s|-c)*P(r|-c)*P(-wg|s^r) +\n\
              P(-c)*P(s|-c)*P(-r|-c)*P(-wg|s^-r) +\n\
              P(-c)*P(-s|-c)*P(r|-c)*P(-wg|-s^r) +\n\
              P(-c)*P(-s|-c)*P(-r|-c)*P(-wg|-s^-r),\n\
              P(c)*P(s|c)*P(r|c)*P(-wg|s^r) +\n\
              P(c)*P(s|c)*P(-r|c)*P(-wg|s^-r) +\n\
              P(c)*P(-s|c)*P(r|c)*P(-wg|-s^r) +\n\
              P(c)*P(-s|c)*P(-r|c)*P(-wg|-s^-r)>\n\
          = alpha*<0.5*0.5*0.2*0.01 + 0.5*0.5*0.8*0.1 +\n\
              0.5*0.5*0.2*0.1 + 0.5*0.5*0.8*1,\n\
              0.5*0.1*0.8*0.01 + 0.5*0.1*0.2*0.1 +\n\
              0.5*0.9*0.8*0.1 + 0.5*0.9*0.2*1>\n\
          = alpha*<0.0005 + 0.02 + 0.005 + 0.2,\n\
              0.0004 + 0.001 + 0.036 + 0.09>\n\
          = alpha*<0.2255, 0.1274>\n\
          = <0.639, 0.361>")


Part a:
  In the code above is implemented the cloudy problem
  using AIMA Python tools

Part b:
  The number of independent values in the full joint
  probability distribution is calculated and shown below.
  16

Part c:
  The number of independent values in the Bayesian network
  for this domain is equal to the number of given
  probabilities in the problem definition, which is 9.

Part d:
  P(Cloudy):
    This probability is defined in the problem statement.
    P(Cloudy) = <0.5, 0.5>

  P(Sprinkler | cloudy):
    This probability is also defined in the problem
    statement.
    P(Sprinkler) | cloudy) = <0.9, 0.1>

  P(Cloudy | sprinkler ^ -rain):
    Computer-generated solution:
      False: 0.952, True: 0.0476
    Hand calculations:
      P(C|s^-r) = alpha*<P(-c)*P(s|-c)*P(-r|-c),
              P(c)*P(s|c)*P(-r|c)>
          = alpha*<0.5*0.5*0.8, 0.5*0.1*0.2>
          = alpha*<0.2, 0.01>
          = <0.952, 0.0476>

  P(WetGrass | cloudy ^ sprinkler ^ rain):
    The information 