In [4]:
'''
貝氏定理 (Bayes Theory)
'''
from collections import defaultdict, Counter
import math

# 拆分單字
def tokenize(message):
    message = message.lower()                       # transmit lower
    all_words = re.findall("[a-z0-9]+", message)    # pick up words
    return set(all_words)                           # remove repeat parts

def count_words(training_set):
    '''
    訓練組資料 training_set 包含了許多兩兩成對的 (message, is_spam)
    '''
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][o if is_spam else 1] += 1
    return counts

# 計數值轉換成機率的估計值
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    '''
    把 word_counts 轉換成 tuple
    w, p(w | spam) and p(w | ~spam)
    '''
    return [(w,(spam + k) / (total_spams + 2 * k),
            (non_spam + k) / (total_non_spams + 2 * k))
           for w, (spam, non_spam) in counts.iteritems()]
    
# 單詞的相關機率值
def spam_probabilities(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    # 詞彙表中的每個單詞進行迭代操作
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        # 如果郵件中有出現相應單詞
        # 把該單詞有出現的相應機率取對數之後，再累加進去
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
            
        # 如果郵件中沒出現相應單詞
        # 就該把單詞沒出現的相應機率取對數之後，再累加進去
        # 單詞沒出現的相應機率，就等於 (1-單詞有出現的機率)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
            
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [6]:
# 整合到貝氏分類器之中
class NativeBayesClassifier:
    
    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []
        
    def train(self, training_set):
        # 計算垃圾郵件與非垃圾郵件的數量
        num_spams = len([is_spam
                        for message, is_spam in training_set
                        if is_spam])
        num_non_spams = len(training_set) - num_spams
        
        # 運用訓練組數據，執行所設計的相關運算
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts,
                                            num_spams,
                                            num_non_spams,
                                            self.k)

    def classify(self, message):
        return spam_probability(self.word_probs, message)

In [15]:
'''
測試模型
'''
from collections import Counter, defaultdict
from machine_learning import split_data
import math, random, re, glob
# 將路徑修改為實際上你存放的位置
path = r"C:\Downloads\*\*"

def get_subject_data(path):

    data = []

    # glob.glob 會送回每一個符合通配 (wildcarded) 路徑的相應檔案名稱
    for fn in glob.glob(path):
        is_spam = "ham" not in fn

        with open(fn,'r',encoding='ISO-8859-1') as file:
            for line in file:
                if line.startswitch("Subject:"):
                    # 移除開頭的 "Subject: " 字樣，保留其餘的部分
                    subject = re.sub(r"^Subject: ", "", line).strip()
                    data.append((subject, is_spam))
    return data

ModuleNotFoundError: No module named 'machine_learning'

In [25]:
def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

def train_and_test_model(path):
    data = get_subject_data(path)
    random.seed(0)      # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # 元組: (主旨，是否確實為垃圾郵件，預測為垃圾郵件的機率)
    classified = [(subject, is_spam, classifier.classify(subject))
                 for subject, is_spam in test_data]

    # 假設 spam_probability > 0.5 對應的是預測為垃圾郵件
    # 計算 (是否確實為垃圾郵件，是否預測為垃圾郵件) 組合的數量
    counts = Counter((is_spam, spam_probability > 0.5)
                    for _, is_spam, spam_probability in classified)
    print(counts)

    # 根據垃圾郵件機率 (spam_probability) 從最小到最大進行排序
    classified.sort(key=lambda row: row[2])

    # 在非垃圾郵件中，被預測為垃圾郵件的最高機率者
    spammiest_hams = filter(lambda row: not row[1], classified)[-5:]

    # 在垃圾郵件中，被預測為垃圾郵件的最低機率者
    hammiest_spams = filter(lambda row: row[1], classified)[:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

In [26]:
def p_spam_given_word(word_prob):
    '''
    運用貝氏定理計算郵件包含某單詞情況下，該郵件為垃圾郵件的機率
    '''
    # word_prob 是由 word_probabilities 所生成的其中之一個元組
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_spam / (prob_if_spam + prob_if_not_spam)
    
words = sorted(classifier.word_probs, key=p_spam_given_word)

spammiest_words = words[-5:]
hammiest_words = words[:5]

print("spammiest_words", spammiest_words)
print("hammiest_words", hammiest_words)

NameError: name 'classifier' is not defined

In [29]:
# 單辭轉換成同類詞
def drop_final_s(word):
    return re.sub("s$", "", word)