## 13.3. 구현하기

In [66]:
#from __future__ import division
from collections import Counter, defaultdict
#from machine_learning import split_data
import math, random, re, glob

In [67]:
def tokenize(message):
    message = message.lower()                       # 소문자로 변환
    all_words = re.findall("[a-z0-9']+", message)   # 단어 추출
    return set(all_words)                           # 중복되는 단어 제거

In [68]:
def count_words(training_set):
    '''학습 데이터는 (메시지 네용, 스팸 여부) 형식으로 구성되어 있음'''
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

In [69]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    '''단어의 빈도수를
    [단어, p(w|스팸), p(w|~스팸)] 형태로 변환'''
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
             for w, (spam, non_spam) in counts.items()]

In [70]:
def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    # 모든 단어에 대해 반복
    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # 만약에 메시지에 word가 나타나면
        # 해당 단어가 나올 log 확률을 더해 줌
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # 만약 메시지에 word가 나타나지 않는다면
        # 해당 단어가 나오지 않을 log 확률을 더해 줌
        # 나오지 않을 확률은 log(1 - 나올 확률)로 계산
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
            
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [71]:
class NaiveBayesClassifier:

    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def train(self, training_set):
    
        # 스팸 메시지와 스팸이 아닌 메시지의 개수를 세어 줌
        num_spams = len([is_spam 
                         for message, is_spam in training_set 
                         if is_spam])
        num_non_spams = len(training_set) - num_spams

        # 지금까지 만든 함수에 학습 데이터를 적용
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts, 
                                             num_spams, 
                                             num_non_spams,
                                             self.k)
                                             
    def classify(self, message):
        return spam_probability(self.word_probs, message)

## 13.4. 모델 검증하기

In [72]:
def get_subject_data(path):
    data = []

    # regex for stripping out the leading "Subject:" and any spaces after it
    subject_regex = re.compile(r"^Subject:\s+")

    # glob.glob는 주어진 경로에 해당하는 모든 파일 이름을 반환
    for fn in glob.glob(path):
        is_spam = "ham" not in fn

        with open(fn,'r', encoding='ISO-8859-1') as file:
            for line in file:
                if line.startswith("Subject:"):
                    # 'Subject : '부분을 제거하고 나머지 부분을 유지
                    subject = subject_regex.sub("", line).strip()
                    data.append((subject, is_spam))
                    # 폴더에서 하나의 파일의 제목을 subject에 그 파일의 스팸 여부를 data에 저장

    return data

In [73]:
data = get_subject_data(r'/Users/nani/Desktop/GitHub/data-science-from-scratch/spam_mail/*/*')
data[:5]

[('Friend, Copy ANY DVD or Playstation Game with this software......', True),
 ('5% Guaranteed for Eight Years', True),
 ('Congratulations! You Get a Free Handheld Organizer!', True),
 ('One of a kind Money maker! Try it for free!', True),
 ('Online Doctors will fill your Viagra Prescription Now!!!                QEEB',
  True)]

In [74]:
def split_data(data, prob):
    '''split data into fractions [prob, 1 - prob]'''
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results

In [75]:
train_data, test_data = split_data(data, 0.75)  
train_data[:5], test_data[:5]

([('Friend, Copy ANY DVD or Playstation Game with this software......', True),
  ('5% Guaranteed for Eight Years', True),
  ('Congratulations! You Get a Free Handheld Organizer!', True),
  ('One of a kind Money maker! Try it for free!', True),
  ('Online Doctors will fill your Viagra Prescription Now!!!                QEEB',
   True)],
 [('Find Peace, Harmony, Tranquility, And Happiness Right Now!', True),
  ('Get the Computer Skills you need - Free', True),
  ('[WM] CEVIRI YAZILIMLARI', True),
  ('Garden Ornaments | ppu', True),
  ('A marketplace where lenders compete for your business LFHLXHU', True)])

In [76]:
tokenize("turn the word_counts into a list of triplets")

{'a', 'counts', 'into', 'list', 'of', 'the', 'triplets', 'turn', 'word'}

In [77]:
count_words(train_data[:5])

defaultdict(<function __main__.count_words.<locals>.<lambda>()>,
            {'any': [1, 0],
             'playstation': [1, 0],
             'or': [1, 0],
             'friend': [1, 0],
             'copy': [1, 0],
             'with': [1, 0],
             'dvd': [1, 0],
             'this': [1, 0],
             'software': [1, 0],
             'game': [1, 0],
             'eight': [1, 0],
             '5': [1, 0],
             'for': [2, 0],
             'guaranteed': [1, 0],
             'years': [1, 0],
             'you': [1, 0],
             'congratulations': [1, 0],
             'handheld': [1, 0],
             'organizer': [1, 0],
             'get': [1, 0],
             'free': [2, 0],
             'a': [2, 0],
             'maker': [1, 0],
             'one': [1, 0],
             'of': [1, 0],
             'it': [1, 0],
             'money': [1, 0],
             'kind': [1, 0],
             'try': [1, 0],
             'prescription': [1, 0],
             'will': [1, 0],
    

In [78]:
word_probabilities(
    count_words(train_data), len(
        [is_spam for message, is_spam in train_data if is_spam]), 
    1-len([is_spam for message, is_spam in train_data if is_spam]))[:5]

[('any', 0.011811023622047244, -0.003968253968253968),
 ('playstation', 0.006561679790026247, -0.0013227513227513227),
 ('or', 0.009186351706036745, -0.056878306878306875),
 ('friend', 0.006561679790026247, -0.006613756613756613),
 ('copy', 0.006561679790026247, -0.006613756613756613)]

In [79]:
def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [80]:
def train_and_test_model(path):

    data = get_subject_data(path)
    random.seed(0)      # 예시와 동일한 결과를 얻기 위해서 설정
    train_data, test_data = split_data(data, 0.75)    

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)    

    # [제목, 실제 스팸 여부, 예측된 스팸 확률]
    classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

    # 메시지가 스팸일 확률이 0.5보다 크면 스팸이라고 하자
    # 그리고 예측된 스팸 메시지가 실제 스팸인 경우를 세어 보자
    counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    # 스팸일 확률을 오름차순으로 정렬
    classified.sort(key=lambda row: row[2])
    
    # 스팸이 아닌 메시지 중에서 스팸일 확률이 가장 높은 메시지
    spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
    
    # 스팸 중에서 스팸일 확률이 가장 낮은 메시지
    hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:] 
    hammiest_words = words[:5] 
 

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)

In [81]:
if __name__ == "__main__":
    train_and_test_model(r'/Users/nani/Desktop/GitHub/data-science-from-scratch/spam_mail/*/*')

Counter({(False, False): 716, (True, True): 85, (True, False): 49, (False, True): 26})
spammiest_hams [('Species at risk of extinction growing', False, 0.8958889624800302), ('Cell phones coming soon', False, 0.9666801692557614), ('Adam dont job for no one, see.', False, 0.9758486261566025), ('2000+ year old Greek computer reinterpreted', False, 0.9767939458812925), ('Save up to 70% on international calls!', False, 0.9776715683050722)]
hammiest_spams [('I was so scared... my very first DP', True, 5.4062616415727244e-05), ('Re: Hi', True, 0.0009722322165778127), ('*****SPAM*****', True, 0.0021267760018624494), ('http://www.efi.ie/', True, 0.007758971914390464), ('Outstanding Opportunities for "Premier Producers"', True, 0.008355413144240686)]
spammiest_words [('zzzz', 0.02837837837837838, 0.0002294630564479119), ('money', 0.033783783783783786, 0.0002294630564479119), ('rates', 0.033783783783783786, 0.0002294630564479119), ('systemworks', 0.033783783783783786, 0.0002294630564479119), ('ad

## 13.5. 더 공부해 보고 싶다면
- 스팸 필터에 대해 더 자세히 알아보고 싶다면 폴 그레이엄(Paul Graham)의 A Plan for Spam과 Better Bayesian Filtering을 참고하자.
- scikit-learn은 이번 장에서 구현한 나이브 베이즈와 똑같이 동작하는 BernoulliNB 모델을 비롯해 다양한 종류의 모델을 제공하고 있다.