In [41]:
from collections import Counter
import math
# Toy sentiment corpus: one (label, tokens) pair per training document.
# Each document is written as a space-separated string and tokenized with
# str.split(); documents are listed label by label, "Positive" first.
train_data = [
    (sentiment, text.split())
    for sentiment, texts in (
        ("Positive", (
            "love movie good",
            "good movie",
            "love love movie",
            "love good",
        )),
        ("Negative", (
            "bad movie",
            "hate bad",
            "bad bad movie",
            "hate movie",
            "bad",
            "hate bad movie",
        )),
    )
    for text in texts
]

In [42]:
# Hand-written vocabulary list. Note that `vocab` is rebound further down
# to a set derived from train_data.
vocab = "love good movie bad hate".split()

In [54]:
# Step 1: class priors, P(label) = documents with that label / all documents.

total_docs = len(train_data)
labels = [tag for tag, _tokens in train_data]
label_c = Counter(labels)

# Iterate over the (label, count) items directly instead of re-indexing the Counter.
priors = {tag: n_docs / total_docs for tag, n_docs in label_c.items()}

In [55]:
# Show the number of training documents per label.
print(label_c)

Counter({'Negative': 6, 'Positive': 4})


In [56]:
# Show the total number of training documents.
print(total_docs)

10


In [57]:
# Iterate over train_data to fill in the numerators and denominators

In [58]:
from collections import Counter

# Per-label word-frequency tables (the numerators of the likelihoods):
# one independent, initially empty Counter per label.
word_counts = {tag: Counter() for tag in ("Positive", "Negative")}

# Per-label total token counts (the denominators), starting at zero.
total_words = dict.fromkeys(("Positive", "Negative"), 0)

In [59]:
# Fill in the numerators (per-label word counts) and the denominators
# (per-label total token counts) used for the likelihood estimates.

# Reset both tables first: the loop below adds into module-level state, so
# without this, re-running the cell would count the whole corpus a second
# time and inflate every count.
for label in word_counts:
    word_counts[label].clear()
    total_words[label] = 0

for label, docs in train_data:
    for w in docs:
        word_counts[label][w] += 1
        total_words[label] += 1

print(word_counts)
print(total_words)

{'Positive': Counter({'love': 4, 'movie': 3, 'good': 3}), 'Negative': Counter({'bad': 6, 'movie': 4, 'hate': 3})}
{'Positive': 10, 'Negative': 13}


In [60]:
# Conditional (likelihood) part: the vocabulary is every distinct token
# that appears anywhere in the training documents.
vocab = set().union(*(tokens for _label, tokens in train_data))
vocab_size = len(vocab)

In [61]:
# Report how many distinct tokens the training vocabulary contains.
print("vocab_ size: ", vocab_size)

vocab_ size:  5


In [62]:
# Prediction with Laplace smoothing, scored in log space

In [63]:
import math

def predict_nb(test_words, alpha=1):
    """Classify a tokenized document with multinomial Naive Bayes in log space.

    Two scores are computed per label so they can be compared side by side:
    an unsmoothed maximum-likelihood score and an additively smoothed score.

    Reads the notebook-level tables ``priors``, ``word_counts``,
    ``total_words`` and ``vocab_size``.

    Args:
        test_words: iterable of word tokens of the document to classify.
            It may be a one-shot iterator; it is materialized once.
        alpha: additive-smoothing pseudo-count added to every word count.
            The default of 1 is Laplace (add-one) smoothing. Must be positive.

    Returns:
        ``((pred_raw, scores_raw), (pred_smooth, scores_smooth))``.
        Each ``scores_*`` dict maps label -> log score, and each ``pred_*``
        is the label with the highest score. If several labels tie (for
        example every unsmoothed score is ``-inf``), ``max`` returns the
        first such label in ``priors`` order.

    Raises:
        ValueError: if ``alpha`` is not positive.
    """
    if alpha <= 0:
        raise ValueError("alpha must be positive")

    # The tokens are re-read once per label; a generator would be exhausted
    # after the first label and silently give the others an empty document.
    words = list(test_words)

    scores_raw = {}      # without smoothing
    scores_smooth = {}   # with additive smoothing (Laplace when alpha == 1)

    for label, prior in priors.items():
        counts = word_counts[label]
        n_words = total_words[label]
        smooth_denominator = n_words + alpha * vocab_size

        # Both scores start from the log prior.
        log_raw = math.log(prior)
        log_smooth = log_raw

        for w in words:
            # Counter lookups return 0 for words never seen with this label.
            c = counts[w]

            # 1) No smoothing: an unseen word has probability 0, and log(0)
            #    is undefined, so the score is pinned to -inf. It stays -inf
            #    for the rest of the document because -inf + x == -inf.
            if c == 0:
                log_raw = float("-inf")
            else:
                log_raw += math.log(c / n_words)

            # 2) Additive smoothing: (count + alpha) / (total + alpha * |V|).
            log_smooth += math.log((c + alpha) / smooth_denominator)

        scores_raw[label] = log_raw
        scores_smooth[label] = log_smooth

    # Pick the highest-scoring label under each scheme.
    pred_raw = max(scores_raw, key=scores_raw.get)
    pred_smooth = max(scores_smooth, key=scores_smooth.get)

    return (pred_raw, scores_raw), (pred_smooth, scores_smooth)

In [64]:
# Classify a two-token sample and unpack the unsmoothed and smoothed results;
# each result is a (predicted label, {label: log score}) pair.
(raw_result, smooth_result) = predict_nb(["love", "movie"])

print("no smoothing:", raw_result)
print("with Laplace:", smooth_result)


no smoothing: ('Positive', {'Positive': -3.036554268074246, 'Negative': -inf})
with Laplace: ('Positive', {'Positive': -3.336658860524584, 'Negative': -4.68213122712422})
