<a href="https://colab.research.google.com/github/quotation3/TIL/blob/master/NBC_%EC%9D%BC%EB%B0%98%ED%99%94.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
import pandas as pd
import numpy as np
import math

In [66]:
class NBC:

    def __init__(self,text, text_type, words, k =0.5):
        self.text = text
        self.words = words
        self.text_type = text_type
        self.types = []
        self.tokens = []
        self.p_type1 = []
        self.p_type2 = []
        self.log_p_type1 = []
        self.log_p_type2 = []
        self.type1_cnt_list = []
        self.type2_cnt_list = []
        self.text_split = []
        self.k = k

# 토큰화함수
    def make_tokens(self):

        for i in range(len(self.text)):
            self.text_split.append(' '.join(self.text[i]).split(' '))

        self.tokens = []
        for i in range(len(self.text_split)):
            self.tokens.extend(self.text_split[i])
        self.tokens = list(set(self.tokens))

        return self.tokens

# text_type별 카운트 함수   
    def cnt_by_type(self):

        for i in range(len(self.text_type)):
            self.types.append(self.text_type[i])
        self.types = list(set(self.types))
        self.type1 = self.types[0]  # 첫번째 감정 -> type1
        self.type2 = self.types[1]  # 두번째 감정 -> type2

        for i in range(len(self.tokens)):
            type1_cnt = 0
            type2_cnt = 0
            for j in range(len(self.text_split)):    # tokens의 단어가 text_split[j]에 있으면 그 빈도 만큼 카운트 추가
                if self.tokens[i] in self.text_split[j]:
                    if self.text_type[j] == self.type1:
                        type1_cnt+=self.text_split[j].count(self.tokens[i])
                    elif self.text_type[j] == self.type2:
                        type2_cnt+=self.text_split[j].count(self.tokens[i])
                    else:
                        pass
            self.type1_cnt_list.append(type1_cnt)
            self.type2_cnt_list.append(type2_cnt)
        
        cnt_list = np.array([self.tokens,self.type1_cnt_list,self.type2_cnt_list])
        cnt_table = pd.DataFrame(cnt_list.T, columns=['단어',self.type1, self.type2])

        return cnt_table

# 전체 테이블 만드는 함수
    def table(self):

        for i in range(len(self.tokens)):
            prob_type1 = (self.k+float(str(self.type1_cnt_list[i])))/(2*self.k+sum(self.type1_cnt_list))*100
            prob_type2 = (self.k+float(str(self.type2_cnt_list[i])))/(2*self.k+sum(self.type2_cnt_list))*100
            self.p_type1.append("%0.2f" %prob_type1)
            self.p_type2.append("%0.2f" %prob_type2)

        for i in range(len(self.tokens)):
            log_prob_type1 = np.log((self.k+float(str(self.type1_cnt_list[i])))/(2*self.k+sum(self.type1_cnt_list)))
            log_prob_type2 = np.log((self.k+float(str(self.type2_cnt_list[i])))/(2*self.k+sum(self.type2_cnt_list)))
            self.log_p_type1.append("%0.4f"%log_prob_type1)
            self.log_p_type2.append("%0.4f"%log_prob_type2)

        total_list = np.array([self.tokens,self.type1_cnt_list,self.type2_cnt_list,self.p_type1,self.p_type2,self.log_p_type1,self.log_p_type2])
        total_table = pd.DataFrame(total_list.T, columns=['단어',self.type1, self.type2,'P(w|{})'.format(self.type1),'P(w|{})'.format(self.type2),'Log(P(w|{}))'.format(self.type1),'Log(P(w|{}))'.format(self.type2)])

        return total_table
    

    def type_filtering (self):

        index_a = self.tokens.index(self.words[0])  # 키워드가 tokens리스트에서 갖는 인덱스 확인
        index_b = self.tokens.index(self.words[1])

        self.type1_prob = np.log(self.text_type.count(self.type1)/len(self.text_type))
        self.type2_prob = np.log(self.text_type.count(self.type2)/len(self.text_type))

        x1 = math.exp(float(self.log_p_type2[index_a])+(float(self.log_p_type2[index_b]))+(float("%0.4f"%self.type2_prob)))
        x2 = math.exp(float(self.log_p_type1[index_a])+(float(self.log_p_type1[index_b]))+(float("%0.4f"%self.type1_prob)))

        return ('{},{}가 포함됐을때 {}일 확률 : {}'.format(self.words[0],self.words[1],self.type1, (x2/(x1+x2))*100), '{},{}가 포함됐을때 {}일 확률 : {}'.format(self.words[0],self.words[1],self.type2, (x1/(x1+x2))*100))

In [67]:
text = [['me free lottery'],['free get free you'],['you free scholarship'],['free to contact me'],['you won award'],['you ticket lottery']]
text_type = ['spam','spam','normal','normal','normal','spam']
words = ['free','lottery']
nbc = NBC(text, text_type, words)

In [68]:
nbc.make_tokens()                                                                              

['lottery',
 'get',
 'to',
 'scholarship',
 'contact',
 'you',
 'free',
 'ticket',
 'me',
 'won',
 'award']

In [69]:
nbc.cnt_by_type()

Unnamed: 0,단어,spam,normal
0,lottery,2,0
1,get,1,0
2,to,0,1
3,scholarship,0,1
4,contact,0,1
5,you,2,2
6,free,3,2
7,ticket,1,0
8,me,1,1
9,won,0,1


In [70]:
nbc.table()

Unnamed: 0,단어,spam,normal,P(w|spam),P(w|normal),Log(P(w|spam)),Log(P(w|normal))
0,lottery,2,0,22.73,4.55,-1.4816,-3.091
1,get,1,0,13.64,4.55,-1.9924,-3.091
2,to,0,1,4.55,13.64,-3.091,-1.9924
3,scholarship,0,1,4.55,13.64,-3.091,-1.9924
4,contact,0,1,4.55,13.64,-3.091,-1.9924
5,you,2,2,22.73,22.73,-1.4816,-1.4816
6,free,3,2,31.82,22.73,-1.1451,-1.4816
7,ticket,1,0,13.64,4.55,-1.9924,-3.091
8,me,1,1,13.64,13.64,-1.9924,-1.9924
9,won,0,1,4.55,13.64,-3.091,-1.9924


In [71]:
nbc.type_filtering()

('free,lottery가 포함됐을때 spam일 확률 : 87.49988899428504',
 'free,lottery가 포함됐을때 normal일 확률 : 12.500111005714952')