### 나이브 베이즈(Naive Bayes)
- 확률 기반 머신러닝 분류 알고리즘 대표격


##### 가우시안 나이브 베이즈를 이용한 붓꽃 분류

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import numpy as np
# Seed NumPy's global random number generator so that code relying on it
# produces the same results on every run.
np.random.seed(5)

In [2]:
# Load the iris measurements from CSV and display the resulting frame.
df = pd.read_csv("../Data/iris.csv")
df

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Target: the species name. Features: every column from SepalLength
# through PetalWidth (label-based slices in .loc include both ends).
label = df['Name']
data = df.loc[:, 'SepalLength':'PetalWidth']

In [4]:
# Hold out 20% of the rows as a test set. stratify=label keeps the class
# proportions the same in the training and test splits.
# random_state is pinned so the split is reproducible on its own, instead of
# depending on the state of NumPy's global RNG (and therefore on which cells
# were executed, and in which order, before this one).
train_data, test_data, train_target, test_target = train_test_split(
    data,
    label,
    test_size=0.2,
    stratify=label,
    random_state=5,
)

In [5]:
# Gaussian Naive Bayes: fit on the training split, then predict the class
# of every held-out test row. fit() returns the estimator itself, so the
# two steps can be chained while `gn` still refers to the fitted model.
gn = GaussianNB()
pred = gn.fit(train_data, train_target).predict(test_data)

In [6]:
# Fraction of test rows whose predicted class equals the true class.
metrics.accuracy_score(test_target, pred)

0.9666666666666667

In [7]:
# Number of test rows per class.
test_target.value_counts()

Name
Iris-setosa        10
Iris-virginica     10
Iris-versicolor    10
Name: count, dtype: int64

In [8]:
# Confusion matrix of true labels (first argument) against predictions
# (second argument). Per the scikit-learn convention, rows correspond to the
# true class and columns to the predicted class.
metrics.confusion_matrix(test_target, pred)

array([[10,  0,  0],
       [ 0, 10,  0],
       [ 0,  1,  9]], dtype=int64)

---
##### 베르누이 나이브베이즈를 활용한 스팸 분류

In [9]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Load the e-mail training data (title text plus a boolean spam flag) and
# display it.
df = pd.read_csv("../Data/email_train.csv")
df

Unnamed: 0,email title,spam
0,free game only today,True
1,cheapest flight deak,True
2,limited time offer only today only today,True
3,today meeting schedule,False
4,your flight schedule attached,False
5,your credit card statement,False


In [11]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   email title  6 non-null      object
 1   spam         6 non-null      bool  
dtypes: bool(1), object(1)
memory usage: 186.0+ bytes


### 정제
: spam 컬럼의 True, False => 1, 0으로 치환

In [12]:
# Encode the boolean spam flag as an integer label: True -> 1, False -> 0.
# Series.map looks each element of the column up in the dict, so no
# explicit loop is needed.
df['label'] = df['spam'].map({True: 1, False: 0})

df

Unnamed: 0,email title,spam,label
0,free game only today,True,1
1,cheapest flight deak,True,1
2,limited time offer only today only today,True,1
3,today meeting schedule,False,0
4,your flight schedule attached,False,0
5,your credit card statement,False,0


In [13]:
# Re-check the frame summary now that the integer `label` column exists.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   email title  6 non-null      object
 1   spam         6 non-null      bool  
 2   label        6 non-null      int64 
dtypes: bool(1), int64(1), object(1)
memory usage: 234.0+ bytes


In [14]:
# Feature: the raw title text. Target: the 0/1 spam label.
data, label = df['email title'], df['label']


베르누이 나이브베이즈의 입력데이터는 고정된 크기의 벡터로 0과 1로 구분된 데이터만 사용 가능.            
CountVectorizer는 입력된 메일에 출현한 모든 단어의 개수만큼의 크기를 갖는 벡터를 만든 뒤, 각 이메일을 이 고정된 크기의 벡터로 표현한다.

In [15]:
# binary=True: each entry records only whether a word occurs (0 or 1),
# not how many times it occurs.
cv = CountVectorizer(binary=True)
# Learn the vocabulary from the training titles and encode them in one step.
data_cv = cv.fit_transform(data)

In [16]:
# Convert the vectorizer output to a dense NumPy array so it can be inspected.
encoded_input = data_cv.toarray()
encoded_input

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]], dtype=int64)

In [17]:
len(encoded_input[0]) # length of one encoded row, i.e. the number of columns (17 here)

17

In [18]:
# (number of titles, vocabulary size)
encoded_input.shape

(6, 17)

In [19]:
# Column names: the word that each vector position stands for.
# NOTE: further text preprocessing would help here, e.g. reducing past-tense
# or progressive verb forms and superlatives to their base form.
cv.get_feature_names_out()

array(['attached', 'card', 'cheapest', 'credit', 'deak', 'flight', 'free',
       'game', 'limited', 'meeting', 'offer', 'only', 'schedule',
       'statement', 'time', 'today', 'your'], dtype=object)

In [20]:
# Vocabulary size; matches the number of columns of the encoded matrix.
len(cv.get_feature_names_out())

17

# 벡터로 인코딩 된 제목에 어떤 단어가 있는지 궁금하면?

In [21]:
# Recover the words present in the first encoded title. The 1-D row is
# reshaped to 2-D (one row, all columns) because inverse_transform expects
# a 2-D input.
cv.inverse_transform(encoded_input[0].reshape(1, -1))

[array(['free', 'game', 'only', 'today'], dtype='<U9')]

In [22]:
encoded_input[0] # 1-D: a single row

array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

In [23]:
encoded_input[0].reshape(1, -1) # 2-D: one row, column count inferred

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]], dtype=int64)

In [24]:
# Model: Bernoulli Naive Bayes trained on the binary word-presence vectors
# and the 0/1 spam labels.
bnb = BernoulliNB()
bnb.fit(data_cv, label)

### Test

In [25]:
# Load the held-out e-mail titles used to evaluate the spam classifier.
test_df = pd.read_csv("../Data/email_test.csv")
test_df

Unnamed: 0,email title,spam
0,free flight offer,True
1,hey traveler free flight deal,True
2,limited free game iffer,True
3,today flight schedule,False
4,your credit card attached,False
5,free credit card offer only today,False


In [26]:
# Prepare the test set the same way as the training set: raw titles as
# features, the spam flag encoded as True -> 1 / False -> 0 as the target.
test_data = test_df['email title']
test_df['label'] = test_df['spam'].map({True: 1, False: 0})
test_target = test_df['label']
# Only transform() here: the vocabulary fitted on the training titles is
# reused, never re-fitted on test data.
test_cv = cv.transform(test_data)

In [27]:
# Predict spam (1) / not spam (0) for every encoded test title and show the result.
pred = bnb.predict(test_cv)
pred

array([1, 1, 1, 0, 0, 1], dtype=int64)

In [28]:
# Fraction of test titles whose predicted label equals the true label.
metrics.accuracy_score(test_target, pred)

0.8333333333333334

---
### 다항분포 나이브베이즈 영화리뷰 감정 분류

In [29]:
from sklearn.naive_bayes import MultinomialNB


In [30]:
# Load the movie-review training data (review text plus a sentiment type)
# and display it.
df = pd.read_csv("../Data/naive_movie.csv")
df

Unnamed: 0,movie_review,type
0,this is great great movie. I will watch again,positive
1,I like this movie,positive
2,amazing movie in this year,positive
3,cool my boyfriend also said the movie is cool,positive
4,awesome of the awesome movie ever,positive
5,shame I wasted money and time,negative
6,regret on this move. I will never never what m...,negative
7,I do not like this movie,negative
8,I do not like actors in this movie,negative
9,boring boring sleeping movie,negative


In [31]:
# (number of reviews, number of columns)
df.shape

(10, 2)

In [32]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie_review  10 non-null     object
 1   type          10 non-null     object
dtypes: object(2)
memory usage: 292.0+ bytes


In [33]:
# The Naive Bayes model works on numbers, so encode the sentiment strings
# as integers. 'positive' is given 1, i.e. it is treated as the class of
# interest; 'negative' is given 0.
df['label'] = df['type'].map({'positive': 1, 'negative': 0})
df

Unnamed: 0,movie_review,type,label
0,this is great great movie. I will watch again,positive,1
1,I like this movie,positive,1
2,amazing movie in this year,positive,1
3,cool my boyfriend also said the movie is cool,positive,1
4,awesome of the awesome movie ever,positive,1
5,shame I wasted money and time,negative,0
6,regret on this move. I will never never what m...,negative,0
7,I do not like this movie,negative,0
8,I do not like actors in this movie,negative,0
9,boring boring sleeping movie,negative,0


In [34]:
# Re-check the frame summary now that the integer `label` column exists.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie_review  10 non-null     object
 1   type          10 non-null     object
 2   label         10 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 372.0+ bytes


In [35]:
# Separate the data for training: review text as the feature, the 0/1
# sentiment label as the target.
feature, target = df['movie_review'], df['label']

다항분포 나이브베이즈의 입력 데이터는 고정된 크기의 벡터로써 각각의 인덱스는 단어의 빈도수로 구분된 데이터 이어야 합니다.       
CountVectorizer는 입력된 데이터에 출현한 모든 단어의 개수만큼의 크기를 갖는 벡터를 만든 뒤, 각각의 리뷰를 이 고정된 크기의 벡터로 표현합니다.

In [36]:
# Default CountVectorizer: each entry is the number of times a word occurs
# in the review (word frequencies, as multinomial Naive Bayes expects).
cv = CountVectorizer()
# Learn the vocabulary from the training reviews and encode them in one step.
traincv = cv.fit_transform(feature)
# Dense copy of the encoded reviews, for inspection only.
encoded_input = traincv.toarray()
encoded_input

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2,
        0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0

In [37]:
# Show the words of the movie-review vocabulary, i.e. which word each
# position of the encoded vectors stands for.
cv.get_feature_names_out()

array(['actors', 'again', 'also', 'amazing', 'and', 'awesome', 'boring',
       'boyfriend', 'cool', 'director', 'do', 'ever', 'from', 'great',
       'in', 'is', 'like', 'money', 'move', 'movie', 'my', 'never', 'not',
       'of', 'on', 'regret', 'said', 'shame', 'sleeping', 'the', 'this',
       'time', 'wasted', 'watch', 'what', 'will', 'year'], dtype=object)

In [39]:
# Multinomial Naive Bayes classifier trained on the word-count vectors and
# the 0/1 sentiment labels.
mnb = MultinomialNB()
mnb.fit(traincv, target)

In [40]:
# Test data: load the held-out reviews and preview the first rows.
test_df = pd.read_csv("../Data/naive_movie_test.csv")
test_df.head()

Unnamed: 0,movie_review,type
0,great great great movie ever,positive
1,I like this amazing movie,positive
2,my boyfriend said great movie ever,positive
3,cool cool cool,positive
4,awesome boyfriend said cool movie ever,positive


In [41]:
# Prepare the test set like the training set: review text as the feature,
# sentiment encoded as positive -> 1 / negative -> 0 as the target.
test_data = test_df['movie_review']
test_df['label'] = test_df['type'].map({'positive': 1, 'negative': 0})
test_target = test_df['label']

In [42]:
# Encode the test reviews with the vocabulary fitted on the training reviews
# (transform only, no re-fitting), then predict their sentiment labels.
testcv = cv.transform(test_data)
pred = mnb.predict(testcv)

In [43]:
# Fraction of test reviews whose predicted label equals the true label.
metrics.accuracy_score(test_target, pred)

1.0