# Naive Bayes

### 스팸메일 분류

---

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.naive_bayes import MultinomialNB as MNB


#### 데이터 로드

In [None]:
# Read the spam dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv('data/spam.csv', encoding='utf-8')

In [None]:
# Display the loaded DataFrame
df

In [None]:
# Summarize columns, dtypes and non-null counts
df.info()

In [None]:
# Count rows per value of the 'type' column (used as the target label further down)
df['type'].value_counts()

In [None]:
# The 'text' column is the corpus that gets vectorized
corpus = df['text']

#### 데이터 전처리

In [None]:
# Ignore terms that appear in fewer than 5 documents
# (min_df is a document-frequency threshold, not a total occurrence count)
vectorizer = CountVectorizer(min_df=5)

In [None]:
# Build the document-term matrix and store it in X
# Each entry is how many times a vocabulary term occurs in a document (CountVectorizer output)

X = vectorizer.fit_transform(corpus)

In [None]:
X.shape

In [None]:
type(X)

In [None]:
print(X[:1]) # print the first row of the count matrix as a sample

In [None]:
# Convert the vectorizer output to an array; this rebinds X, so the cells above
# see the vectorizer's matrix and the cells below see the array
X = X.toarray()

In [None]:
# Target data (spam or ham)
y = df['type']

#### 데이터 분리: 학습 데이터 + 테스트 데이터

In [None]:
# Hold out 25% of the samples as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

### 1.  Gaussian Naive Bayes

In [None]:
# 모델 생성
model = GNB()

In [None]:
# 모델 학습
model.fit(X_train, y_train)

In [None]:
# 결과 예측
prediction1 = model.predict(X_test)
prediction1

In [None]:
# Score - precision, recall, f1-score
print(metrics.classification_report(y_test, prediction1))

In [None]:
# 정확도 확인
print('Accuracy - Gaussian Naive Bayes:', metrics.accuracy_score(prediction1, y_test))

In [None]:
# Confusion Matrix
pd.crosstab(prediction1, y_test, margins=True)

### 2. Multinomial Naive Bayes

In [None]:
# 모델 생성
model = MNB()

In [None]:
# 모델 학습
model.fit(X_train, y_train)

In [None]:
# 결과 예측
prediction2 = model.predict(X_test)
prediction2

In [None]:
# Score - precision, recall, f1-score
print(metrics.classification_report(y_test, prediction2))

In [None]:
# 정확도 확인
print('Accuracy - Multimomial Naive Bayes:', metrics.accuracy_score(prediction2, y_test))

In [None]:
# Confusion Matrix
pd.crosstab(prediction2, y_test, margins=True)

---

In [None]:
# end of file