# 문서 분류 (Document Classification)

# 1 나이브 베이즈 분류(Naive Bayes Classifier)

## 1.1 직접구현

### Naive Bayes Classifier

In [None]:
# Toy labelled corpus: every entry is [message text, label],
# where label 1 = spam and label 0 = normal.
training_set = [
    ["me free lottery", 1],
    ["free get free you", 1],
    ["you free scholarship", 0],
    ["free to contact me", 0],
    ["you won award", 0],
    ["you ticket lottery", 1],
]

### 토큰 빈도수 및 문서별 토큰수 계산 (확률 계산을 위한 준비)

![대체 텍스트](https://wikimedia.org/api/rest_v1/media/math/render/svg/98f086c560aa2f66650060277dda4f90e54e30c0)

In [None]:
from collections import defaultdict

# Per-token occurrence counts, stored as
# [occurrences in label-0 (normal) docs, occurrences in label-1 (spam) docs].
wordfreq = defaultdict(lambda: [0, 0])
for doc, label in training_set:
    for word in doc.split():
        # The label doubles as the index into the two-slot counter.
        wordfreq[word][label] += 1

# Total number of tokens seen in each class, summed over the whole vocabulary.
doccnt0 = sum(per_label[0] for per_label in wordfreq.values())
doccnt1 = sum(per_label[1] for per_label in wordfreq.values())

wordfreq

In [None]:
# Show the total token count accumulated for label 0 (normal messages).
doccnt0

In [None]:
# Show the total token count accumulated for label 1 (spam messages).
doccnt1

### Training : 토큰별 조건부 확률 계산 

In [None]:
# Additive smoothing constant: keeps a token that never occurs in one class
# from getting a zero probability for that class.
k = 0.5

# The counts are token frequencies and doccnt0/doccnt1 are per-class token
# totals, so this is the multinomial estimate. Smoothing adds k once for
# every vocabulary entry, hence the denominator must grow by k * |V|
# (not 2 * k) for the per-class token probabilities to sum to 1.
vocab_size = len(wordfreq)

# wordprobs[token] = [P(token | label 0), P(token | label 1)]
wordprobs = defaultdict(lambda: [0, 0])
for key, (cnt0, cnt1) in wordfreq.items():
    wordprobs[key][0] = (cnt0 + k) / (doccnt0 + k * vocab_size)
    wordprobs[key][1] = (cnt1 + k) / (doccnt1 + k * vocab_size)

wordprobs

### Classify : 신규 텍스트가 주어졌을 때 확률 계산

In [None]:
import math

doc = "free lottery"
tokens = doc.split()

# Work in log space (log 1 = 0 as the starting value) so that multiplying
# many small probabilities does not underflow.
log_prob1 = log_prob0 = 0.0

# wordprobs values are stored as [P(token | label 0), P(token | label 1)],
# so they must be unpacked in that order; unpacking them reversed would
# credit each class with the other class's likelihood.
for word, (prob0, prob1) in wordprobs.items():
    if word in tokens:
        log_prob0 += math.log(prob0)
        log_prob1 += math.log(prob1)

# Add the log prior of each class.
# NOTE(review): doccnt0/doccnt1 are token totals, not document counts, so
# this prior is the class's share of tokens -- confirm that is intended.
log_prob0 += math.log(doccnt0 / (doccnt0 + doccnt1))
log_prob1 += math.log(doccnt1 / (doccnt0 + doccnt1))

# Back to probability space; the prints normalise the two scores so the
# percentages add up to 100.
prob0 = math.exp(log_prob0)
prob1 = math.exp(log_prob1)

print(doc)
print("정상확률 : {}%".format(prob0 / (prob0 + prob1)*100))
print("스팸확률 : {}%".format(prob1 / (prob0 + prob1)*100))

In [None]:
# Show log_prob0 as left by the classification cell
# (summed token log terms plus the log prior).
log_prob0

In [None]:
# Show prob0, i.e. exp(log_prob0) before normalisation.
prob0

In [None]:
# Show prob1, i.e. exp(log_prob1) before normalisation.
prob1

## 1.2 sklearn 활용 (영문 뉴스 분류)

- naive_bayes.MultinomialNB() : 빈도수 기반 Naive Bayes Classifier

### 뉴스 데이터 다운로드



In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load the shuffled training split of the 20 Newsgroups corpus.
twenty_train = fetch_20newsgroups(shuffle=True, subset='train')

# Print the news category names, then the raw text of the first article.
print(twenty_train.target_names)
print(twenty_train.data[0])

### 문서 분류(파이프 라인 사용)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weighting -> multinomial naive Bayes, built and
# trained in one expression (fit() returns the fitted pipeline itself).
# The step names are the prefixes used in '<step>__<param>' parameter keys.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(twenty_train.data, twenty_train.target)

In [None]:
# Evaluating the bare name makes the notebook echo the CountVectorizer class.
CountVectorizer

In [None]:
import numpy as np
# Load the test split and predict a category for every article in it.
twenty_test = fetch_20newsgroups(shuffle=True, subset='test')
predicted = text_clf.predict(twenty_test.data)

# Accuracy: fraction of predictions that equal the true labels.
(predicted == twenty_test.target).mean()

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
# Search grid; keys use the '<pipeline step>__<parameter>' form.
parameters_clf = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
}

# Build the search and run it on the training split in one expression
# (fit() returns the fitted search object itself).
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters_clf,
    n_jobs=-1,
    verbose=2,
).fit(twenty_train.data, twenty_train.target)

# Report the best cross-validation score and every parameter of the
# winning pipeline, in alphabetical order.
print("Best score: {0}".format(gs_clf.best_score_))
print("Best parameters set:")
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(best_parameters):
    print("\t{0}: {1}".format(param_name, best_parameters[param_name]))

### Parameter 적용

In [None]:
import numpy as np
# Test-split accuracy of the best pipeline found by the grid search.
predicted = gs_clf.best_estimator_.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

# 2 서포트 벡터 머신(SVM, Support Vector Machine)

- linear_model.SGDClassifier() : 선형 경사하강법 분류 모델

### 뉴스 데이터 다운로드

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load the shuffled training split of the 20 Newsgroups corpus.
twenty_train = fetch_20newsgroups(shuffle=True, subset='train')

# Print the news category names, then the raw text of the first article.
print(twenty_train.target_names)
print(twenty_train.data[0])

### 문서 분류 (파이프 라인 사용)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
# Token counts -> TF-IDF weighting -> SGD-trained linear classifier with
# hinge loss and an L2 penalty; built and trained in one expression
# (fit() returns the fitted pipeline itself).
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        n_iter_no_change=5,
        random_state=42,
    )),
]).fit(twenty_train.data, twenty_train.target)

# Accuracy on the test split: fraction of predictions equal to the labels.
predicted_svm = text_clf_svm.predict(twenty_test.data)
(predicted_svm == twenty_test.target).mean()

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
# Search grid; keys use the '<pipeline step>__<parameter>' form, so
# 'clf-svm__alpha' addresses `alpha` of the 'clf-svm' step.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)

# A notebook cell echoes only its final expression, so show the score and
# the parameters together as one tuple; a bare score expression on its own
# line before the last one would be evaluated and silently discarded.
gs_clf_svm.best_score_, gs_clf_svm.best_params_

In [None]:
# Report the best cross-validation score and every parameter of the
# winning SVM pipeline, in alphabetical order.
print("Best score: {0}".format(gs_clf_svm.best_score_))
print("Best parameters set:")
best_parameters = gs_clf_svm.best_estimator_.get_params()
for param_name in sorted(best_parameters):
    print("\t{0}: {1}".format(param_name, best_parameters[param_name]))