# 문서 분류 (Document Classification)

# 1 나이브 베이즈 분류(Naive Bayes Classifier)

## 1.1 직접구현

### Naive Bayes Classifier

In [1]:
training_set = [
    ['me free lottery', 1],
    ['free get free you', 1],
    ['you free scholarship', 0],
    ['free to contact me', 0],
    ['you won award', 0],
    ['you ticket lottery', 1]
]

### 토큰 빈도수 및 문서별 토큰수 계산 (확률 계산을 위한 준비)

![대체 텍스트](https://wikimedia.org/api/rest_v1/media/math/render/svg/98f086c560aa2f66650060277dda4f90e54e30c0)

In [54]:
unique_tokens = []
token_dict = {
    0:[],
    1:[]
}
for sent, label in training_set:
    tokens = sent.split()
    unique_tokens.extend(tokens)
    token_dict[label].extend(tokens)

token_dict_sub={0:'normal', 1:'spam'}
token_dict = dict((token_dict_sub[key], value) for (key, value) in token_dict.items())
print(token_dict, set(unique_tokens), sep= '\n')    

{'normal': ['you', 'free', 'scholarship', 'free', 'to', 'contact', 'me', 'you', 'won', 'award'], 'spam': ['me', 'free', 'lottery', 'free', 'get', 'free', 'you', 'you', 'ticket', 'lottery']}
{'me', 'get', 'award', 'to', 'won', 'lottery', 'ticket', 'contact', 'free', 'scholarship', 'you'}


In [56]:
token2idx = {token : i for i, token in enumerate(set(unique_tokens))}

In [57]:
print(token2idx)

{'me': 0, 'get': 1, 'award': 2, 'to': 3, 'won': 4, 'lottery': 5, 'ticket': 6, 'contact': 7, 'free': 8, 'scholarship': 9, 'you': 10}


### Training : 토큰별 조건부 확률 계산

In [59]:
prior_lst = []
for token in token2idx:
    token0=token_dict['normal'].count(token)
    token1=token_dict['spam'].count(token)
    prior_lst.append((token0, token1))

prior_lst

[(1, 1),
 (0, 1),
 (1, 0),
 (1, 0),
 (1, 0),
 (0, 2),
 (0, 1),
 (1, 0),
 (2, 3),
 (1, 0),
 (2, 2)]

In [60]:
import pandas as pd

In [62]:
nb_df = pd.DataFrame(prior_lst, index=token2idx, columns=token_dict.keys())

In [63]:
nb_df

Unnamed: 0,normal,spam
me,1,1
get,0,1
award,1,0
to,1,0
won,1,0
lottery,0,2
ticket,0,1
contact,1,0
free,2,3
scholarship,1,0


In [64]:
label_cols = nb_df.columns

In [65]:
k=0.5

for label in label_cols:
    nb_df[f'p_{label}']=(nb_df[label] + k)/ (nb_df[label].sum() + 2*k)

nb_df

Unnamed: 0,normal,spam,p_normal,p_spam
me,1,1,0.136364,0.136364
get,0,1,0.045455,0.136364
award,1,0,0.136364,0.045455
to,1,0,0.136364,0.045455
won,1,0,0.136364,0.045455
lottery,0,2,0.045455,0.227273
ticket,0,1,0.045455,0.136364
contact,1,0,0.136364,0.045455
free,2,3,0.227273,0.318182
scholarship,1,0,0.136364,0.045455


In [66]:
import numpy as np

In [67]:
for label in label_cols:
    nb_df[f'log_p_{label}'] = np.log(nb_df[f'p_{label}'])

nb_df

Unnamed: 0,normal,spam,p_normal,p_spam,log_p_normal,log_p_spam
me,1,1,0.136364,0.136364,-1.99243,-1.99243
get,0,1,0.045455,0.136364,-3.091042,-1.99243
award,1,0,0.136364,0.045455,-1.99243,-3.091042
to,1,0,0.136364,0.045455,-1.99243,-3.091042
won,1,0,0.136364,0.045455,-1.99243,-3.091042
lottery,0,2,0.045455,0.227273,-3.091042,-1.481605
ticket,0,1,0.045455,0.136364,-3.091042,-1.99243
contact,1,0,0.136364,0.045455,-1.99243,-3.091042
free,2,3,0.227273,0.318182,-1.481605,-1.145132
scholarship,1,0,0.136364,0.045455,-1.99243,-3.091042


### Classify : 신규 텍스트가 주어졌을 때 확률 계산

In [69]:
tar_text = 'free lottery'
tar_tokens = tar_text.split()
log_event_prob = []
for label in label_cols:
    log_event_prob.append(np.log(nb_df[label].sum()/nb_df[label_cols].sum().sum()))

log_event_prob

log_sum_norm = log_event_prob[0]
log_sum_spam = log_event_prob[1]
for token in tar_tokens:
    log_sum_norm+=nb_df['log_p_normal'][token]
    log_sum_spam+=nb_df['log_p_spam'][token]

spam_prob = np.exp(log_sum_spam)/ (np.exp(log_sum_spam)+np.exp(log_sum_norm))
norm_prob = 1-spam_prob
spam_prob, norm_prob


(0.8749999999999999, 0.1250000000000001)

## 1.2 sklearn 활용 (영문 뉴스 분류)

- naive_bayes.MultinomialNB() : 빈도수 기반 Naive Bayse Classifier

### 뉴스 데이터 다운로드



In [42]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print(twenty_train.target_names) #뉴스 카테고리 출력
print(twenty_train.data[0]) #뉴스 데이터 출력

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have o

### 문서 분류(파이프 라인 사용)

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [44]:
CountVectorizer

sklearn.feature_extraction.text.CountVectorizer

In [46]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
accuracy = np.mean(predicted == twenty_test.target)

In [47]:
accuracy

0.7738980350504514

### Grid Search

In [48]:
from sklearn.model_selection import GridSearchCV
parameters_clf = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
}
gs_clf = GridSearchCV(text_clf, parameters_clf, n_jobs=-1, verbose=2)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
print("Best score: {0}".format(gs_clf.best_score_))
print("Best parameters set:")
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(list(best_parameters.keys())):
    print("\t{0}: {1}".format(param_name, best_parameters[param_name]))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best score: 0.8518650274101537
Best parameters set:
	clf: MultinomialNB()
	clf__alpha: 1.0
	clf__class_prior: None
	clf__fit_prior: True
	memory: None
	steps: [('vect', CountVectorizer(ngram_range=(1, 2))), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())]
	tfidf: TfidfTransformer()
	tfidf__norm: l2
	tfidf__smooth_idf: True
	tfidf__sublinear_tf: False
	tfidf__use_idf: True
	vect: CountVectorizer(ngram_range=(1, 2))
	vect__analyzer: word
	vect__binary: False
	vect__decode_error: strict
	vect__dtype: <class 'numpy.int64'>
	vect__encoding: utf-8
	vect__input: content
	vect__lowercase: True
	vect__max_df: 1.0
	vect__max_features: None
	vect__min_df: 1
	vect__ngram_range: (1, 2)
	vect__preprocessor: None
	vect__stop_words: None
	vect__strip_accents: None
	vect__token_pattern: (?u)\b\w\w+\b
	vect__tokenizer: None
	vect__vocabulary: None
	verbose: False


### Parameter 적용

In [49]:
import numpy as np
predicted = gs_clf.best_estimator_.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.765400955921402

# 2 서포트 벡터 머신(SVM, Support Vector Machine)

- linear_model.SGDClassifier() : 선형 경사하강법 분류 모델

### 뉴스 데이터 다운로드

In [50]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print(twenty_train.target_names) #뉴스 카테고리 출력
print(twenty_train.data[0]) #뉴데 이터 출력

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have o

### 문서 분류 (파이프 라인 사용)

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter_no_change=5, random_state=42)),
])
text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)

0.8240839086563994

### Grid Search

In [52]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)
gs_clf_svm.best_score_
gs_clf_svm.best_params_

{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [53]:
print("Best score: {0}".format(gs_clf_svm.best_score_))
print("Best parameters set:")
best_parameters = gs_clf_svm.best_estimator_.get_params()
for param_name in sorted(list(best_parameters.keys())):
    print("\t{0}: {1}".format(param_name, best_parameters[param_name]))

Best score: 0.9051618841994754
Best parameters set:
	clf-svm: SGDClassifier(alpha=0.001, random_state=42)
	clf-svm__alpha: 0.001
	clf-svm__average: False
	clf-svm__class_weight: None
	clf-svm__early_stopping: False
	clf-svm__epsilon: 0.1
	clf-svm__eta0: 0.0
	clf-svm__fit_intercept: True
	clf-svm__l1_ratio: 0.15
	clf-svm__learning_rate: optimal
	clf-svm__loss: hinge
	clf-svm__max_iter: 1000
	clf-svm__n_iter_no_change: 5
	clf-svm__n_jobs: None
	clf-svm__penalty: l2
	clf-svm__power_t: 0.5
	clf-svm__random_state: 42
	clf-svm__shuffle: True
	clf-svm__tol: 0.001
	clf-svm__validation_fraction: 0.1
	clf-svm__verbose: 0
	clf-svm__warm_start: False
	memory: None
	steps: [('vect', CountVectorizer(ngram_range=(1, 2))), ('tfidf', TfidfTransformer()), ('clf-svm', SGDClassifier(alpha=0.001, random_state=42))]
	tfidf: TfidfTransformer()
	tfidf__norm: l2
	tfidf__smooth_idf: True
	tfidf__sublinear_tf: False
	tfidf__use_idf: True
	vect: CountVectorizer(ngram_range=(1, 2))
	vect__analyzer: word
	vect__bin