# Chapter 3 군집화

#### 1. 텍스트의 토큰화
#### 2. 관련 게시물을 찾는 데 도움이 되지 않는, 너무 자주 나타나는 단어 배제
#### 3. 미래의 게시물에서 나타날 빈도가 낮은 단어 배제
#### 4. 남은 단어 세기
#### 5. 전체 텍스트 뭉치를 고려한 TF-IDF 값 계산

## 단어 주머니 만들기

In [2]:
# CountVectorizer turns a collection of texts into a bag-of-words count matrix
from sklearn.feature_extraction.text import CountVectorizer
# min_df is the minimum frequency a word needs in order to be kept
vectorizer = CountVectorizer(min_df=1)

In [3]:
content = ["How to format my hard disk", " Hard disk format problems"]

# Learn the vocabulary from the list of texts and vectorize them
X = vectorizer.fit_transform(content)
# Show the learned vocabulary (7 words in the recorded output)
vectorizer.get_feature_names()

[u'disk', u'format', u'hard', u'how', u'my', u'problems', u'to']

In [4]:
# Print the vectorized counts as a dense array, transposed
print(X.toarray().T)

[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


## 단어 세기

In [5]:
import os
import sys

import scipy as sp

from utils import DATA_DIR

# Show the data directory imported from utils
print(DATA_DIR)

/home/goofy/문서/projects/private_ubuntu/BuildingMachineLearningSystemsWithPython/ch03/data


In [6]:
# Read the text of every file in the toy sample directory

TOY_DIR = os.path.join(DATA_DIR, "toy")

# Each file is opened in a with-block so its handle is closed right after
# reading. The os.listdir order is kept as-is, so post indices follow the
# directory listing.
posts = []
for file_name in os.listdir(TOY_DIR):
    with open(os.path.join(TOY_DIR, file_name)) as post_file:
        posts.append(post_file.read())

print(posts)

['This is a toy post about machine learning. Actually, it contains not much interesting stuff.', 'Imaging databases store data.', 'Imaging databases store data. Imaging databases store data. Imaging databases store data.', 'Most imaging databases save images permanently.\n', 'Imaging databases provide storage capabilities.']


In [7]:
# Vectorize the posts with the basic CountVectorizer
X_train = vectorizer.fit_transform(posts)

# The matrix shape gives (number of posts, number of distinct words)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

#samples: 5, #features: 25


In [8]:
# The new post to compare against the existing posts
new_post = "imaging databases"

# Vectorize the new post with the already fitted vectorizer; the result is sparse
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec, type(new_post_vec))
print(new_post_vec.toarray())
print(vectorizer.get_feature_names())

(<1x25 sparse matrix of type '<type 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>, <class 'scipy.sparse.csr.csr_matrix'>)
[[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[u'about', u'actually', u'capabilities', u'contains', u'data', u'databases', u'images', u'imaging', u'interesting', u'is', u'it', u'learning', u'machine', u'most', u'much', u'not', u'permanently', u'post', u'provide', u'save', u'storage', u'store', u'stuff', u'this', u'toy']


In [9]:
# 벡터간 거리 측정 메소드

# Plain Euclidean distance
def dist_raw(v1, v2):
    """Return the Euclidean length of ``v1 - v2`` without any scaling.

    The difference of the two arguments must provide ``toarray()``, as the
    sparse rows used in this notebook do.
    """
    return sp.linalg.norm((v1 - v2).toarray())

# Distance between the vectors after scaling each to unit length
def dist_norm(v1, v2):
    """Return the Euclidean distance between the unit-length versions of v1 and v2.

    Scaling each vector by its own length makes the result independent of how
    often a text repeats the same words.

    Args:
        v1, v2: vectors that support ``toarray()``, division by a scalar and
            subtraction (the sparse rows used in this notebook do).

    Returns:
        The length of ``v1/|v1| - v2/|v2|``. A vector of length zero has no
        direction and is used unscaled instead of being divided by zero.
    """
    def _unit(vec):
        # Scale vec to length 1; leave a zero-length vector untouched.
        length = sp.linalg.norm(vec.toarray())
        return vec / length if length > 0 else vec

    delta = _unit(v1) - _unit(v2)

    return sp.linalg.norm(delta.toarray())

In [10]:
# Find the post closest to the new post using the raw distance
dist = dist_raw

best_i = None
best_dist = sys.maxsize

# X_train has one row per entry of posts, so enumerate visits the same indices
for i, post in enumerate(posts):
    # The new post is the query itself, so it is never a candidate
    if post == new_post:
        continue

    post_vec = X_train.getrow(i)
    d = dist(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))

    # Remember the closest post so far; on ties the earlier post is kept
    if d < best_dist:
        best_i, best_dist = i, d

print("Best post is %i with dist=%.2f" % (best_i, best_dist))

=== Post 0 with dist=4.00: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=1.41: Imaging databases store data.
=== Post 2 with dist=5.10: Imaging databases store data. Imaging databases store data. Imaging databases store data.
=== Post 3 with dist=2.00: Most imaging databases save images permanently.

=== Post 4 with dist=1.73: Imaging databases provide storage capabilities.
Best post is 1 with dist=1.41


In [11]:
# Posts 1 and 2 differ only in how many times the same sentence is repeated,
# yet their distances to the new post are far apart

print(X_train.getrow(1).toarray())
print(X_train.getrow(2).toarray())

[[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
[[0 0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]


In [12]:
# Find the post closest to the new post using the normalized distance
dist = dist_norm

best_i = None
best_dist = sys.maxsize

# X_train has one row per entry of posts, so enumerate visits the same indices
for i, post in enumerate(posts):
    # The new post is the query itself, so it is never a candidate
    if post == new_post:
        continue

    post_vec = X_train.getrow(i)
    d = dist(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))

    # Remember the closest post so far; on ties the earlier post is kept
    if d < best_dist:
        best_i, best_dist = i, d

print("Best post is %i with dist=%.2f" % (best_i, best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.77: Imaging databases store data.
=== Post 2 with dist=0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
=== Post 3 with dist=0.92: Most imaging databases save images permanently.

=== Post 4 with dist=0.86: Imaging databases provide storage capabilities.
Best post is 1 with dist=0.77


## 덜 중요한 단어 삭제

In [13]:
# Can be installed with: sudo pip install nltk
# nltk: Natural Language Toolkit
import nltk.stem

# Stemmer used to reduce English words to their stems
english_stemmer = nltk.stem.SnowballStemmer('english')

In [14]:
# Try the stemmer on a few sample words
for sample_word in ('graphics', 'buying', 'imagination'):
    print("{0} : {1}".format(sample_word, english_stemmer.stem(sample_word)))

graphics : graphic
buying : buy
imagination : imagin


In [15]:
# CountVectorizer whose analyzer additionally stems every token
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that reduces each analyzed token to its stem."""

    def build_analyzer(self):
        """Override CountVectorizer.build_analyzer to add stemming.

        Returns a callable that takes a document and yields the tokens of the
        parent analyzer, each passed through english_stemmer.stem.
        """
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_analyzer

# Disabled alternative kept from the original: a plain CountVectorizer that is
# handed a stemmer as its preprocessor
# vectorizer = CountVectorizer(min_df=1, stop_words='english',
# preprocessor=stemmer)
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

In [16]:
# Vectorize the posts again, now with the stemming vectorizer
X_train = vectorizer.fit_transform(posts)

# The matrix shape gives (number of posts, number of distinct features)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

#samples: 5, #features: 17


In [17]:
# Why did the number of features go down? Inspect the stop words in use
vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [18]:
# List the vocabulary learned by the stemming vectorizer
print(vectorizer.get_feature_names())

[u'actual', u'capabl', u'contain', u'data', u'databas', u'imag', u'interest', u'learn', u'machin', u'perman', u'post', u'provid', u'save', u'storag', u'store', u'stuff', u'toy']


### 강화된 불용어 (stop words)

In [19]:
# Example implementation of TF-IDF (term frequency - inverse document frequency)
def tfidf(t, d, D):
    """Return the TF-IDF weight of term t in document d, given the corpus D.

    The weight expresses how important t is for d within the document set D.

    Args:
        t: the term to score.
        d: the document, as a sequence of terms.
        D: the corpus, as a list of documents.

    Returns:
        ``tf * idf`` as a float, where tf is the share of d's terms equal to t
        and idf is ``log(len(D) / number of documents in D containing t)``.
        0.0 when t does not occur in d, which includes an empty d.

    Raises:
        ZeroDivisionError: if t occurs in d but in no document of D.
    """
    # math.log replaces scipy's deprecated top-level ``log`` alias.
    import math

    occurrences = d.count(t)
    if occurrences == 0:
        # The weight is zero anyway; returning early avoids dividing by
        # len(d) == 0 or by a document frequency of zero.
        return 0.0

    # Term frequency: the counts of all distinct terms add up to len(d).
    tf = float(occurrences) / len(d)
    # Inverse document frequency: the fewer documents contain t, the larger it is.
    docs_with_term = sum(1 for doc in D if t in doc)
    idf = math.log(float(len(D)) / docs_with_term)
    return tf * idf


# Three toy documents forming a small corpus
a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
D = [a, abb, abc]

# Print the TF-IDF weight for a handful of (term, document) pairs
for term, document in (("a", a), ("b", abb), ("a", abc), ("b", abc), ("c", abc)):
    print(tfidf(term, document, D))

0.0
0.270310072072
0.0
0.135155036036
0.366204096223


In [20]:
# 용어 빈도-역 문서 빈도 TF-IDF, term frequency-inverse document frequency
from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant that reduces each analyzed token to its stem."""

    def build_analyzer(self):
        """Override TfidfVectorizer.build_analyzer to add stemming.

        Returns a callable that takes a document and yields the tokens of the
        parent analyzer, each passed through english_stemmer.stem.
        """
        base_analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_analyzer

# Replace the vectorizer with the stemming TF-IDF variant, using the built-in
# English stop-word list
vectorizer = StemmedTfidfVectorizer(
    min_df=1, stop_words='english', decode_error='ignore')

In [21]:
# Vectorize the posts with the TF-IDF vectorizer
X_train = vectorizer.fit_transform(posts)

# The matrix shape gives (number of posts, number of distinct features)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

#samples: 5, #features: 17


In [22]:
# The difference between posts 1 and 2 is gone: compare their TF-IDF rows

print(X_train.getrow(1).toarray())
print(X_train.getrow(2).toarray())

[[ 0.          0.          0.          0.57974759  0.40483667  0.40483667
   0.          0.          0.          0.          0.          0.          0.
   0.          0.57974759  0.          0.        ]]
[[ 0.          0.          0.          0.57974759  0.40483667  0.40483667
   0.          0.          0.          0.          0.          0.          0.
   0.          0.57974759  0.          0.        ]]


In [23]:
# The new post to compare against the existing posts
new_post = "imaging databases"

# Vectorize the new post with the fitted TF-IDF vectorizer; the result is sparse
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec, type(new_post_vec))
print(new_post_vec.toarray())
print(vectorizer.get_feature_names())

(<1x17 sparse matrix of type '<type 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>, <class 'scipy.sparse.csr.csr_matrix'>)
[[ 0.          0.          0.          0.          0.70710678  0.70710678
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]]
[u'actual', u'capabl', u'contain', u'data', u'databas', u'imag', u'interest', u'learn', u'machin', u'perman', u'post', u'provid', u'save', u'storag', u'store', u'stuff', u'toy']


In [24]:
# Find the post closest to the new post on the TF-IDF vectors,
# using the normalized distance
dist = dist_norm

best_dist = sys.maxsize
best_i = None

# X_train has one row per entry of posts, so enumerate visits the same indices
for i, post in enumerate(posts):
    # The new post is the query itself, so it is never a candidate
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    # dist_norm scales both vectors itself, so nothing is normalized here
    d = dist(post_vec, new_post_vec)

    print("=== Post %i with dist=%.2f: %s" % (i, d, post))

    # Remember the closest post so far; on ties the earlier post is kept
    if d < best_dist:
        best_dist = d
        best_i = i

print("Best post is %i with dist=%.2f" % (best_i, best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.92: Imaging databases store data.
=== Post 2 with dist=0.92: Imaging databases store data. Imaging databases store data. Imaging databases store data.
=== Post 3 with dist=0.86: Most imaging databases save images permanently.

=== Post 4 with dist=1.08: Imaging databases provide storage capabilities.
Best post is 3 with dist=0.86


## 하지만
### 단어간의 관련성을 다룰 수 없다.
### 부정적인 의미를 인지할 수 없다.
### 오기(오타)는 치명적이다.