# Chapter 4. 카운트 기반의 문서 표현



## 4.1 카운트 기반 문서 표현의 개념 : BOW

## 4.2 BOW 기반의 카운트 벡터 생성

In [3]:
import nltk
# Download the movie_reviews corpus package that the following cells read
# through nltk.corpus.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/rose/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [4]:
from nltk.corpus import movie_reviews

# Corpus overview: total size, a sample of file ids, and the category split.
all_fileids = movie_reviews.fileids()
print('#review count: ', len(all_fileids))
print('#samples of file ids: ', all_fileids[:10])
print('#categories of reviews: ', movie_reviews.categories())
for label in ('neg', 'pos'):
    print(f'#Num of "{label}" reviews: ', len(movie_reviews.fileids(categories=label)))

# Look at the first review at three levels: raw text, sentences, word tokens.
fileid = all_fileids[0]
print('#id of the first review: ', fileid)
print('#first review content: \n', movie_reviews.raw(fileid)[:200])
print('#sentence tokenization result: ', movie_reviews.sents(fileid)[:2])
print('#word tokenization result: ', movie_reviews.words(fileid)[:20])

#review count:  2000
#samples of file ids:  ['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']
#categories of reviews:  ['neg', 'pos']
#Num of "neg" reviews:  1000
#Num of "pos" reviews:  1000
#id of the first review:  neg/cv000_29416.txt
#first review content: 
 plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
w
#sentence tokenization result:  [['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.']]
#word tokenization result:  ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 

In [5]:
# Materialize every review as a plain list of the corpus' word tokens.
documents = [list(words) for words in map(movie_reviews.words, movie_reviews.fileids())]
# Peek at the first 50 tokens of the first review.
print(documents[0][:50])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch']


In [6]:
# Tally how often each token occurs across all reviews.
word_count = {}
for text in documents:
    for word in text:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1

# Vocabulary ordered from the most to the least frequent token.
sorted_features = sorted(word_count.keys(), key=lambda w: word_count[w], reverse=True)
# Show the ten most frequent tokens with their counts on a single line.
for word in sorted_features[:10]:
    print(f"count of '{word}': {word_count[word]}", end=', ')

count of ',': 77717, count of 'the': 76529, count of '.': 65876, count of 'a': 38106, count of 'and': 35576, count of 'of': 34123, count of 'to': 31937, count of ''': 30585, count of 'is': 25195, count of 'in': 21822, 

In [7]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Keep only tokens made of 3 or more word characters / apostrophes.
# The pattern is a raw string so that \w reaches the regex engine untouched
# instead of being parsed as an (invalid) Python string escape.
tokenizer = RegexpTokenizer(r"[\w']{3,}")
# A set gives constant-time membership tests in the filter below.
english_stops = set(stopwords.words('english'))

# NOTE: `documents` is rebound here to the raw text of each review, which is
# what the regex tokenizer takes as input.
documents = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids()]
# Tokenize each review and drop English stopwords.
tokens = [
    [token for token in tokenizer.tokenize(doc) if token not in english_stops]
    for doc in documents
]

# Corpus-wide frequency of each remaining token.
word_count = {}
for text in tokens:
    for word in text:
        word_count[word] = word_count.get(word, 0) + 1

# Vocabulary ordered from the most to the least frequent token.
sorted_features = sorted(word_count, key=word_count.get, reverse=True)

print('#Num of features: ', len(sorted_features))
for word in sorted_features[:10]:
    print(f"count of '{word}': {word_count[word]}", end=', ')

#Num of features:  43030
count of 'film': 8935, count of 'one': 5791, count of 'movie': 5538, count of 'like': 3690, count of 'even': 2564, count of 'time': 2409, count of 'good': 2407, count of 'story': 2136, count of 'would': 2084, count of 'much': 2049, 

In [8]:
# Build the feature vocabulary from only the 1000 most frequent words
word_features = sorted_features[:1000]

In [9]:
# Convert a tokenized document into a count vector over the given features
def document_features(document, word_features):
    """Return how many times each feature word occurs in `document`.

    Args:
        document: iterable of tokens making up one document.
        word_features: words that define the positions of the vector.

    Returns:
        A list with one count per entry of `word_features`, in the same
        order; a feature word that never occurs in the document counts as 0.
    """
    tally = {}
    for token in document:
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1

    return [tally.get(feature, 0) for feature in word_features]

# Small worked example: 'two' occurs twice, 'couples' once, the rest never.
word_features_ex = ['one', 'two', 'teen', 'couples', 'solo']
doc_ex = ['two', 'two', 'couples']
print(document_features(doc_ex, word_features_ex))

[0, 2, 0, 1, 0]


In [11]:
# Vectorize every tokenized review over the shared feature vocabulary.
feature_sets = [document_features(doc, word_features) for doc in tokens]

# Show the first 20 (feature, count) pairs of the first review. Pairing the
# two slices with zip() avoids manual indexing and simply stops early, rather
# than raising IndexError, if fewer than 20 features are available.
for feature, count in zip(word_features[:20], feature_sets[0][:20]):
    print(f'({feature}, {count})', end=', ')

(film, 5), (one, 3), (movie, 6), (like, 3), (even, 3), (time, 0), (good, 2), (story, 0), (would, 1), (much, 0), (also, 1), (get, 3), (character, 1), (two, 2), (well, 1), (first, 0), (characters, 1), (see, 2), (way, 3), (make, 5), 

In [13]:
# Counts of the last 20 features (the least frequent of the selected words)
# in the first review's vector.
print(feature_sets[0][-20:])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## 4.3 사이킷런을 이용한 카운트 벡터 생성

### CountVectorizer

https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction  

## 4.4 한국어 텍스트의 카운트 벡터 변환


## 4.5 카운트 벡터의 활용

### 코사인 유사도(Cosine similarity)

## 4.6 TF-IDF로 성능을 높여보자