In [1]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

!pip install konlpy
import konlpy
from konlpy.tag import Okt, Kkma, Mecab

import re

import sklearn
from sklearn.feature_extraction.text import CountVectorizer


# Print the installed version of each library imported above (nltk, konlpy, sklearn)
print(f'\n\n>>> nltk : {nltk.__version__}  |  konlpy : {konlpy.__version__}  |  sklearn : {sklearn.__version__}')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 6.0MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 8.2MB/s 
[?25hCollecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.

## 한글 Bag of words
- Bag of Words : 단어들의 순서는 전혀 고려하지 않고, 단어들의 출현 빈도(frequency)에만 집중하는 텍스트 데이터의 수치화 표현 방법
- BoW를 만드는 과정<br>
  1) 각 단어에 고유한 정수 인덱스 부여
  2) 각 인덱스의 위치에 단어 토큰의 등장 횟수를 기록한 벡터 생성

In [2]:
sent1 = "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다."
sent2 = "소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다."

# Preprocessing: strip periods with a regular expression.
# The pattern is a raw string so the backslash reaches the regex engine intact;
# a plain "\." is an invalid string escape that newer Python versions warn about.
sent1 = re.sub(r"\.", "", sent1)
sent2 = re.sub(r"\.", "", sent2)

# Tokenize both sentences with the Okt morphological analyzer
okt = Okt()
tokens1, tokens2 = (okt.morphs(sentence) for sentence in (sent1, sent2))

# Assign an index to each distinct token and count its occurrences
def get_bow(tokens):
  """Build a bag-of-words representation from an iterable of tokens.

  Args:
    tokens: iterable of hashable tokens (e.g. a list of strings).

  Returns:
    A ``(word2index, bow)`` tuple. ``word2index`` maps every distinct token
    to an integer index assigned in order of first appearance, and
    ``bow[i]`` is the number of times the token with index ``i`` occurred.
  """
  word2index = {}
  bow = []
  for token in tokens:
    position = word2index.get(token)
    if position is None:
      # First sighting: the next free index equals the current vector length
      word2index[token] = len(bow)
      bow.append(1)
    else:
      bow[position] += 1
  return word2index, bow


# Show the token -> index map and the count vector for each tokenized sentence
for tmp_tokens in (tokens1, tokens2):
  tmp_dict, tmp_list = get_bow(tmp_tokens)
  print('word2index :', tmp_dict)
  # end='\n\n' leaves one blank line between sentences
  print('bow :', tmp_list, end='\n\n')

word2index : {'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9}
bow : [1, 2, 1, 1, 2, 1, 1, 1, 1, 1]

word2index : {'소비자': 0, '는': 1, '주로': 2, '소비': 3, '하는': 4, '상품': 5, '을': 6, '기준': 7, '으로': 8, '물가상승률': 9, '느낀다': 10}
bow : [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]



## 영어 Bag of words (CountVectorizer)
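- 사이킷런의 CountVectorizer는 기본 설정에서 소문자로 변환한 뒤 정규 표현식 `(?u)\b\w\w+\b`로 토큰을 추출함. 즉 구두점을 구분자로 취급하고 2글자 이상의 단어만 남기는 낮은 수준의 토큰화만 지원 (아래 예시에서 한 글자 단어 `I`가 빠지는 이유)
- 형태소 분석 없이 이런 규칙으로만 단어를 자르기 때문에, 조사·어미가 단어에 붙는 한글에는 그대로 적용하기 무리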

In [3]:
# Bag of words for a single English document with a default CountVectorizer
text = ['you know I want your love. because I love you.']
vectorizer = CountVectorizer()
vectorizer.fit(text)
bow_arr = vectorizer.transform(text)

# Dense count matrix (converted from CSR), then the token -> column index map
print(bow_arr.toarray(), vectorizer.vocabulary_, sep='\n')

[[1 1 2 1 2 1]]
{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}


In [4]:
# Bag of words with a hand-written stop-word list
text = ["Family is not an important thing. It's everything."]
vectorizer = CountVectorizer(stop_words=['the', 'a', 'an', 'is', 'not'])
vectorizer.fit(text)
bow_arr = vectorizer.transform(text)

# Dense count matrix (converted from CSR), then the token -> column index map
print(bow_arr.toarray(), vectorizer.vocabulary_, sep='\n')

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [5]:
# Bag of words with scikit-learn's built-in English stop-word list.
# The sklearn docs caution: "There are several known issues with 'english'
# and you should consider an alternative".
text = ["Family is not an important thing. It's everything."]
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(text)
bow_arr = vectorizer.transform(text)

# Dense count matrix (converted from CSR), then the token -> column index map
print(bow_arr.toarray(), vectorizer.vocabulary_, sep='\n')

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


In [6]:
# Bag of words with NLTK's English stop-word list
text = ["Family is not an important thing. It's everything."]
sw = stopwords.words('english')
print(len(sw))  # number of entries in the stop-word list

vectorizer = CountVectorizer(stop_words=sw)
vectorizer.fit(text)
bow_arr = vectorizer.transform(text)

# Dense count matrix (converted from CSR), then the token -> column index map
print(bow_arr.toarray(), vectorizer.vocabulary_, sep='\n')

179
[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}
