In [2]:
# 단어 빈도를 벡터화
from sklearn.feature_extraction.text import CountVectorizer

In [50]:
# Toy English corpus: each string is treated as one document.
data = [
    "I am a boy",
    "I like Linux",
    "The ML is so difficult to learn.",
    "But it is funny.",
]

In [52]:
# Build a bag-of-words vectorizer with default settings, then learn the
# vocabulary from the corpus and encode it as a document-term count matrix
# in a single fit_transform call.
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(data)

In [60]:
# Encode the corpus with the already-fitted vectorizer and convert the
# result to a dense array so it can be printed.
arr = vectorizer.transform(data).toarray()
# Rows: one per document.
# Columns: one per distinct token in the learned vocabulary.
print(arr.shape, arr, sep="\n")

(4, 14)
[[1 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 1 0 0 0 0]
 [0 0 0 1 0 1 0 1 0 0 1 1 1 1]
 [0 0 1 0 1 1 1 0 0 0 0 0 0 0]]


In [64]:
# The default token_pattern only matches tokens of two or more word
# characters, so one-letter words such as "I" and "a" are missing here.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
print(feature_names.shape)

['am' 'boy' 'but' 'difficult' 'funny' 'is' 'it' 'learn' 'like' 'linux'
 'ml' 'so' 'the' 'to']
(14,)


In [66]:
# Mapping from each learned token to its column index in the count matrix.
vectorizer.vocabulary_

{'am': 0,
 'boy': 1,
 'like': 8,
 'linux': 9,
 'the': 12,
 'ml': 10,
 'is': 5,
 'so': 11,
 'difficult': 3,
 'to': 13,
 'learn': 7,
 'but': 2,
 'it': 6,
 'funny': 4}

In [74]:
# Relax the token pattern so that single-character words are extracted too
# (the vocabulary now also contains 'a' and 'i').
vectorizer = CountVectorizer(
    token_pattern=r'(?u)\w+',
)
data_vectorized = vectorizer.fit_transform(data)
print(vectorizer.get_feature_names_out())

['a' 'am' 'boy' 'but' 'difficult' 'funny' 'i' 'is' 'it' 'learn' 'like'
 'linux' 'ml' 'so' 'the' 'to']


In [87]:
# Korean corpus: each string is one document (English gloss on the right).
data_ko = [
    '안녕하세요. 저의 이름은 송준우입니다.',   # "Hello. My name is Song Junwoo."
    '안녕? 내 친구의 이름은 이눅스다.',        # "Hi? My friend's name is Inux."
    '반가워 내 고양이의 이름은 포뇨다.',       # "Nice to meet you, my cat's name is Ponyo."
    '안녕! 내 동생은 강아지고 이름은 볼트다.',  # "Hi! My younger sibling is a puppy named Bolt."
]

In [85]:
# Apply the same regex-based tokenization to the Korean corpus. The pattern
# only splits on non-word characters, so each whitespace-delimited word is
# kept whole with its particle/ending attached (e.g. '이름은', '친구의').
vectorizer = CountVectorizer(
    token_pattern=r'(?u)\w+',
)
data_vectorized = vectorizer.fit_transform(data_ko)
print(vectorizer.get_feature_names_out())

['강아지고' '고양이의' '내' '동생은' '반가워' '볼트다' '송준우입니다' '안녕' '안녕하세요' '이눅스다' '이름은'
 '저의' '친구의' '포뇨다']


In [89]:
# Show the token -> column index mapping of the current vectorizer
# (printed on one line rather than as a pretty-printed dict).
print(vectorizer.vocabulary_)

{'안녕하세요': 8, '저의': 11, '이름은': 10, '송준우입니다': 6, '안녕': 7, '내': 2, '친구의': 12, '이눅스다': 9, '반가워': 4, '고양이의': 1, '포뇨다': 13, '동생은': 3, '강아지고': 0, '볼트다': 5}


In [95]:
!pip install konlpy # 또는 !curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.2-cp312-cp312-macosx_10_9_universal2.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading jpype1-1.5.2-cp312-cp312-macosx_10_9_universal2.whl (583 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m583.4/583.4 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.2 konlpy-0.6.0


In [99]:
# 한글 형태소 분석기
from konlpy.tag import Komoran

In [108]:
# Use Komoran's noun extractor as the tokenizer so that only nouns end up in
# the vocabulary instead of whole whitespace-delimited words.
komoran = Komoran()
vectorizer_ko = CountVectorizer(
    tokenizer=komoran.nouns,
    # token_pattern is ignored once a custom tokenizer is supplied; setting it
    # to None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" UserWarning.
    token_pattern=None,
)
vectorizer_ko.fit(data_ko)
print(vectorizer_ko.vocabulary_)

{'안녕하세요': 5, '이름': 7, '송': 4, '우': 6, '친구': 8, '고양이': 1, '포뇨': 9, '동생': 2, '강아지': 0, '볼트': 3}


In [112]:
from konlpy.tag import Okt

# Okt.morphs splits text into all morphemes, so the resulting vocabulary also
# contains particles, endings and punctuation marks (e.g. '은', '의', '.').
okt = Okt()
vectorizer_ko = CountVectorizer(
    tokenizer=okt.morphs,
    # token_pattern is ignored once a custom tokenizer is supplied; setting it
    # to None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" UserWarning.
    token_pattern=None,
)
vectorizer_ko.fit(data_ko)
print(vectorizer_ko.vocabulary_)

{'안녕하세요': 13, '.': 1, '저': 19, '의': 15, '이름': 17, '은': 14, '송': 11, '준우': 20, '입니다': 18, '안녕': 12, '?': 2, '내': 6, '친구': 21, '이눅스': 16, '다': 7, '반가워': 9, '고양이': 5, '포뇨': 22, '!': 0, '동생': 8, '강아지': 3, '고': 4, '볼트': 10}
