<a href="https://colab.research.google.com/github/sabumjung/Machine-Learning-Algorithm/blob/master/ch12_06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [0]:
# For reproducibility
np.random.seed(1000)

# The English stop-word list is an NLTK data package that is not bundled with
# the library. Both `stopwords.words` and `SnowballStemmer(ignore_stopwords=True)`
# need it, so fetch it once if it is missing; otherwise a fresh kernel fails
# with LookupError before any vectorizer is built.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# Tokens are runs of ASCII letters, digits and apostrophes
ret = RegexpTokenizer('[a-zA-Z0-9\']+')
# English stop words, held in a set for constant-time membership tests
sw = set(stopwords.words('english'))
# English Snowball stemmer; ignore_stopwords=True returns stop words unstemmed
ess = SnowballStemmer('english', ignore_stopwords=True)


def tokenizer(sentence):
    """Split a sentence into stemmed tokens, dropping English stop words.

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        The stem (via ``ess``) of every token matched by ``ret`` whose
        lower-cased form is not in the stop-word set ``sw``.
    """
    tokens = ret.tokenize(sentence)
    # NLTK's English stop-word list is lower-case, so compare on the
    # lower-cased token. Without this, capitalised stop words ('This', 'A')
    # slip through whenever the caller has not lower-cased the text first
    # (e.g. a vectorizer created with lowercase=False).
    return [ess.stem(t) for t in tokens if t.lower() not in sw]

In [22]:
# Create a corpus
corpus = [
    'This is a simple test corpus',
    'A corpus is a set of text documents',
    'We want to analyze the corpus and the documents',
    'Documents can be automatically tokenized'
]

# Create a count vectorizer (default word tokenizer, lower-casing enabled)
print('Count vectorizer:')
cv = CountVectorizer()

vectorized_corpus = cv.fit_transform(corpus)
# toarray() yields a plain ndarray; todense() returns the discouraged np.matrix
print(vectorized_corpus.toarray())

print('CV Vocabulary:')
print(cv.vocabulary_)

# Perform an inverse transformation.
# `vector` is a single document expressed over the 19-term vocabulary fitted
# above. inverse_transform expects a 2-D (n_samples, n_features) input, so the
# document is wrapped in a list to form a one-row batch; recent scikit-learn
# releases reject a bare 1-D vector.
vector = [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1]
print(cv.inverse_transform([vector]))

# Use a complete external tokenizer.
# token_pattern=None states explicitly that the default regex is unused and
# avoids the UserWarning scikit-learn emits when both are set.
print('CV with external tokenizer:')
cv = CountVectorizer(tokenizer=tokenizer, token_pattern=None)
vectorized_corpus = cv.fit_transform(corpus)
print(vectorized_corpus.toarray())

# Use an n-gram range equal to (1, 2): unigrams and bigrams of the stemmed tokens
print('CV witn n-gram range (1, 2):')
cv = CountVectorizer(tokenizer=tokenizer, token_pattern=None, ngram_range=(1, 2))
vectorized_corpus = cv.fit_transform(corpus)
print(vectorized_corpus.toarray())

print('N-gram range (1,2) vocabulary:')
print(cv.vocabulary_)

# Create a Tf-Idf vectorizer
print('Tf-Idf vectorizer:')
tfidfv = TfidfVectorizer()
vectorized_corpus = tfidfv.fit_transform(corpus)
print(vectorized_corpus.toarray())

print('Tf-Idf vocabulary:')
print(tfidfv.vocabulary_)

# Use n-gram range equal to (1, 2) and L2 normalization
print('Tf-Idf witn n-gram range (1, 2) and L2 normalization:')
tfidfv = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None,
                         ngram_range=(1, 2), norm='l2')
vectorized_corpus = tfidfv.fit_transform(corpus)
print(vectorized_corpus.toarray())

Count vectorizer:
[[0 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0]
 [0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0]
 [1 1 0 0 0 1 1 0 0 0 0 0 0 2 0 1 0 1 1]
 [0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0]]
CV Vocabulary:
{'this': 14, 'is': 7, 'simple': 10, 'test': 11, 'corpus': 5, 'set': 9, 'of': 8, 'text': 12, 'documents': 6, 'we': 18, 'want': 17, 'to': 15, 'analyze': 0, 'the': 13, 'and': 1, 'can': 4, 'be': 3, 'automatically': 2, 'tokenized': 16}
[array(['corpus', 'is', 'simple', 'test', 'this', 'want', 'we'],
      dtype='<U13')]
CV with external tokenizer:
[[0 0 1 0 0 1 1 0 0 0]
 [0 0 1 1 1 0 0 1 0 0]
 [1 0 1 1 0 0 0 0 0 1]
 [0 1 0 1 0 0 0 0 1 0]]
CV witn n-gram range (1, 2):
[[0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0]
 [0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 1 1 0 0 0]
 [1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1]
 [0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0]]
N-gram range (1,2) vocabulary:
{'simpl': 11, 'test': 13, 'corpus': 4, 'simpl test': 12, 'test corpus': 14, 'set': 9, 'text': 15, 'document': 7, 'corpus set': 6, '