# Word Counts with CountVectorizer

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Toy corpus: a list containing a single document (one sentence).
text = ["The quick brown fox jumped over the lazy dog."]

# Create the vectorizer with its default settings.
vectorizer = CountVectorizer()

# Tokenize the corpus and build the vocabulary (token -> column index).
# fit() is the last expression in the cell, so the fitted estimator's repr is
# what gets displayed as the cell output.
vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [4]:
# Show the learned vocabulary: a dict mapping each lowercased token to the
# column index it occupies in the encoded vectors.
print(vectorizer.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [5]:
# Encode the corpus with the fitted vocabulary: one row per document,
# one column per vocabulary entry.
vector = vectorizer.transform(text)

In [6]:
# Summarise the encoded document, one item per output line:
#   1. shape  -> (number of documents, vocabulary size)
#   2. type   -> the result is a SciPy sparse matrix
#   3. values -> dense view of the counts ('the' occurs twice)
print(vector.shape, type(vector), vector.toarray(), sep="\n")

(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]


In [8]:
# Encode a new document with the vocabulary fitted earlier.
# 'puppy' was not seen during fit, so it contributes nothing; only 'the'
# (column 7) is counted.
text2 = ['the puppy']
vector = vectorizer.transform(text2)
print(vector.toarray())

[[0 0 0 0 0 0 0 1]]


# Word Frequencies with TfidfVectorizer

- One issue with simple word counts is that common words like 'the' get large counts even though they carry little meaning in the encoded vectors.

## TFIDF

- `Term Frequency` - summarizes how often a given word appears within a document.
- `Inverse Document Frequency` - downscales words that appear a lot across documents. Words such as 'the' occur in almost every document, so their weight is inverted (scaled down) to let the more interesting words stand out.


TF-IDF weighting can also be used together with `CountVectorizer`.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three-document corpus: 'the' appears in every document, 'dog' and 'fox'
# in two each, and the remaining words only in the first.
text = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog.",
    "The fox",
]

In [11]:
# Rebind `vectorizer` to a TF-IDF vectorizer (default settings) and learn the
# vocabulary and per-term IDF weights from the three-document corpus.
# fit() is the last expression, so the fitted estimator's repr is displayed.
vectorizer = TfidfVectorizer()
vectorizer.fit(text)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
# First line: token -> column index mapping.
# Second line: the IDF weight of each column, in the same index order;
# terms that occur in more documents get a lower weight.
print(vectorizer.vocabulary_, vectorizer.idf_, sep="\n")

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
[ 1.69314718  1.28768207  1.28768207  1.69314718  1.69314718  1.69314718
  1.69314718  1.        ]


The value 1 at index 7 is the IDF of the word 'the' — the lowest score here, because 'the' appears in every document.

In [14]:
# Encode only the first document; transform() expects a collection of
# documents, hence the single-element list.
vector = vectorizer.transform([text[0]])

# Print the shape, then the final TF-IDF scores as a dense array.
print(vector.shape, vector.toarray(), sep="\n")

(1, 8)
[[ 0.36388646  0.27674503  0.27674503  0.36388646  0.36388646  0.36388646
   0.36388646  0.42983441]]


# Hashing with HashingVectorizer

With the two vectorizers above, the vocabulary can become very large. Hashing each word to an integer instead keeps the size of the encoding small.

Advantage
* no vocabulary required.

Disadvantage
* it's a one-way hash so we cannot refer back to the original word

In [15]:
from sklearn.feature_extraction.text import HashingVectorizer

In [16]:
# Rebind `text` to a single-document corpus for the hashing example.
text = ["The quick brown fox jumped over the lazy dog."]

In [17]:
# n_features is the range of the hash function, i.e. the fixed length of
# every encoded vector. 20 is an arbitrary, small value chosen for this demo.
vectorizer = HashingVectorizer(n_features=20)

In [18]:
# Encode the document directly; transform() is called without a prior fit()
# because the hashing approach needs no learned vocabulary.
vector = vectorizer.transform(text)

In [19]:
# Print the shape (1 document x 20 hash buckets), then the dense values.
# NOTE(review): the values are signed fractions rather than raw counts,
# presumably from the vectorizer's default normalisation and signed hashing;
# confirm against the HashingVectorizer documentation.
print(vector.shape, vector.toarray(), sep="\n")

(1, 20)
[[ 0.          0.          0.          0.          0.          0.33333333
   0.         -0.33333333  0.33333333  0.          0.          0.33333333
   0.          0.          0.         -0.33333333  0.          0.
  -0.66666667  0.        ]]
