In [112]:
import pandas as pd

### Word Counts with CountVectorizer

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Corpus for the CountVectorizer demo: a single sentence, held in a list
# because it is passed to fit/transform as a collection of documents.
text = [
    "The quick brown fox jumped over the lazy dog.",
]

In [3]:
# Create a bag-of-words (token count) transformer with default settings.
vectorizer = CountVectorizer()

In [4]:
# Tokenize the documents and build the vocabulary (token -> column index).
# As the last expression in the cell, the call's return value is echoed,
# which is the CountVectorizer repr with all of its default parameters.
vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [5]:
# Show the learned vocabulary: each token maps to its column index in the
# encoded matrix. With lowercase=True (the default shown in the repr),
# "The" and "the" collapse into a single entry.
print(vectorizer.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [6]:
# Encode the documents as a matrix of token counts: one row per document,
# one column per vocabulary entry.
vector = vectorizer.transform(text)

In [8]:
# Shape is (number of documents, vocabulary size).
print(vector.shape)

(1, 8)


In [9]:
# The encoded result is a SciPy sparse matrix rather than a dense array.
print(type(vector))

<class 'scipy.sparse.csr.csr_matrix'>


In [10]:
# Convert to a dense array for display. Columns follow the vocabulary
# indices, so the last column (index 7, "the") holds the count 2.
print(vector.toarray())

[[1 1 1 1 1 1 1 2]]


In [11]:
# Vocabulary repeated for reference: it gives the token behind each column
# of the count vector.
print(vectorizer.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


### Word Frequencies with TfidfVectorizer

#### How does tf-idf work?
Consider a document containing 100 words in which the word *cat* appears 3 times. The term frequency (tf) for *cat* is then 3 / 100 = 0.03. Now assume we have 10 million documents and the word *cat* appears in one thousand of them. The inverse document frequency (idf) is then log10(10,000,000 / 1,000) = 4. The tf-idf weight is the product of these two quantities: 0.03 * 4 = 0.12. Note that scikit-learn's `TfidfVectorizer` uses a smoothed, natural-log variant of idf and L2-normalizes each row by default, so the values it produces below will not match this textbook formula exactly.

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# Corpus for the TF-IDF demo: five short documents that share some words,
# so the terms end up with different document frequencies.
text = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog.",
    "The fox",
    "the the the",
    "dog",
]

In [13]:
# Create a TF-IDF transformer with default settings.
# Note: this rebinds `vectorizer`, replacing the CountVectorizer instance
# used in the previous section.
vectorizer = TfidfVectorizer()

In [53]:
# Tokenize the documents, build the vocabulary and learn the per-term IDF
# weights (exposed afterwards as `vectorizer.idf_`).
vectorizer.fit(text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [54]:
# Show the learned vocabulary (token -> column index).
print(vectorizer.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [55]:
# One IDF weight per vocabulary column. Terms that occur in more documents
# get lower weights: "the" (column 7) has the smallest value.
print(vectorizer.idf_)

[2.09861229 1.40546511 1.69314718 2.09861229 2.09861229 2.09861229
 2.09861229 1.18232156]


In [60]:
# Encode a single document (text[2], "The fox"). It is wrapped in a
# one-element list so that transform receives a collection of documents
# rather than a bare string.
vector = vectorizer.transform([text[2]])

In [61]:
# Shape is (1 document, 8 vocabulary terms).
print(vector.shape)

# Only the columns for "fox" (2) and "the" (7) are non-zero. The row is
# scaled to unit L2 norm (norm='l2' in the vectorizer settings).
print(vector.toarray())

(1, 8)
[[0.         0.         0.8198869  0.         0.         0.
  0.         0.57252551]]


### Hashing with HashingVectorizer

In [64]:
from sklearn.feature_extraction.text import HashingVectorizer 

# Corpus for the HashingVectorizer demo: one sentence, held in a list
# because it is passed to transform as a collection of documents.
text = [
    "The quick brown fox jumped over the lazy dog.",
]

In [65]:
# Create a hashing transformer with a fixed output width of 20 columns.
# No fit step is called: tokens are assigned to columns by hashing rather
# than through a learned vocabulary.
vectorizer = HashingVectorizer(n_features=20)

# Encode the document directly.
vector = vectorizer.transform(text)

In [66]:
# Shape is (1 document, 20 hashed features). The printed entries are signed
# and the row has unit L2 norm.
# NOTE(review): presumably due to scikit-learn's default alternate_sign=True
# and norm='l2' settings for HashingVectorizer; confirm against the
# installed version.
print(vector.shape)
print(vector.toarray())

(1, 20)
[[ 0.          0.          0.          0.          0.          0.33333333
   0.         -0.33333333  0.33333333  0.          0.          0.33333333
   0.          0.          0.         -0.33333333  0.          0.
  -0.66666667  0.        ]]


### Keras: Encoding one_hot

In [97]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence 

# The document to encode. Note that `text` is now a plain string, not a
# list of documents as in the scikit-learn sections.
text = "The quick brown fox jumped over the lazy dog."

In [98]:
# Estimate the vocabulary size as the number of distinct tokens in the
# document, then show the size followed by the token set (one per line).
words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size, words, sep="\n")

8
{'the', 'dog', 'brown', 'lazy', 'over', 'jumped', 'quick', 'fox'}


In [99]:
# Map each word to an integer by hashing into a space of
# round(vocab_size * 1.3) slots (10 for the 8-word vocabulary here).
# Despite the name, `one_hot` returns a list of integer word indices, not
# one-hot vectors. Distinct words can collide: the printed result contains
# repeated indices for different words.
# NOTE(review): the Keras docs describe the default hash as not stable
# across runs, so the exact integers may differ on re-execution; confirm.
result = one_hot(text, round(vocab_size*1.3))
print(result)

[2, 5, 1, 1, 2, 2, 2, 4, 7]


### Hash Encoding with hashing trick

In [78]:
from keras.preprocessing.text import hashing_trick 
from keras.preprocessing.text import text_to_word_sequence 

# The document to encode, as a plain string.
text = "The quick brown fox jumped over the lazy dog."

In [79]:
# Count the distinct tokens in the document to size the hash space, then
# show the count followed by the token set (one per line).
words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size, words, sep="\n")

8
{'the', 'dog', 'brown', 'lazy', 'over', 'jumped', 'quick', 'fox'}


In [80]:
# Map each word to an integer by hashing into a space of
# round(vocab_size * 1.3) slots (10 for the 8-word vocabulary here),
# using MD5 as the hash function. Distinct words can still collide.
# NOTE(review): the Keras docs describe 'md5' as stable across runs, unlike
# the default hash; confirm against the installed version.
result = hashing_trick(text, round(vocab_size*1.3), hash_function='md5')
print(result)

[6, 4, 1, 2, 7, 5, 6, 2, 6]


### Keras: Tokenizer API

In [141]:
from keras.preprocessing.text import Tokenizer 

# Five short documents used to fit the Keras tokenizer.
docs = [
    'Well done!',
    'Good work',
    'Great great effort',
    'nice work',
    'Excellent!',
]

In [142]:
# Create a tokenizer with default settings.
t = Tokenizer()

# Fit the tokenizer on the documents. This populates the statistics
# inspected in the following cells (word_docs, word_index, document_count).
t.fit_on_texts(docs)

In [143]:
# Mapping of each word to the number of documents it appears in (document
# frequency), not its total number of occurrences: 'great' occurs twice in
# one document and is counted once.
t.word_docs

defaultdict(int,
            {'done': 1,
             'well': 1,
             'work': 2,
             'good': 1,
             'effort': 1,
             'great': 1,
             'nice': 1,
             'excellent': 1})

In [144]:
# Mapping of each word to its integer index. Indices start at 1, and the
# words that occur more than once ('work', 'great') get the lowest indices.
t.word_index

{'work': 1,
 'great': 2,
 'well': 3,
 'done': 4,
 'good': 5,
 'effort': 6,
 'nice': 7,
 'excellent': 8}

In [145]:
# Number of documents the tokenizer was fit on.
t.document_count

5

In [148]:
# Encode the documents as a matrix with one row per document and one column
# per word index. Column 0 is all zeros because word indices start at 1.
# `mode` can be 'binary', 'count', 'tfidf' or 'freq'; with 'tfidf' the
# entries are float TF-IDF weights rather than integer codes.
encoded_docs = t.texts_to_matrix(docs, mode='tfidf')
print(encoded_docs)

[[0.         0.         0.         1.25276297 1.25276297 0.
  0.         0.         0.        ]
 [0.         0.98082925 0.         0.         0.         1.25276297
  0.         0.         0.        ]
 [0.         0.         2.12111209 0.         0.         0.
  1.25276297 0.         0.        ]
 [0.         0.98082925 0.         0.         0.         0.
  0.         1.25276297 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.25276297]]
