In [23]:
# Load libraries
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [24]:
# Training corpus: three short example documents
train = [
    'I love India. India!',
    'India is the best',
    'India beats USA in democracy',
]

In [25]:
# Build a bag-of-words vectorizer with default settings
vect = CountVectorizer()

# Learn the vocabulary from the training documents; as the last expression of
# the cell, the fitted vectorizer returned by fit() is also displayed
vect.fit(train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [26]:
# Print the learned vocabulary: a dict mapping each lowercased token to its
# column index in the document-term matrix
print(vect.vocabulary_)

{'love': 6, 'india': 4, 'is': 5, 'the': 7, 'best': 1, 'beats': 0, 'usa': 8, 'in': 3, 'democracy': 2}


In [27]:
# Get the feature names (the vocabulary terms ordered by column index).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
# .tolist() turns the returned array into a plain list of str for printing.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(feature_names)

['beats', 'best', 'democracy', 'in', 'india', 'is', 'love', 'the', 'usa']


In [28]:
# Encode the training documents as a document-term matrix using the
# vocabulary already learned by `vect`
train_dtm = vect.transform(train)

# The result is a sparse matrix: show its type first, then its stored entries
print(type(train_dtm), train_dtm, sep='\n')

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 4)	2
  (0, 6)	1
  (1, 1)	1
  (1, 4)	1
  (1, 5)	1
  (1, 7)	1
  (2, 0)	1
  (2, 2)	1
  (2, 3)	1
  (2, 4)	1
  (2, 8)	1


In [29]:
# Convert the sparse matrix to a dense array: one row per training document,
# one column per vocabulary term, each entry the term's count in that document
train_dtm.toarray()

array([[0, 0, 0, 0, 2, 0, 1, 0, 0],
       [0, 1, 0, 0, 1, 1, 0, 1, 0],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]], dtype=int64)

In [30]:
# Prediction

# A single unseen document used to try out the vectorizer on new text
test = [
    'India is China super great Noble Cause tfidf',
]

In [31]:
# Only transform (do not re-fit) the test document, so that it is encoded
# against the vocabulary learned from the training data
test_dtm = vect.transform(test)

In [32]:
# Show the type and dense form of the encoded test document
print(type(test_dtm))
print(test_dtm.toarray())

# Print the column labels for comparison with the row above.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(feature_names)

<class 'scipy.sparse.csr.csr_matrix'>
[[0 0 0 0 1 1 0 0 0]]
['beats', 'best', 'democracy', 'in', 'india', 'is', 'love', 'the', 'usa']


In [11]:
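# Binary mode: record only whether each term is present (1) or absent (0) in a document, instead of how many times it occurs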

In [33]:
# Build a bag-of-words vectorizer in binary mode (binary=True); this rebinds
# the name `vect`
vect = CountVectorizer(binary=True)

# Learn the vocabulary from the training documents; as the last expression of
# the cell, the fitted vectorizer returned by fit() is also displayed
vect.fit(train)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [34]:
# Print the learned vocabulary: a dict mapping each lowercased token to its
# column index in the document-term matrix
print(vect.vocabulary_)

{'love': 6, 'india': 4, 'is': 5, 'the': 7, 'best': 1, 'beats': 0, 'usa': 8, 'in': 3, 'democracy': 2}


In [35]:
# Get the feature names (the vocabulary terms ordered by column index).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
# .tolist() turns the returned array into a plain list of str for printing.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(feature_names)

['beats', 'best', 'democracy', 'in', 'india', 'is', 'love', 'the', 'usa']


In [36]:
# Encode the training documents as a document-term matrix using the
# vocabulary already learned by `vect`
train_dtm = vect.transform(train)

# The result is a sparse matrix: show its type first, then its stored entries
print(type(train_dtm), train_dtm, sep='\n')

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 4)	1
  (0, 6)	1
  (1, 1)	1
  (1, 4)	1
  (1, 5)	1
  (1, 7)	1
  (2, 0)	1
  (2, 2)	1
  (2, 3)	1
  (2, 4)	1
  (2, 8)	1


In [37]:
# Convert the sparse matrix to a dense array: one row per training document,
# one column per vocabulary term
train_dtm.toarray()

array([[0, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 1, 1, 0, 1, 0],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]], dtype=int64)

In [17]:
# Running the example below encodes the sample document as a 20-element sparse vector.

# By default, the values of the encoded document are normalized word counts in the
# range of -1 to 1, but they can be made simple integer counts by changing the default configuration.

In [38]:
# Documents to encode (a single sentence here)
text = [
    "The quick brown fox jumped over the lazy dog.",
]


In [39]:
# Create the hashing vectorizer; n_features=20 fixes the width of every
# encoded vector at 20 columns
vectorizer = HashingVectorizer(n_features=20)


In [40]:
# Encode the document; transform() is called directly, with no preceding fit()
vector = vectorizer.transform(text)


In [41]:
# Summarize the encoded vector: its shape first, then its dense values
print(vector.shape, vector.toarray(), sep='\n')

(1, 20)
[[ 0.          0.          0.          0.          0.          0.33333333
   0.         -0.33333333  0.33333333  0.          0.          0.33333333
   0.          0.          0.         -0.33333333  0.          0.
  -0.66666667  0.        ]]


In [42]:
# Four short example documents.
# The leading "..." REPL continuation prompts pasted with this literal were
# removed: IPython silently strips them, but they are a SyntaxError in plain
# Python (for example when the notebook is exported to a script).
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Hash every document in the corpus into 2**4 = 16 feature columns
vectorizer = HashingVectorizer(n_features=2**4)
X = vectorizer.fit_transform(corpus)

# Show the matrix shape first, then its dense values
print(X.shape, X.toarray(), sep='\n')

(4, 16)
[[-0.57735027  0.          0.          0.          0.          0.
   0.          0.         -0.57735027  0.          0.          0.
   0.          0.57735027  0.          0.        ]
 [-0.81649658  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.40824829
   0.          0.40824829  0.          0.        ]
 [ 0.          0.          0.          0.         -0.70710678  0.70710678
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [-0.57735027  0.          0.          0.          0.          0.
   0.          0.         -0.57735027  0.          0.          0.
   0.          0.57735027  0.          0.        ]]
