# Imports

In [1]:
import string

In [2]:
import nltk

# Words in Space

In [3]:
# Toy corpus of three short documents; it is the input passed to every
# vectorizer demonstrated in this notebook.
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]

In [4]:
def tokenize(text):
    """
    Lazily yield the stemmed, lowercased, non-punctuation tokens of ``text``.

    The text is lowercased and split with ``nltk.word_tokenize``; every token
    that survives the punctuation filter is reduced to its stem by NLTK's
    English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw document text.

    Yields
    ------
    str
        The stem of each token that is not made up solely of characters from
        ``string.punctuation``.
    """
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        # ``token in string.punctuation`` would be a substring test: it only
        # matches tokens that appear verbatim inside that constant, so
        # multi-character punctuation tokens such as "...", "``" or "''"
        # would slip through and be counted as words. Test every character.
        if all(char in string.punctuation for char in token):
            continue
        yield stem.stem(token)

## Frequency Vectors

### With NLTK

In [5]:
def nltk_frequency_vectorize(corpus):
    """
    Frequency encoding in the NLTK feature-dict style.

    Returns a lazy ``map`` that produces one ``defaultdict(int)`` per document
    of ``corpus``, mapping each token yielded by ``tokenize`` to the number of
    times it occurs in that document.
    """
    from collections import defaultdict

    def count_tokens(document):
        counts = defaultdict(int)
        for stem in tokenize(document):
            counts[stem] = counts[stem] + 1
        return counts

    return map(count_tokens, corpus)

In [6]:
# The vectorizer returns a lazy map; list() forces it so the per-document
# token counts are displayed.
list(nltk_frequency_vectorize(corpus))

[defaultdict(int,
             {'the': 2,
              'eleph': 1,
              'sneez': 1,
              'at': 1,
              'sight': 1,
              'of': 1,
              'potato': 1}),
 defaultdict(int,
             {'bat': 2,
              'can': 1,
              'see': 2,
              'via': 1,
              'echoloc': 1,
              'the': 1,
              'sight': 1,
              'sneez': 1}),
 defaultdict(int,
             {'wonder': 1,
              'she': 1,
              'open': 1,
              'the': 2,
              'door': 1,
              'to': 1,
              'studio': 1})]

### In Scikit-Learn

In [7]:
def sklearn_frequency_vectorize(corpus):
    """
    Frequency encoding with Scikit-Learn.

    Fits a ``CountVectorizer`` with default settings on the raw documents in
    ``corpus`` and returns the result of its ``fit_transform`` call.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    return CountVectorizer().fit_transform(corpus)

In [8]:
# Convert the result with todense() so the full count matrix is displayed.
sklearn_frequency_vectorize(corpus).todense()

matrix([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0],
        [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0, 1]])

### The Gensim way

In [9]:
def gensim_frequency_vectorize(corpus):
    """
    Frequency encoding with Gensim.

    Every document is tokenized with ``tokenize``; a ``Dictionary`` built from
    the tokenized documents supplies the token ids, and each document is
    returned as the output of ``doc2bow`` for its tokens.
    """
    from gensim.corpora import Dictionary

    documents = []
    for text in corpus:
        documents.append(list(tokenize(text)))

    lexicon = Dictionary(documents)
    return list(map(lexicon.doc2bow, documents))

In [10]:
# Display each document as its list of (token_id, count) pairs.
gensim_frequency_vectorize(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)],
 [(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)],
 [(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]

## One-Hot Encoding

### With NLTK

In [11]:
def nltk_one_hot_vectorize(corpus):
    """
    One-hot encoding in the NLTK feature-dict style.

    Returns a lazy ``map`` that produces one dict per document of ``corpus``
    in which every distinct token yielded by ``tokenize`` maps to ``True``.
    """
    def mark_present(document):
        # Repeated tokens collapse onto the same key.
        return dict.fromkeys(tokenize(document), True)

    return map(mark_present, corpus)

In [12]:
# The vectorizer returns a lazy map; list() forces it so the per-document
# {token: True} dicts are displayed.
list(nltk_one_hot_vectorize(corpus))

[{'the': True,
  'eleph': True,
  'sneez': True,
  'at': True,
  'sight': True,
  'of': True,
  'potato': True},
 {'bat': True,
  'can': True,
  'see': True,
  'via': True,
  'echoloc': True,
  'the': True,
  'sight': True,
  'sneez': True},
 {'wonder': True,
  'she': True,
  'open': True,
  'the': True,
  'door': True,
  'to': True,
  'studio': True}]

### In Scikit-Learn

In [13]:
def sklearn_one_hot_vectorize_v0(corpus):
    """
    One-hot encoding with Scikit-Learn, two-step variant.

    Counts terms with a default ``CountVectorizer``, converts the counts with
    ``toarray()`` and feeds that array through a default ``Binarizer``.
    Returns the binarized array.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer

    counts = CountVectorizer().fit_transform(corpus)
    return Binarizer().fit_transform(counts.toarray())

In [14]:
# v0 already returns an array, so the result is displayed as-is.
sklearn_one_hot_vectorize_v0(corpus)

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1]])

In [15]:
%%timeit
sklearn_one_hot_vectorize_v0(corpus)

124 µs ± 878 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [16]:
def sklearn_one_hot_vectorize_v1(corpus):
    """
    One-hot encoding with Scikit-Learn, single-step variant.

    Builds a ``CountVectorizer`` with ``binary=True`` and returns the result
    of its ``fit_transform`` call on ``corpus`` directly, with no separate
    binarization step.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    binary_counter = CountVectorizer(binary=True)
    return binary_counter.fit_transform(corpus)

In [17]:
# Convert the result with todense() for display; the recorded values are the
# same as those shown for the v0 variant.
sklearn_one_hot_vectorize_v1(corpus).todense()

matrix([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
        [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1]])

In [18]:
%%timeit
sklearn_one_hot_vectorize_v1(corpus)

83.7 µs ± 430 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### The Gensim way

In [19]:
def gensim_one_hot_vectorize(corpus):
    """
    One-hot encoding with Gensim.

    Every document is tokenized with ``tokenize`` and a ``Dictionary`` built
    from the tokenized documents supplies the token ids. Each document is then
    converted with ``doc2bow`` and the second element of every resulting pair
    is replaced by ``1``.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.

    Returns
    -------
    list of list of (token_id, 1) tuples
        One inner list per document, in corpus order.
    """
    import gensim

    # Keep the caller's ``corpus`` untouched and name each stage explicitly
    # instead of rebinding the parameter to different kinds of data.
    tokenized_corpus = [list(tokenize(doc)) for doc in corpus]
    id2word = gensim.corpora.Dictionary(tokenized_corpus)

    return [
        # Only the id of each pair is kept; the count is discarded.
        [(token_id, 1) for token_id, _count in id2word.doc2bow(doc)]
        for doc in tokenized_corpus
    ]

In [20]:
# Display each document as its list of (token_id, 1) pairs.
gensim_one_hot_vectorize(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)],
 [(6, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]

## Term Frequency–Inverse Document Frequency

### With NLTK

In [21]:
def nltk_tfidf_vectorize(corpus):
    """
    TF-IDF encoding with NLTK.

    Generator: the documents are tokenized with ``tokenize`` and wrapped in a
    ``TextCollection``; for each document a dict is yielded that maps every
    one of its tokens to ``TextCollection.tf_idf(token, document)``.
    """
    from nltk.text import TextCollection

    documents = [list(tokenize(text)) for text in corpus]
    collection = TextCollection(documents)

    for document in documents:
        scores = {}
        for term in document:
            scores[term] = collection.tf_idf(term, document)
        yield scores

In [22]:
# The function is a generator; list() drains it so every document's scores
# are displayed.
list(nltk_tfidf_vectorize(corpus))

[{'the': 0.0,
  'eleph': 0.13732653608351372,
  'sneez': 0.05068313851352055,
  'at': 0.13732653608351372,
  'sight': 0.05068313851352055,
  'of': 0.13732653608351372,
  'potato': 0.13732653608351372},
 {'bat': 0.21972245773362198,
  'can': 0.10986122886681099,
  'see': 0.21972245773362198,
  'via': 0.10986122886681099,
  'echoloc': 0.10986122886681099,
  'the': 0.0,
  'sight': 0.04054651081081644,
  'sneez': 0.04054651081081644},
 {'wonder': 0.13732653608351372,
  'she': 0.13732653608351372,
  'open': 0.13732653608351372,
  'the': 0.0,
  'door': 0.13732653608351372,
  'to': 0.13732653608351372,
  'studio': 0.13732653608351372}]

### In Scikit-Learn

In [23]:
def sklearn_tfidf_vectorize(corpus):
    """
    TF-IDF encoding with Scikit-Learn.

    Fits a ``TfidfVectorizer`` with default settings on the raw documents in
    ``corpus`` and returns the result of its ``fit_transform`` call.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    return TfidfVectorizer().fit_transform(corpus)

In [24]:
# Convert the result with todense() so the full TF-IDF matrix is displayed.
sklearn_tfidf_vectorize(corpus).todense()

matrix([[0.37867627, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.37867627, 0.37867627, 0.        , 0.37867627,
         0.        , 0.        , 0.28799306, 0.        , 0.37867627,
         0.        , 0.44730461, 0.        , 0.        , 0.        ],
        [0.        , 0.30251368, 0.30251368, 0.30251368, 0.        ,
         0.30251368, 0.        , 0.        , 0.        , 0.        ,
         0.60502736, 0.        , 0.23006945, 0.30251368, 0.        ,
         0.        , 0.17866945, 0.        , 0.30251368, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.36772387,
         0.        , 0.        , 0.        , 0.36772387, 0.        ,
         0.        , 0.36772387, 0.        , 0.        , 0.        ,
         0.36772387, 0.43436728, 0.36772387, 0.        , 0.36772387]])

### The Gensim way

In [25]:
def gensim_tfidf_vectorize(corpus):
    """
    TF-IDF encoding with Gensim.

    The documents are tokenized with ``tokenize`` and indexed by a
    ``Dictionary``. A ``TfidfModel`` is created from that dictionary with
    ``normalize=True`` and applied to the ``doc2bow`` form of each document.
    Returns the list of transformed documents, in corpus order.
    """
    import gensim

    documents = [list(tokenize(text)) for text in corpus]
    lexicon = gensim.corpora.Dictionary(documents)
    model = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)

    weighted = []
    for document in documents:
        bag_of_words = lexicon.doc2bow(document)
        weighted.append(model[bag_of_words])
    return weighted

In [26]:
# Display each document as its list of (token_id, weight) pairs.
gensim_tfidf_vectorize(corpus)

[[(0, 0.4837965208957426),
  (1, 0.4837965208957426),
  (2, 0.4837965208957426),
  (3, 0.4837965208957426),
  (4, 0.17855490118826325),
  (5, 0.17855490118826325)],
 [(4, 0.10992597952954358),
  (5, 0.10992597952954358),
  (7, 0.5956913654963344),
  (8, 0.2978456827481672),
  (9, 0.2978456827481672),
  (10, 0.5956913654963344),
  (11, 0.2978456827481672)],
 [(12, 0.408248290463863),
  (13, 0.408248290463863),
  (14, 0.408248290463863),
  (15, 0.408248290463863),
  (16, 0.408248290463863),
  (17, 0.408248290463863)]]

## Distributed Representation

### The Gensim way

In [27]:
def gensim_doc2vec_vectorize(corpus):
    """
    Distributed representation with Gensim's Doc2Vec.

    Every document is tokenized with ``tokenize`` and wrapped in a
    ``TaggedDocument`` whose single tag is ``'d'`` followed by the document's
    position in ``corpus``. A ``Doc2Vec`` model is constructed from those
    tagged documents with ``vector_size=5`` and ``min_count=0``, and the
    model's ``dv`` attribute is returned.
    """
    from gensim.models.doc2vec import TaggedDocument, Doc2Vec

    tagged = []
    for position, text in enumerate(corpus):
        tokens = list(tokenize(text))
        tagged.append(TaggedDocument(tokens, ['d{}'.format(position)]))

    return Doc2Vec(tagged, vector_size=5, min_count=0).dv

In [28]:
# Print the entry at index 0 of the returned document vectors.
print(gensim_doc2vec_vectorize(corpus)[0])

[-0.10468323 -0.11976369 -0.19807316  0.17087035  0.0711579 ]
