## Bag of Words can be build in three steps:

1. Split and cleaning data ( preprocessing data )
2. Create the vocab, make a list of words in the document.
2. Create a document vector and score the words in this document using a fixed length document representation
2. The score can be by:
        * Count
        * TF-IDF
        * Frequency
        * binary

In [1]:
text = '''It was the best of times,
it was the age of wisdom,
it's the worst of times,
it was a beautiful mess'''

In [2]:
tokens = text.split()
print(tokens)

['It', 'was', 'the', 'best', 'of', 'times,', 'it', 'was', 'the', 'age', 'of', 'wisdom,', "it's", 'the', 'worst', 'of', 'times,', 'it', 'was', 'a', 'beautiful', 'mess']


In [3]:
from collections import Counter

Vocab = Counter()  
Vocab.update(tokens) # counter and update the number of words in the vocabulary
Vocab = set(Vocab)
print(Vocab)
print(len(Vocab))

{'wisdom,', 'age', 'best', 'it', 'worst', 'mess', 'beautiful', "it's", 'was', 'a', 'It', 'of', 'times,', 'the'}
14


In [4]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(Vocab)



Using TensorFlow backend.


In [5]:
text1 = ['it was the only thing to do']
modes = ['count', 'tfidf', 'binary', 'freq'] # the four score methods used in bag of words

for mode in modes:
    text_vec = tokenizer.texts_to_matrix(text1, mode = mode)   # create the document vector
    print(f'mode: {mode} ')
    print(text_vec)
    print()
    

mode: count 
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]]

mode: tfidf 
[[0.         1.73460106 0.         0.         0.         0.
  0.         0.         0.         2.07944154 0.         0.
  0.         2.07944154]]

mode: binary 
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]]

mode: freq 
[[0.         0.33333333 0.         0.         0.         0.
  0.         0.         0.         0.33333333 0.         0.
  0.         0.33333333]]



In [6]:
# For using a bag of words if n greater than 1:

from itertools import combinations

vocab2 = Counter()

vocab2.update(Counter(combinations(tokens, 2)))  # 2-gram 

print(set(vocab2))

{('of', 'worst'), ('wisdom,', 'beautiful'), ("it's", 'the'), ('worst', 'times,'), ('times,', 'wisdom,'), ("it's", 'times,'), ('it', 'the'), ('age', 'was'), ('times,', 'beautiful'), ('best', 'times,'), ('age', 'a'), ('of', 'a'), ('worst', 'mess'), ('of', 'was'), ('best', 'the'), ('of', 'age'), ('it', 'beautiful'), ('times,', "it's"), ('was', 'beautiful'), ('was', 'best'), ('age', 'worst'), ('wisdom,', 'times,'), ('worst', 'beautiful'), ('worst', 'of'), ('the', 'times,'), ('best', 'age'), ('was', "it's"), ('the', 'worst'), ('was', 'mess'), ('It', 'times,'), ('it', 'mess'), ('of', 'the'), ('worst', 'it'), ('wisdom,', 'of'), ('times,', 'times,'), ('It', 'wisdom,'), ('of', 'beautiful'), ('wisdom,', 'worst'), ('the', 'wisdom,'), ('best', 'mess'), ('the', 'of'), ('it', 'age'), ('of', 'it'), ('the', 'a'), ('wisdom,', 'mess'), ('it', 'of'), ('best', 'worst'), ('the', 'age'), ('wisdom,', 'a'), ('best', 'of'), ('best', 'was'), ('it', 'worst'), ("it's", 'beautiful'), ('best', 'it'), ('of', 'times,