#### Bag of Words Implementation

Parts
- Tokenize the sentences and get the vocab size
- Create the feature matrix with zeros and word-position mapping in dict
- Convert each tokenized sentence into BoW array

##### Step-1 Tokenize

In [5]:
from nltk.tokenize import word_tokenize
import numpy as np

data = ['She loves pizza, pizza is delicious.','She is a good person.','good people are the best.']

vocab = set()
def tokenize(doc):
    sent = []
    tokens = word_tokenize(doc)
    for i in tokens:
        if i.isalnum():
            sent.append(i.lower())
            vocab.add(i.lower())
    return sent

corpus = []
for i in data:
    corpus.append(tokenize(i))

In [11]:
vocab = list(vocab)
vocab

['delicious',
 'people',
 'pizza',
 'are',
 'is',
 'person',
 'best',
 'good',
 'loves',
 'the',
 'she',
 'a']

In [7]:
corpus

[['she', 'loves', 'pizza', 'pizza', 'is', 'delicious'],
 ['she', 'is', 'a', 'good', 'person'],
 ['good', 'people', 'are', 'the', 'best']]

#### Step -2 Feature Matrix and mapping dict

In [14]:
# mapping
mapping = {}

for i in range(0,len(vocab)):
    mapping[vocab[i]] = i

mapping

{'delicious': 0,
 'people': 1,
 'pizza': 2,
 'are': 3,
 'is': 4,
 'person': 5,
 'best': 6,
 'good': 7,
 'loves': 8,
 'the': 9,
 'she': 10,
 'a': 11}

In [15]:
#feature matrix is just a stacking of bow of each sentences
bow = np.zeros((len(corpus), len(vocab)))
bow

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

#### Step 3 Fill feature matrix with BoW count

In [16]:
def BoW(corpus, bow, mapping):
    
    for i in range(0,len(corpus)):
        for j in corpus[i]:
            bow[i][mapping[j]] +=1
    
    return bow 

bow = BoW(corpus, bow, mapping)
bow            

array([[1., 0., 2., 0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1.],
       [0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0.]])