In [26]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

### Bag of Words Representation

In [15]:
# Toy corpus for the Bag of Words example: three short sentences with overlapping words.
sentences = [
    'this is a sample sample sentence',
    'this sentence is another example',
    'another example sentence',
]

In [16]:
# Learn the vocabulary from the corpus first, then count each vocabulary word per sentence.
vectorizer = CountVectorizer().fit(sentences)
bow = vectorizer.transform(sentences)

In [32]:
# Show the count matrix as a table: one row per sentence, one column per vocabulary word.
pd.DataFrame(
    data=bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

Unnamed: 0,another,example,is,sample,sentence,this
0,0,0,1,2,1,1
1,1,1,1,0,1,1
2,1,1,0,0,1,0


In [33]:
# Raw view of the same data: the dense count matrix on top, followed by the
# vocabulary that labels its columns.
print(bow.toarray(), vectorizer.get_feature_names_out(), sep="\n")

[[0 0 1 2 1 1]
 [1 1 1 0 1 1]
 [1 1 0 0 1 0]]
['another' 'example' 'is' 'sample' 'sentence' 'this']


In [None]:
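# Thus each sentence has been converted into a vector of word counts, with one entry per
# vocabulary word (in the order: another, example, is, sample, sentence, this).
# Note that 'a' is not in the vocabulary: CountVectorizer's default tokenizer only keeps
# tokens of two or more characters, so each vector has 6 entries rather than 7.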

# 'this is a sample sample sentence' => [0, 0, 1, 2, 1, 1]
# 'this sentence is another example' => [1, 1, 1, 0, 1, 1]
# 'another example sentence' => [1, 1, 0, 0, 1, 0]

### One Hot Encoding (word level)

In [None]:
# Toy corpus for the one-hot example, defined here so this section can be run on its own.
sentences = [
    'this is a sample sample sentence',
    'this sentence is another example',
    'another example sentence',
]

In [20]:
# Vocabulary for the encoder: every distinct whitespace-separated token in the corpus,
# in sorted order.
unique_words = sorted({word for sentence in sentences for word in sentence.split()})

In [36]:
# Build an encoder that returns dense arrays. The keyword for this was renamed
# from `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was
# removed in 1.4, so try the current spelling first and fall back to the old
# one for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# The encoder expects 2-D input (samples x features); each word is one sample
# with a single feature, hence the [[word], ...] wrapping.
encoder.fit([[word] for word in unique_words])

print(unique_words)
for i, sentence in enumerate(sentences):
    words = sentence.split()
    # One row per word of the sentence, one column per vocabulary word.
    encoded = encoder.transform([[word] for word in words])
    print(f"Sentence {i+1}")
    df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())
    # Put the original word in the first column so each one-hot row is labelled.
    df.insert(0, "Word", words)
    print(df)

['a', 'another', 'example', 'is', 'sample', 'sentence', 'this']
Sentence 1
       Word  x0_a  x0_another  x0_example  x0_is  x0_sample  x0_sentence  \
0      this   0.0         0.0         0.0    0.0        0.0          0.0   
1        is   0.0         0.0         0.0    1.0        0.0          0.0   
2         a   1.0         0.0         0.0    0.0        0.0          0.0   
3    sample   0.0         0.0         0.0    0.0        1.0          0.0   
4    sample   0.0         0.0         0.0    0.0        1.0          0.0   
5  sentence   0.0         0.0         0.0    0.0        0.0          1.0   

   x0_this  
0      1.0  
1      0.0  
2      0.0  
3      0.0  
4      0.0  
5      0.0  
Sentence 2
       Word  x0_a  x0_another  x0_example  x0_is  x0_sample  x0_sentence  \
0      this   0.0         0.0         0.0    0.0        0.0          0.0   
1  sentence   0.0         0.0         0.0    0.0        0.0          1.0   
2        is   0.0         0.0         0.0    1.0        0.0   

In [None]:
# this => [0, 0, 0, 0, 0, 0, 1]
# another => [0, 1, 0, 0, 0, 0, 0]
# sentence => [0, 0, 0, 0, 0, 1, 0]

Conceptually, the Bag of Words representation of a sentence can be viewed as the sum of the one-hot vectors of the words in that sentence (taken over the same vocabulary).

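We can build one vector for the entire sentence (so a dataset of sentences is a 2-D array: sentences × vocabulary), or one vector for each word in a sentence (so each sentence is a 2-D matrix and the dataset is a 3-D array: sentences × words × vocabulary). The first approach is used for classical ML models, whereas the second approach is used for deep learning models such as CNNs, RNNs and LSTMs.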

In [None]:
## Sentence level one hot vs token level one hot