In [4]:
# Toy corpus used throughout the notebook.
documents = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
]

# Normalise every sentence: lowercase it and drop the full stops so that
# splitting on whitespace later yields bare words.
processed_docs = list(map(lambda text: text.lower().replace(".", ""), documents))
processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

In [5]:
# Build the word -> index vocabulary. Indices start at 1 and are handed out in
# the order in which each word is first seen across the corpus.
vocab = {}

for doc in processed_docs:
    for word in doc.split():
        # setdefault leaves already-known words untouched, so a word keeps the
        # index it received on its first occurrence.
        vocab.setdefault(word, len(vocab) + 1)

# Number of distinct words seen (equals the largest index assigned).
count = len(vocab)

vocab

{'dog': 1, 'bites': 2, 'man': 3, 'eats': 4, 'meat': 5, 'food': 6}

In [9]:
def get_one_hot_vector(doc):
    """Return one one-hot vector per whitespace-separated word of ``doc``.

    Each vector has ``len(vocab)`` entries, where ``vocab`` is the module-level
    word -> index mapping whose indices start at 1. A word found in ``vocab``
    gets a single 1 at position ``vocab[word] - 1``; a word that is not in
    ``vocab`` yields an all-zero vector.
    """

    def encode(token):
        row = [0] * len(vocab)
        if token in vocab:
            # vocab indices are 1-based, list positions are 0-based.
            row[vocab[token] - 1] = 1
        return row

    return [encode(token) for token in doc.split()]

In [11]:
# Show the second pre-processed sentence next to its one-hot encoding.
sample_doc = processed_docs[1]
print(sample_doc)
get_one_hot_vector(sample_doc)

man bites dog


[[0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]]

In [18]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

S1 = 'dog bites man'
S2 = 'man bites dog'
S3 = 'dog eats meat'
S4 = 'man eats food'

# One row per sentence, one column per word position.
data = [sentence.split() for sentence in (S1, S2, S3, S4)]
# All words of all sentences in a single flat list, in reading order.
values = [token for row in data for token in row]

# Label encoding: every distinct word is mapped to an integer id.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Label Encoded:",integer_encoded)

# OneHotEncoder is fitted on the 4 x 3 table above, so each word *position*
# is treated as its own categorical feature. That is why the result has
# 8 columns (2 + 2 + 4 categories) instead of one column per vocabulary word.
onehot_encoder = OneHotEncoder()
onehot_sparse = onehot_encoder.fit_transform(data)
onehot_encoded = onehot_sparse.toarray()
print("OneHot Encoded:",onehot_encoded)

Label Encoded: [1 0 4 4 0 1 1 2 5 4 2 3]
OneHot Encoded: [[1. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]


In [27]:
from sklearn.feature_extraction.text import CountVectorizer 

# Bag of words: learn the vocabulary from the corpus and count how often each
# vocabulary word occurs in every document.
count_vect = CountVectorizer()
bow_rep = count_vect.fit_transform(processed_docs)

# Word -> column index mapping learned during fitting.
print('vocab', count_vect.vocabulary_)

# Count vector of the first document ("dog bites man"), as a dense array.
first_doc_counts = bow_rep[0].toarray()
print(first_doc_counts)

vocab {'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}
[[1 1 0 0 1 0]]


In [28]:
# binary=True records only whether a word is present, not how often it occurs.
count_vect = CountVectorizer(binary=True)
bow_rep_bin = count_vect.fit_transform(processed_docs)

# Encode an unseen sentence with the fitted vocabulary. Words outside the
# vocabulary contribute nothing, and the repeated "dog" is recorded once.
unseen_docs = ["dog and dog are friends"]
temp = count_vect.transform(unseen_docs)
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Bow representation for 'dog and dog are friends': [[0 1 0 0 0 0]]


In [29]:
# Bag of n-grams: the vocabulary now holds unigrams, bigrams and trigrams.
count_vect = CountVectorizer(ngram_range=(1,3))
bow_rep = count_vect.fit_transform(processed_docs)
print("Our vocabulary: ", count_vect.vocabulary_)

# Encode an unseen sentence. Only n-grams already in the vocabulary are
# counted, so here just the unigram "dog" (seen twice) is non-zero.
unseen_docs = ["dog and dog are friends"]
temp = count_vect.transform(unseen_docs)
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Our vocabulary:  {'dog': 3, 'bites': 0, 'man': 12, 'dog bites': 4, 'bites man': 2, 'dog bites man': 5, 'man bites': 13, 'bites dog': 1, 'man bites dog': 14, 'eats': 8, 'meat': 17, 'dog eats': 6, 'eats meat': 10, 'dog eats meat': 7, 'food': 11, 'man eats': 15, 'eats food': 9, 'man eats food': 16}
Bow representation for 'dog and dog are friends': [[0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [30]:
######## TF IDF ########## 

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the pre-processed corpus. fit_transform learns the
# vocabulary and IDF weights and returns the TF-IDF matrix of the corpus.
tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_docs)
print(tfidf.idf_) #IDF for all words in the vocabulary
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed output a plain Python list of strings.
print(tfidf.get_feature_names_out().tolist()) #All words in the vocabulary.

# Encode an unseen sentence with the fitted model; words that are not in the
# learned vocabulary ("and", "are", "friends") are ignored.
temp = tfidf.transform(["dog and man are friends"])
print("Tfidf representation for 'dog and man are friends':\n", temp.toarray())

[1.51082562 1.22314355 1.51082562 1.91629073 1.22314355 1.91629073]
['bites', 'dog', 'eats', 'food', 'man', 'meat']
Tfidf representation for 'dog and man are friends':
 [[0.         0.70710678 0.         0.         0.70710678 0.        ]]
