Traditional Methods of Word Embedding

*   One-Hot Encoding
*   Bag of Words (BoW)
*   Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
# One-hot encoding
def one_hot_encode(text):
    """Encode every whitespace-separated word of ``text`` as a one-hot vector.

    Tokenisation is a plain ``str.split()``: no lowercasing and no
    punctuation stripping, so ``"Cat"`` and ``"cat"`` are distinct words.

    Args:
        text: The input string to encode.

    Returns:
        A 3-tuple ``(one_hot_encoded, word_to_index, vocabulary)``:

        * ``one_hot_encoded`` - list with one vector (a list of ints) per
          word of ``text``, in order of appearance; each vector has
          ``len(vocabulary)`` entries and a single ``1``.
        * ``word_to_index`` - dict mapping each distinct word to the position
          of its ``1``; indices follow the sorted order of the words.
        * ``vocabulary`` - set of the distinct words.

        For empty or whitespace-only ``text`` this is ``([], {}, set())``.
    """
    words = text.split()
    vocabulary = set(words)
    # Number the words in sorted order rather than in set-iteration order:
    # iteration order of a set of strings changes from one interpreter run to
    # the next (string hash randomisation), which made the indices - and
    # therefore every vector - irreproducible.
    word_to_index = {word: i for i, word in enumerate(sorted(vocabulary))}
    vector_size = len(word_to_index)
    one_hot_encoded = []
    for word in words:
        one_hot_vector = [0] * vector_size
        one_hot_vector[word_to_index[word]] = 1
        one_hot_encoded.append(one_hot_vector)
    return one_hot_encoded, word_to_index, vocabulary

# Sample sentence; "the" and "in" repeat, so the vocabulary is smaller than
# the number of words.
example_text = "cat in the hat dog on the mat bird in the tree"

# Unpack the per-word vectors, the word -> column lookup, and the vocabulary.
one_hot_encoded, word_to_index, vocabulary = one_hot_encode(example_text)

print("Vocabulary:", vocabulary)
print("Word to Index Mapping:", word_to_index)
print("One-Hot Encoded Matrix:")
# Pair each word of the sentence with the vector produced for it.
for token, vector in zip(example_text.split(), one_hot_encoded):
    print(f"{token}: {vector}")

Vocabulary: {'dog', 'cat', 'the', 'tree', 'mat', 'hat', 'bird', 'in', 'on'}
Word to Index Mapping: {'dog': 0, 'cat': 1, 'the': 2, 'tree': 3, 'mat': 4, 'hat': 5, 'bird': 6, 'in': 7, 'on': 8}
One-Hot Encoded Matrix:
cat: [0, 1, 0, 0, 0, 0, 0, 0, 0]
in: [0, 0, 0, 0, 0, 0, 0, 1, 0]
the: [0, 0, 1, 0, 0, 0, 0, 0, 0]
hat: [0, 0, 0, 0, 0, 1, 0, 0, 0]
dog: [1, 0, 0, 0, 0, 0, 0, 0, 0]
on: [0, 0, 0, 0, 0, 0, 0, 0, 1]
the: [0, 0, 1, 0, 0, 0, 0, 0, 0]
mat: [0, 0, 0, 0, 1, 0, 0, 0, 0]
bird: [0, 0, 0, 0, 0, 0, 1, 0, 0]
in: [0, 0, 0, 0, 0, 0, 0, 1, 0]
the: [0, 0, 1, 0, 0, 0, 0, 0, 0]
tree: [0, 0, 0, 1, 0, 0, 0, 0, 0]


In [None]:
# Bag of Words (BoW)

from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: four short sentences that share most of their words.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Learn the vocabulary and count term occurrences in a single call.
# X is a sparse document-term matrix: one row per document, one column per
# vocabulary term.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Column labels for X, in column order.
feature_names = vectorizer.get_feature_names_out()

# Densify only for display; the corpus is tiny.
print("Bag-of-Words Matrix:", X.toarray(), sep="\n")
print("Vocabulary (Feature Names):", feature_names)

Bag-of-Words Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Vocabulary (Feature Names): ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [None]:
#Term frequency-inverse document frequency (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# Two unrelated sentences: no term is shared between them.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
]

# Fit the vocabulary/IDF weights and produce the sparse TF-IDF matrix
# (one row per document, one column per vocabulary term).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# Start with one empty dict per document so that a document without any
# scored term still gets a "Document N:" header below.
tfidf_values = {doc_index: {} for doc_index in range(tfidf_matrix.shape[0])}

# Read the stored (row, column, value) triples in one pass instead of doing a
# separate scalar lookup on the sparse matrix for every term, which is slow.
# Converting to COO keeps the row-by-row storage order, so terms are listed in
# the same order as before.
tfidf_coo = tfidf_matrix.tocoo()
for doc_index, term_index, tfidf_value in zip(
    tfidf_coo.row, tfidf_coo.col, tfidf_coo.data
):
    # Skip explicitly stored zeros so only terms with a non-zero score are kept.
    if tfidf_value != 0:
        tfidf_values[int(doc_index)][feature_names[term_index]] = tfidf_value

# Report the per-document scores; documents are numbered from 1 for display.
for doc_index, values in tfidf_values.items():
    print(f"Document {doc_index + 1}:")
    for word, tfidf_value in values.items():
        print(f"{word}: {tfidf_value}")
    print("\n")

Document 1:
the: 0.6030226891555273
quick: 0.30151134457776363
brown: 0.30151134457776363
fox: 0.30151134457776363
jumps: 0.30151134457776363
over: 0.30151134457776363
lazy: 0.30151134457776363
dog: 0.30151134457776363


Document 2:
journey: 0.3535533905932738
of: 0.3535533905932738
thousand: 0.3535533905932738
miles: 0.3535533905932738
begins: 0.3535533905932738
with: 0.3535533905932738
single: 0.3535533905932738
step: 0.3535533905932738


