##One Hot Encoding

In [1]:
def one_hot_encoding(text):
  words = text.split()
  vocabulary = set(words)
  word_to_index = {word: index for index, word in enumerate(vocabulary)}
  one_hot_encoded = []
  for word in words:
    one_hot_vector = [0] * len(vocabulary)
    one_hot_vector[word_to_index[word]] = 1
    one_hot_encoded.append(one_hot_vector)
  return one_hot_encoded, word_to_index, vocabulary

In [2]:
sample_text = "This is a sample text for one-hot encoding."
one_hot_encoded, word_to_index, vocabulary = one_hot_encoding(sample_text)

In [3]:
print("Vocabulary:", vocabulary)
print("Word-to-Index Mapping:", word_to_index)
print("One-Hot Encoded Matrix:")
for word, encoding in zip(sample_text.split(), one_hot_encoded):
  print(f"{word}: {encoding}")

Vocabulary: {'sample', 'text', 'a', 'This', 'encoding.', 'one-hot', 'for', 'is'}
Word-to-Index Mapping: {'sample': 0, 'text': 1, 'a': 2, 'This': 3, 'encoding.': 4, 'one-hot': 5, 'for': 6, 'is': 7}
One-Hot Encoded Matrix:
This: [0, 0, 0, 1, 0, 0, 0, 0]
is: [0, 0, 0, 0, 0, 0, 0, 1]
a: [0, 0, 1, 0, 0, 0, 0, 0]
sample: [1, 0, 0, 0, 0, 0, 0, 0]
text: [0, 1, 0, 0, 0, 0, 0, 0]
for: [0, 0, 0, 0, 0, 0, 1, 0]
one-hot: [0, 0, 0, 0, 0, 1, 0, 0]
encoding.: [0, 0, 0, 0, 1, 0, 0, 0]


##Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
documents = ["This is the first document.",
			"This document is the second document.",
			"And this is the third one.",
			"Is this the first document?"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

print("Bag-of-Words Matrix:")
print(X.toarray())
print("Vocabulary (Feature Names):", feature_names)


Bag-of-Words Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Vocabulary (Feature Names): ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


##TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample
documents = [
	"The quick brown fox jumps over the lazy dog.",
	"A journey of a thousand miles begins with a single step.",
]

In [7]:
vectorizer = TfidfVectorizer() # Create the TF-IDF vectorizer
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
tfidf_values = {}

In [8]:
for doc_index, doc in enumerate(documents):
	feature_index = tfidf_matrix[doc_index, :].nonzero()[1]
	tfidf_doc_values = zip(feature_index, [tfidf_matrix[doc_index, x] for x in feature_index])
	tfidf_values[doc_index] = {feature_names[i]: value for i, value in tfidf_doc_values}

In [9]:
for doc_index, values in tfidf_values.items():
	print(f"Document {doc_index + 1}:")
	for word, tfidf_value in values.items():
		print(f"{word}: {tfidf_value}")
	print("\n")

Document 1:
the: 0.6030226891555273
quick: 0.30151134457776363
brown: 0.30151134457776363
fox: 0.30151134457776363
jumps: 0.30151134457776363
over: 0.30151134457776363
lazy: 0.30151134457776363
dog: 0.30151134457776363


Document 2:
journey: 0.3535533905932738
of: 0.3535533905932738
thousand: 0.3535533905932738
miles: 0.3535533905932738
begins: 0.3535533905932738
with: 0.3535533905932738
single: 0.3535533905932738
step: 0.3535533905932738


