# Bag of Words (BoW)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short sentences with overlapping words
documents = [
    "I love NLP",
    "NLP is fun",
    "I enjoy learning NLP",
    "NLP is fun so I enjoy NLP",
]

# A default CountVectorizer lowercases the text and only keeps tokens of
# two or more word characters, which is why "NLP" shows up as 'nlp' and
# "I" is missing from the vocabulary printed below.
vectorizer = CountVectorizer()

# Learn the vocabulary and count every term per document in a single call
# (one row per document, one column per vocabulary term)
bow_matrix = vectorizer.fit_transform(documents)

# Densify the result so this small matrix prints in full
bow_array = bow_matrix.toarray()

# Show the learned terms (the column order) and then the count matrix
feature_names = vectorizer.get_feature_names_out()
print("Vocabulary:", feature_names)
print("Bag of Words Matrix:\n", bow_array)

Vocabulary: ['enjoy' 'fun' 'is' 'learning' 'love' 'nlp' 'so']
Bag of Words Matrix:
 [[0 0 0 0 1 1 0]
 [0 1 1 0 0 1 0]
 [1 0 0 1 0 1 0]
 [1 1 1 0 0 2 1]]


# Binary Bag of Words (Binary BoW)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer


# Same toy corpus as the plain count example; note the last sentence
# mentions "NLP" twice
documents = [
    "I love NLP",
    "NLP is fun",
    "I enjoy learning NLP",
    "NLP is fun so I enjoy NLP",
]

# binary=True records only whether a term occurs in a document: every
# non-zero count becomes 1, so the repeated "NLP" in the last sentence
# is reported as 1 rather than 2.
vectorizer = CountVectorizer(binary=True)

# Learn the vocabulary and build the presence/absence matrix in one call
binary_bow_matrix = vectorizer.fit_transform(documents)

# Densify the result so this small matrix prints in full
binary_bow_array = binary_bow_matrix.toarray()

# Show the learned terms (the column order) and then the 0/1 matrix
feature_names = vectorizer.get_feature_names_out()
print("Vocabulary:", feature_names)
print("Binary Bag of Words Matrix:\n", binary_bow_array)

Vocabulary: ['enjoy' 'fun' 'is' 'learning' 'love' 'nlp' 'so']
Binary Bag of Words Matrix:
 [[0 0 0 0 1 1 0]
 [0 1 1 0 0 1 0]
 [1 0 0 1 0 1 0]
 [1 1 1 0 0 1 1]]


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three sentences about NLP and two about spam, so some
# terms are shared between documents and others are unique to one
documents = [
    "I love NLP",
    "NLP is fun",
    "I enjoy learning NLP",
    "I hate spam",
    "Spam emails are annoying",
]

# A default TfidfVectorizer tokenizes like CountVectorizer (lowercased,
# single-character tokens such as "I" dropped) and then re-weights the
# counts so that terms found in many documents score lower.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights, then weight every document,
# all in one call (one row per document, one column per term)
tfidf_matrix = vectorizer.fit_transform(documents)

# Densify the result so this small matrix prints in full
tfidf_array = tfidf_matrix.toarray()

# Show the learned terms (the column order) and then the weight matrix
feature_names = vectorizer.get_feature_names_out()
print("Vocabulary:", feature_names)
print("TF-IDF Matrix:\n", tfidf_array)

Vocabulary: ['annoying' 'are' 'emails' 'enjoy' 'fun' 'hate' 'is' 'learning' 'love'
 'nlp' 'spam']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.         0.
  0.         0.         0.83088075 0.55645052 0.        ]
 [0.         0.         0.         0.         0.63907044 0.
  0.63907044 0.         0.         0.42799292 0.        ]
 [0.         0.         0.         0.63907044 0.         0.
  0.         0.63907044 0.         0.42799292 0.        ]
 [0.         0.         0.         0.         0.         0.77828292
  0.         0.         0.         0.         0.62791376]
 [0.52335825 0.52335825 0.52335825 0.         0.         0.
  0.         0.         0.         0.         0.42224214]]
