In [1]:
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
# Fetch the NLTK data packages this notebook relies on. Packages that are
# already present are reported as up to date in the log output.
nltk.download('punkt')       # Punkt tokenizer data (presumably for nltk.word_tokenize)
nltk.download('stopwords')   # corpus read by stopwords.words('english') in preprocess()
nltk.download('wordnet')     # presumably the lexical database behind WordNetLemmatizer
# Last expression of the cell: its return value (True) is what the cell displays.
nltk.download('punkt_tab')   # unpacked under tokenizers/punkt_tab per the log output

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
# Toy corpus: four short English sentences that are cleaned by preprocess()
# and then vectorized in the cells below.
documents = [
    "Machine learning is transforming the world.",
    "Natural Language Processing is a part of AI.",
    "TF-IDF and Bag-of-Words are used for text feature extraction.",
    "Text preprocessing is essential in NLP tasks."
]

In [8]:
def preprocess(text, stop_words=None, lemmatizer=None):
    """Normalise one raw document into a space-separated string of lemmas.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphabetic or that appear in ``stop_words`` are
    discarded, and the survivors are lemmatized.

    Note that the ``isalpha`` filter removes punctuation and numbers, and it
    also drops hyphenated tokens such as "tf-idf" in their entirety.

    Args:
        text: The raw document as a string.
        stop_words: Optional container of lower-case words to remove. When
            omitted, NLTK's English stop-word list is loaded. Pass a
            pre-built set when processing many documents so the list is
            not reloaded on every call.
        lemmatizer: Optional object exposing ``lemmatize(word)``. When
            omitted, a new ``WordNetLemmatizer`` is created.

    Returns:
        The retained lemmas joined by single spaces, or an empty string when
        no token survives the filters.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Single pass: filter first, so only the kept tokens are lemmatized.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    )

In [9]:
# Run every raw document through preprocess(), keeping the original order.
cleaned_docs = list(map(preprocess, documents))

In [10]:
# Bag-of-Words: learn the vocabulary from the cleaned documents and count how
# often each term occurs per document (one row per document, one column per term).
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(cleaned_docs)

In [11]:
# Show the learned vocabulary, then the count matrix converted to a dense array.
print("\n🔹 Bag-of-Words Vocabulary:", bow_vectorizer.get_feature_names_out(), sep="\n")
print("\n🔹 BoW Feature Matrix:", bow_matrix.toarray(), sep="\n")


🔹 Bag-of-Words Vocabulary:
['ai' 'essential' 'extraction' 'feature' 'language' 'learning' 'machine'
 'natural' 'nlp' 'part' 'preprocessing' 'processing' 'task' 'text'
 'transforming' 'used' 'world']

🔹 BoW Feature Matrix:
[[0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1]
 [1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0]
 [0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0]
 [0 1 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0]]


In [12]:
# TF-IDF: fit a vectorizer with default settings on the same cleaned documents
# and produce a weighted document-term matrix (one row per document).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_docs)

In [13]:
# Show the TF-IDF vocabulary, then the weight matrix converted to a dense array.
print("\n🔹 TF-IDF Vocabulary:", tfidf_vectorizer.get_feature_names_out(), sep="\n")
print("\n🔹 TF-IDF Feature Matrix:", tfidf_matrix.toarray(), sep="\n")


🔹 TF-IDF Vocabulary:
['ai' 'essential' 'extraction' 'feature' 'language' 'learning' 'machine'
 'natural' 'nlp' 'part' 'preprocessing' 'processing' 'task' 'text'
 'transforming' 'used' 'world']

🔹 TF-IDF Feature Matrix:
[[0.         0.         0.         0.         0.         0.5
  0.5        0.         0.         0.         0.         0.
  0.         0.         0.5        0.         0.5       ]
 [0.4472136  0.         0.         0.         0.4472136  0.
  0.         0.4472136  0.         0.4472136  0.         0.4472136
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.52547275 0.52547275 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.41428875 0.         0.52547275 0.        ]
 [0.         0.46516193 0.         0.         0.         0.
  0.         0.         0.46516193 0.         0.46516193 0.
  0.46516193 0.36673901 0.         0.         0.        ]]
