In [1]:
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# --- Shared preprocessing resources (carried over from Day 8) ---
# English stop-word list, materialised as a set for membership tests.
stop_words = {*stopwords.words('english')}
# Single lemmatizer instance, reused by preprocess_text for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    The steps run in this order: lowercase the text, delete every character
    that is neither a word character nor whitespace, tokenize with NLTK,
    skip tokens present in the module-level ``stop_words`` set, and pass
    each remaining token through the module-level ``lemmatizer``.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving lemmas joined by single spaces. A string (rather than
        a token list) is returned so it can be fed to CountVectorizer.
    """
    # Punctuation is stripped before tokenizing, so the tokenizer only ever
    # sees word characters and whitespace.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    lemmas = []
    for token in nltk.word_tokenize(cleaned):
        if token in stop_words:
            continue  # stop words carry no signal for the vectorizer
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)

# Sample documents (one sentence each)
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Dogs are loyal animals and are good pets.",
    "Foxes are clever animals, but sometimes lazy.",
    "A quick brown dog is a good pet."
]

print("Original Documents:")
for doc_number, doc in enumerate(documents, start=1):
    print(f"Doc {doc_number}: {doc}")

# Run every document through the shared cleaning pipeline.
preprocessed_documents = [preprocess_text(doc) for doc in documents]
print("\nPreprocessed Documents:")
for doc_number, doc in enumerate(preprocessed_documents, start=1):
    print(f"Doc {doc_number}: {doc}")

# 1. Initialize CountVectorizer.
# Two optional parameters are worth knowing; both are left at their defaults here:
#   min_df (minimum document frequency) ignores words that appear in too few documents
#          (e.g., specific to only one doc)
#   max_df (maximum document frequency) ignores words that appear in too many documents
#          (e.g., common across all docs)
vectorizer = CountVectorizer()

# 2. Fit the vectorizer to the preprocessed documents and transform them.
# fit():       learns the vocabulary from the documents
# transform(): converts the documents into numerical vectors based on that vocabulary
# fit_transform() performs both steps in a single call.
X_bow = vectorizer.fit_transform(preprocessed_documents)

# 3. Print the vocabulary (the words it learned).
# Looked up once and reused below as the DataFrame column labels.
bow_feature_names = vectorizer.get_feature_names_out()
print(f"\nVocabulary (Feature Names):\n{bow_feature_names}")

# 4. Print the Bag-of-Words matrix. X_bow is sparse, so it is converted to a dense
# array once for display. Each row represents a document, each column a word from
# the vocabulary, and each value the count of that word in the document.
bow_dense = X_bow.toarray()
print(f"\nBag-of-Words Matrix (Dense):\n{bow_dense}")

# Label the columns with their words for a clearer view.
# (pandas is imported with the other imports at the top of the notebook.)
df_bow = pd.DataFrame(bow_dense, columns=bow_feature_names)
print(f"\nBag-of-Words DataFrame:\n{df_bow}")

Original Documents:
Doc 1: The quick brown fox jumps over the lazy dog.
Doc 2: Dogs are loyal animals and are good pets.
Doc 3: Foxes are clever animals, but sometimes lazy.
Doc 4: A quick brown dog is a good pet.

Preprocessed Documents:
Doc 1: quick brown fox jump lazy dog
Doc 2: dog loyal animal good pet
Doc 3: fox clever animal sometimes lazy
Doc 4: quick brown dog good pet

Vocabulary (Feature Names):
['animal' 'brown' 'clever' 'dog' 'fox' 'good' 'jump' 'lazy' 'loyal' 'pet'
 'quick' 'sometimes']

Bag-of-Words Matrix (Dense):
[[0 1 0 1 1 0 1 1 0 0 1 0]
 [1 0 0 1 0 1 0 0 1 1 0 0]
 [1 0 1 0 1 0 0 1 0 0 0 1]
 [0 1 0 1 0 1 0 0 0 1 1 0]]

Bag-of-Words DataFrame:
   animal  brown  clever  dog  fox  good  jump  lazy  loyal  pet  quick  \
0       0      1       0    1    1     0     1     1      0    0      1   
1       1      0       0    1    0     1     0     0      1    1      0   
2       1      0       1    0    1     0     0     1      0    0      0   
3       0      1       0    1    0     1     0     0      0    1      1   

   sometimes  
0          0  
1          0  
2          1  
3          0  

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reuses preprocessed_documents built for Assignment 1.

# 1. Create the TfidfVectorizer.
# It learns a vocabulary and transforms text just like CountVectorizer,
# and additionally applies TF-IDF weighting to the result.
tfidf_vectorizer = TfidfVectorizer()

# 2. Learn the vocabulary and vectorize the preprocessed documents in one call.
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_documents)

# 3. Show the learned vocabulary (expected to match the BoW one when the settings match).
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"TF-IDF Vocabulary (Feature Names):\n{tfidf_feature_names}")

# 4. Show the TF-IDF matrix; it is sparse, so densify it for display.
# Larger values mark words that matter more to that particular document within the collection.
tfidf_dense = X_tfidf.toarray()
print(f"\nTF-IDF Matrix (Dense):\n{tfidf_dense}")

# Label each column with its word to make the matrix readable.
df_tfidf = pd.DataFrame(data=tfidf_dense, columns=tfidf_feature_names)
print(f"\nTF-IDF DataFrame:\n{df_tfidf}")

TF-IDF Vocabulary (Feature Names):
['animal' 'brown' 'clever' 'dog' 'fox' 'good' 'jump' 'lazy' 'loyal' 'pet'
 'quick' 'sometimes']

TF-IDF Matrix (Dense):
[[0.         0.39954636 0.         0.32346721 0.39954636 0.
  0.5067739  0.39954636 0.         0.         0.39954636 0.        ]
 [0.43584673 0.         0.         0.35285549 0.         0.43584673
  0.         0.         0.55281632 0.43584673 0.         0.        ]
 [0.40104275 0.         0.50867187 0.         0.40104275 0.
  0.         0.40104275 0.         0.         0.         0.50867187]
 [0.         0.46346838 0.         0.3752176  0.         0.46346838
  0.         0.         0.         0.46346838 0.46346838 0.        ]]

TF-IDF DataFrame:
     animal     brown    clever       dog       fox      good      jump  \
0  0.000000  0.399546  0.000000  0.323467  0.399546  0.000000  0.506774   
1  0.435847  0.000000  0.000000  0.352855  0.000000  0.435847  0.000000   
2  0.401043  0.000000  0.508672  0.000000  0.401043  0.000000  0.000