# Extracting text features with the Bag of Words (BoW) model from scratch, using a carefully designed tiny dataset. The BoW features are then compared with One-Hot Encoding text features.

In [3]:
# Tiny hand-built corpus; words are repeated on purpose so the counts are easy to check by eye.
documents = [
    # Doc 0: 6 words, "cats" appears twice
    "I love cats cats are cute",
    # Doc 1: 5 words, "good" appears twice
    "dogs are good good pets",
    # Doc 2: 5 words, mix of animals
    "I love dogs and cats",
    # Doc 3: 6 words, "good" appears twice
    "pets are cute and good good",
]
# Bare expression: the notebook echoes the list as the cell's output.
documents

['I love cats cats are cute',
 'dogs are good good pets',
 'I love dogs and cats',
 'pets are cute and good good']

In [4]:
# Show every document next to its whitespace-delimited word count.
for position, text in enumerate(documents):
    n_words = len(text.split())
    print(f"Document {position}: '{text}' ({n_words} words)")

print(f"\nTotal documents: {len(documents)}")

Document 0: 'I love cats cats are cute' (6 words)
Document 1: 'dogs are good good pets' (5 words)
Document 2: 'I love dogs and cats' (5 words)
Document 3: 'pets are cute and good good' (6 words)

Total documents: 4


In [5]:
# Step 1: Tokenization -- lowercase each document, then split it on whitespace.
banner = "=" * 70
print("\n" + banner)
print("STEP 1: TOKENIZATION")
print(banner)

# Tokenize the whole corpus first, then report each document's tokens.
tokenized_docs = [text.lower().split() for text in documents]
for position, doc_tokens in enumerate(tokenized_docs):
    print(f"Document {position}: {doc_tokens}")

print(f"\nTokenized documents: {tokenized_docs}")


STEP 1: TOKENIZATION
Document 0: ['i', 'love', 'cats', 'cats', 'are', 'cute']
Document 1: ['dogs', 'are', 'good', 'good', 'pets']
Document 2: ['i', 'love', 'dogs', 'and', 'cats']
Document 3: ['pets', 'are', 'cute', 'and', 'good', 'good']

Tokenized documents: [['i', 'love', 'cats', 'cats', 'are', 'cute'], ['dogs', 'are', 'good', 'good', 'pets'], ['i', 'love', 'dogs', 'and', 'cats'], ['pets', 'are', 'cute', 'and', 'good', 'good']]


In [6]:
# Step 2: Build vocabulary -- start by pooling every token of every document.
rule = "=" * 70
print("\n" + rule)
print("STEP 2: BUILD VOCABULARY")
print(rule)

print("Collecting all words from all documents...")
all_words = []
for doc_number, doc_tokens in enumerate(tokenized_docs):
    print(f"Document {doc_number} words: {doc_tokens}")
    all_words += doc_tokens  # in-place extension of the master list

print(f"\nAll words combined: {all_words}")
print(f"Total words (including duplicates): {len(all_words)}")



STEP 2: BUILD VOCABULARY
Collecting all words from all documents...
Document 0 words: ['i', 'love', 'cats', 'cats', 'are', 'cute']
Document 1 words: ['dogs', 'are', 'good', 'good', 'pets']
Document 2 words: ['i', 'love', 'dogs', 'and', 'cats']
Document 3 words: ['pets', 'are', 'cute', 'and', 'good', 'good']

All words combined: ['i', 'love', 'cats', 'cats', 'are', 'cute', 'dogs', 'are', 'good', 'good', 'pets', 'i', 'love', 'dogs', 'and', 'cats', 'pets', 'are', 'cute', 'and', 'good', 'good']
Total words (including duplicates): 22


In [7]:
# Deduplicate the pooled tokens and sort them; the sorted order fixes each word's position.
vocabulary = sorted(set(all_words))
print(f"\nUnique words (vocabulary): {vocabulary}")
print(f"Vocabulary size: {len(vocabulary)}")

# Lookup table from a word to its position in the sorted vocabulary.
word_to_index = {term: position for position, term in enumerate(vocabulary)}

print("\nWord to Index mapping:")
for term, position in word_to_index.items():
    print(f"  '{term}' -> position {position}")


Unique words (vocabulary): ['and', 'are', 'cats', 'cute', 'dogs', 'good', 'i', 'love', 'pets']
Vocabulary size: 9

Word to Index mapping:
  'and' -> position 0
  'are' -> position 1
  'cats' -> position 2
  'cute' -> position 3
  'dogs' -> position 4
  'good' -> position 5
  'i' -> position 6
  'love' -> position 7
  'pets' -> position 8


In [8]:
# Step 3: Create frequency vectors for each document.
# This cell tallies and prints the per-document word counts; bow_vectors is
# initialised here but nothing is appended to it in this cell.
divider = "=" * 70
print("\n" + divider)
print("STEP 3: CREATE FREQUENCY VECTORS")
print(divider)

bow_vectors = []
for doc_id, tokens in enumerate(tokenized_docs):
    print(f"\nProcessing Document {doc_id}: {tokens}")

    # Tally occurrences; dict.get supplies 0 the first time a word is seen.
    word_freq = {}
    for word in tokens:
        word_freq[word] = word_freq.get(word, 0) + 1

    print(f"Word frequencies: {word_freq}")



STEP 3: CREATE FREQUENCY VECTORS

Processing Document 0: ['i', 'love', 'cats', 'cats', 'are', 'cute']
Word frequencies: {'i': 1, 'love': 1, 'cats': 2, 'are': 1, 'cute': 1}

Processing Document 1: ['dogs', 'are', 'good', 'good', 'pets']
Word frequencies: {'dogs': 1, 'are': 1, 'good': 2, 'pets': 1}

Processing Document 2: ['i', 'love', 'dogs', 'and', 'cats']
Word frequencies: {'i': 1, 'love': 1, 'dogs': 1, 'and': 1, 'cats': 1}

Processing Document 3: ['pets', 'are', 'cute', 'and', 'good', 'good']
Word frequencies: {'pets': 1, 'are': 1, 'cute': 1, 'and': 1, 'good': 2}


In [9]:
# Build one count vector per document, with entries ordered like `vocabulary`.
bow_vectors = []
for doc_id, tokens in enumerate(tokenized_docs):
    print(f"\nProcessing Document {doc_id}: {tokens}")

    # Per-document tally of how often each word occurs.
    word_freq = {}
    for word in tokens:
        word_freq[word] = word_freq.get(word, 0) + 1

    print(f"Word frequencies: {word_freq}")

    # Walk the vocabulary in order; words absent from this document count as 0.
    vector = [word_freq.get(vocab_word, 0) for vocab_word in vocabulary]

    bow_vectors.append(vector)
    print(f"BoW vector: {vector}")

print(f"\nAll BoW vectors: {bow_vectors}")


Processing Document 0: ['i', 'love', 'cats', 'cats', 'are', 'cute']
Word frequencies: {'i': 1, 'love': 1, 'cats': 2, 'are': 1, 'cute': 1}
BoW vector: [0, 1, 2, 1, 0, 0, 1, 1, 0]

Processing Document 1: ['dogs', 'are', 'good', 'good', 'pets']
Word frequencies: {'dogs': 1, 'are': 1, 'good': 2, 'pets': 1}
BoW vector: [0, 1, 0, 0, 1, 2, 0, 0, 1]

Processing Document 2: ['i', 'love', 'dogs', 'and', 'cats']
Word frequencies: {'i': 1, 'love': 1, 'dogs': 1, 'and': 1, 'cats': 1}
BoW vector: [1, 0, 1, 0, 1, 0, 1, 1, 0]

Processing Document 3: ['pets', 'are', 'cute', 'and', 'good', 'good']
Word frequencies: {'pets': 1, 'are': 1, 'cute': 1, 'and': 1, 'good': 2}
BoW vector: [1, 1, 0, 1, 0, 2, 0, 0, 1]

All BoW vectors: [[0, 1, 2, 1, 0, 0, 1, 1, 0], [0, 1, 0, 0, 1, 2, 0, 0, 1], [1, 0, 1, 0, 1, 0, 1, 1, 0], [1, 1, 0, 1, 0, 2, 0, 0, 1]]


In [10]:
# Step 4: Create the BoW matrix and display nicely
separator = "=" * 70
print("\n" + separator)
print("STEP 4: FINAL BAG OF WORDS MATRIX")
print(separator)

# Legend: which vocabulary word each vector position stands for.
print("\nVOCABULARY INDEX REFERENCE:")
for position, term in enumerate(vocabulary):
    print(f"Position {position}: '{term}'")

print("\nBAG OF WORDS MATRIX:")
print("-" * 50)


STEP 4: FINAL BAG OF WORDS MATRIX

VOCABULARY INDEX REFERENCE:
Position 0: 'and'
Position 1: 'are'
Position 2: 'cats'
Position 3: 'cute'
Position 4: 'dogs'
Position 5: 'good'
Position 6: 'i'
Position 7: 'love'
Position 8: 'pets'

BAG OF WORDS MATRIX:
--------------------------------------------------


In [11]:
# Header row: a fixed label followed by each vocabulary word right-aligned in 6 columns.
header = "Doc#  " + "".join(f"{term:>6}" for term in vocabulary)
print(header)
print("-" * len(header))

# Data rows: one per document, counts aligned under the matching vocabulary word.
for doc_id, vector in enumerate(bow_vectors):
    cells = "".join(f"{count:>6}" for count in vector)
    print(f"Doc{doc_id}  " + cells)

Doc#     and   are  cats  cute  dogs  good     i  love  pets
------------------------------------------------------------
Doc0       0     1     2     1     0     0     1     1     0
Doc1       0     1     0     0     1     2     0     0     1
Doc2       1     0     1     0     1     0     1     1     0
Doc3       1     1     0     1     0     2     0     0     1


In [12]:
# Step 5: Detailed analysis showing how repetitive words are handled
print("\n" + "="*70)
print("STEP 5: REPETITIVE WORDS ANALYSIS")
print("="*70)

print("HOW BOW HANDLES REPETITIVE WORDS:")
print("-" * 40)

# (word, explanation) pairs; each explanation describes where the word occurs
# in the four-document corpus defined at the top of the notebook.
repetitive_analysis = [
    ("cats", "Document 0 has 'cats' twice -> count = 2"),
    ("good", "Documents 1&3 have 'good' twice each -> count = 2"),
    # 'are' occurs in documents 0, 1 and 3 (document 2 is "I love dogs and cats").
    ("are", "Appears once in docs 0,1,3 -> count = 1 each"),
    ("love", "Appears once in docs 0,2 -> count = 1 each")
]


STEP 5: REPETITIVE WORDS ANALYSIS
HOW BOW HANDLES REPETITIVE WORDS:
----------------------------------------


In [13]:
# For each highlighted word, print its note and its count in every document.
for term, note in repetitive_analysis:
    column = word_to_index[term]
    print(f"\nWord: '{term}' (position {column})")
    print(f"  {note}")
    print("  Vector values across documents:", end=" ")
    # Read the word's column out of every document vector.
    column_counts = [vector[column] for vector in bow_vectors]
    for doc_id, count in enumerate(column_counts):
        print(f"Doc{doc_id}={count}", end=" ")
    print()


Word: 'cats' (position 2)
  Document 0 has 'cats' twice -> count = 2
  Vector values across documents: Doc0=2 Doc1=0 Doc2=1 Doc3=0 

Word: 'good' (position 5)
  Documents 1&3 have 'good' twice each -> count = 2
  Vector values across documents: Doc0=0 Doc1=2 Doc2=0 Doc3=2 

Word: 'are' (position 1)
  Appears once in docs 1,2,3 -> count = 1 each
  Vector values across documents: Doc0=1 Doc1=1 Doc2=0 Doc3=1 

Word: 'love' (position 7)
  Appears once in docs 0,2 -> count = 1 each
  Vector values across documents: Doc0=1 Doc1=0 Doc2=1 Doc3=0 


In [14]:
import numpy as np
import random
from collections import Counter

# Seed both the standard-library and the NumPy global generators so runs repeat exactly.
random.seed(42)
np.random.seed(42)

title_rule = "=" * 80
print(title_rule)
print("SIMPLE NEURAL NETWORK FOR NEXT WORD PREDICTION USING BOW")
print(title_rule)

# Same tiny dataset from previous example
documents = [
    # Doc 0: 6 words, "cats" appears twice
    "I love cats cats are cute",
    # Doc 1: 5 words, "good" appears twice
    "dogs are good good pets",
    # Doc 2: 5 words, mix of animals
    "I love dogs and cats",
    # Doc 3: 6 words, "good" appears twice
    "pets are cute and good good",
]

print("DATASET:")
print("-" * 20)
for doc_number, text in enumerate(documents):
    print(f"Document {doc_number}: '{text}'")

# Step 1: Prepare data for neural network
section_rule = "=" * 80
print("\n" + section_rule)
print("STEP 1: DATA PREPARATION FOR NEURAL NETWORK")
print(section_rule)

# Pool the lowercased tokens of every document, then derive the sorted vocabulary.
all_tokens = [token for doc in documents for token in doc.lower().split()]

vocabulary = sorted(set(all_tokens))
vocab_size = len(vocabulary)
# Forward and reverse lookups between words and their vocabulary positions.
word_to_idx = dict(zip(vocabulary, range(vocab_size)))
idx_to_word = dict(enumerate(vocabulary))

print(f"Vocabulary: {vocabulary}")
print(f"Vocabulary size: {vocab_size}")
print(f"Word to index mapping: {word_to_idx}")

# Create training sequences (context -> next word)
print("\nCREATING TRAINING SEQUENCES:")
print("-" * 30)

sequences = []
for doc_id, doc in enumerate(documents):
    tokens = doc.lower().split()
    print(f"\nDocument {doc_id}: {tokens}")

    # Pair every token with its successor: [current_word] -> [next_word]
    for current_word, next_word in zip(tokens, tokens[1:]):
        sequences.append((current_word, next_word))
        print(f"  '{current_word}' -> '{next_word}'")

print(f"\nTotal training sequences: {len(sequences)}")
print(f"All sequences: {sequences}")

# Step 2: Convert to BoW representation for neural network input
step_rule = "=" * 80
print("\n" + step_rule)
print("STEP 2: CONVERT TO BAG OF WORDS INPUT/OUTPUT")
print(step_rule)

def word_to_bow_vector(word, vocabulary, word_to_idx):
    """Encode one word as a vector with a single 1 at the word's index.

    The vector has ``len(vocabulary)`` entries. A word that is not a key of
    ``word_to_idx`` produces an all-zero vector.
    """
    vector = [0] * len(vocabulary)
    # Guard clause: unknown words keep the all-zero vector.
    if word not in word_to_idx:
        return vector
    vector[word_to_idx[word]] = 1
    return vector

def word_to_output_vector(word, vocabulary, word_to_idx):
    """Encode a target word as a one-hot list of length ``len(vocabulary)``.

    The entry at ``word_to_idx[word]`` is set to 1; if the word is not a key
    of ``word_to_idx`` the result stays all zeros.
    """
    size = len(vocabulary)
    target = [0] * size
    if word in word_to_idx:
        hot = word_to_idx[word]
        target[hot] = 1
    return target

# Prepare training data
X_train = []  # one input vector per sequence (the current word)
y_train = []  # one target vector per sequence (the next word, one-hot)

print("TRAINING DATA PREPARATION:")
print("-" * 30)
for number, (current_word, next_word) in enumerate(sequences, start=1):
    input_vector = word_to_bow_vector(current_word, vocabulary, word_to_idx)
    output_vector = word_to_output_vector(next_word, vocabulary, word_to_idx)

    X_train.append(input_vector)
    y_train.append(output_vector)

    print(f"Sequence {number}: '{current_word}' -> '{next_word}'")
    print(f"  Input (BoW):  {input_vector}")
    print(f"  Output (target): {output_vector}")

# Turn the lists of vectors into 2-D arrays for the network.
X_train, y_train = np.array(X_train), np.array(y_train)

print("\nTraining data shape:")
print(f"X_train: {X_train.shape} (sequences, vocabulary_size)")
print(f"y_train: {y_train.shape} (sequences, vocabulary_size)")

# Step 3: Simple Neural Network Implementation
step3_rule = "=" * 80
print("\n" + step3_rule)
print("STEP 3: SIMPLE NEURAL NETWORK IMPLEMENTATION")
print(step3_rule)

class SimpleNextWordPredictor:
    """Two-layer feed-forward network: sigmoid hidden layer, softmax output.

    Trained with full-batch gradient descent on a cross-entropy loss. The
    activations of the most recent forward pass are cached on the instance
    (``z1``, ``a1``, ``z2``, ``a2``) and reused by ``backward``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Record the layer sizes, initialise the parameters and print the architecture.

        Weights are standard-normal draws from NumPy's global generator scaled
        by 0.1 (W1 is drawn before W2); biases start at zero.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Input -> hidden parameters.
        self.W1 = np.random.randn(input_size, hidden_size) * 0.1
        self.b1 = np.zeros((1, hidden_size))
        # Hidden -> output parameters.
        self.W2 = np.random.randn(hidden_size, output_size) * 0.1
        self.b2 = np.zeros((1, output_size))

        print("Neural Network Architecture:")
        print(f"  Input layer: {input_size} neurons (vocabulary size)")
        print(f"  Hidden layer: {hidden_size} neurons")
        print(f"  Output layer: {output_size} neurons (vocabulary size)")

    def sigmoid(self, x):
        """Element-wise logistic function; inputs are clipped to [-500, 500] first."""
        clipped = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-clipped))

    def softmax(self, x):
        """Row-wise softmax; each row's maximum is subtracted before exponentiating."""
        shifted = x - np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        """Run the network on ``X``, cache every intermediate, and return the softmax output."""
        hidden_pre = np.dot(X, self.W1) + self.b1
        hidden_act = self.sigmoid(hidden_pre)
        output_pre = np.dot(hidden_act, self.W2) + self.b2
        output_act = self.softmax(output_pre)

        # Cache for backward().
        self.z1, self.a1 = hidden_pre, hidden_act
        self.z2, self.a2 = output_pre, output_act
        return output_act

    def backward(self, X, y, output):
        """Return ``(dW1, db1, dW2, db2)``, the gradients averaged over the rows of ``X``.

        Uses ``self.a1`` and ``self.W2`` as left by the latest ``forward`` call.
        """
        batch = X.shape[0]

        # Output layer: error term is (prediction - target).
        delta_out = output - y
        grad_W2 = np.dot(self.a1.T, delta_out) / batch
        grad_b2 = np.sum(delta_out, axis=0, keepdims=True) / batch

        # Hidden layer: push the error back through W2 and the sigmoid derivative a1 * (1 - a1).
        hidden_err = np.dot(delta_out, self.W2.T)
        delta_hidden = hidden_err * self.a1 * (1 - self.a1)
        grad_W1 = np.dot(X.T, delta_hidden) / batch
        grad_b1 = np.sum(delta_hidden, axis=0, keepdims=True) / batch

        return grad_W1, grad_b1, grad_W2, grad_b2

    def train(self, X, y, epochs=1000, learning_rate=0.1):
        """Run ``epochs`` full-batch gradient-descent steps and return the per-epoch losses.

        The loss is logged every 200 epochs, starting with epoch 0.
        """
        losses = []

        for epoch in range(epochs):
            output = self.forward(X)

            # Cross-entropy; the 1e-15 offset keeps log() away from zero.
            per_sample = np.sum(y * np.log(output + 1e-15), axis=1)
            loss = -np.mean(per_sample)
            losses.append(loss)

            grad_W1, grad_b1, grad_W2, grad_b2 = self.backward(X, y, output)

            # In-place parameter update.
            self.W1 -= learning_rate * grad_W1
            self.b1 -= learning_rate * grad_b1
            self.W2 -= learning_rate * grad_W2
            self.b2 -= learning_rate * grad_b2

            if not epoch % 200:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

        return losses

    def predict(self, word, vocabulary, word_to_idx, idx_to_word):
        """Return ``(predicted_word, confidence, probabilities)`` for the word following ``word``.

        ``probabilities`` is the network's output row for the encoded word and
        ``confidence`` is its largest entry.
        """
        encoded = np.array([word_to_bow_vector(word, vocabulary, word_to_idx)])

        probabilities = self.forward(encoded)[0]
        best = np.argmax(probabilities)

        return idx_to_word[best], probabilities[best], probabilities

# Step 4: Train the model
train_rule = "=" * 80
print("\n" + train_rule)
print("STEP 4: TRAINING THE NEURAL NETWORK")
print(train_rule)

# Input and output layers both span the vocabulary; the hidden layer has 5 units.
model = SimpleNextWordPredictor(
    input_size=vocab_size,
    hidden_size=5,
    output_size=vocab_size,
)

print("\nTraining the model...")
losses = model.train(X_train, y_train, epochs=1000, learning_rate=0.5)

final_loss = losses[-1]
print(f"\nFinal loss: {final_loss:.4f}")

# Step 5: Test the model
test_rule = "=" * 80
print("\n" + test_rule)
print("STEP 5: TESTING THE MODEL")
print(test_rule)

print("NEXT WORD PREDICTIONS:")
print("-" * 30)

test_words = ['I', 'love', 'cats', 'are', 'good', 'pets']
for word in test_words:
    lowered = word.lower()
    # Skip probes that are not in the vocabulary.
    if lowered not in word_to_idx:
        continue

    predicted_word, confidence, full_output = model.predict(
        lowered, vocabulary, word_to_idx, idx_to_word
    )
    print(f"\nInput word: '{word}'")
    print(f"Predicted next word: '{predicted_word}' (confidence: {confidence:.3f})")

    # Show top 3 predictions: indices sorted by descending probability.
    top_indices = np.argsort(full_output)[::-1][:3]
    print("Top 3 predictions:")
    for rank, idx in enumerate(top_indices, start=1):
        print(f"  {rank}. '{idx_to_word[idx]}' ({full_output[idx]:.3f})")

# Step 6: Demonstrate semantic understanding failures
step6_rule = "=" * 80
print("\n" + step6_rule)
print("STEP 6: SEMANTIC UNDERSTANDING FAILURES")
print(step6_rule)

print("WHY THE MODEL FAILS AT SEMANTIC UNDERSTANDING:")
print("-" * 50)

print("\n1. CONTEXT INSENSITIVITY:")
print("-" * 25)
for statement in (
    "The model treats each word independently.",
    "It cannot understand that 'cats' and 'dogs' are both animals.",
    "It cannot understand that 'good' can describe both pets and qualities.",
):
    print(statement)

# Test semantic relationships: print the model's top prediction after each animal word.
print("\nTesting semantic relationships:")
animal_words = ['cats', 'dogs', 'pets']
for word in animal_words:
    if word not in word_to_idx:
        continue
    pred_word = model.predict(word, vocabulary, word_to_idx, idx_to_word)[0]
    print(f"After '{word}' -> '{pred_word}' (should understand it's an animal)")

print("\n2. BAG OF WORDS LIMITATION:")
print("-" * 30)
print("BoW representation loses all word order and context:")

# Two sentences with the same words in a different order.
sentence1 = "cats are good"
sentence2 = "good are cats"
tokens1, tokens2 = sentence1.split(), sentence2.split()

bow1, bow2 = [0] * vocab_size, [0] * vocab_size

# Mark the presence of every in-vocabulary token in the matching vector.
for sentence_tokens, presence in ((tokens1, bow1), (tokens2, bow2)):
    for token in sentence_tokens:
        if token in word_to_idx:
            presence[word_to_idx[token]] = 1

print(f"'{sentence1}' BoW: {bow1}")
print(f"'{sentence2}' BoW: {bow2}")
print(f"Identical representations: {bow1 == bow2}")

print("\n3. NO UNDERSTANDING OF WORD RELATIONSHIPS:")
print("-" * 45)
print("The model cannot understand:")
for bullet in (
    "• 'cats' and 'pets' are related (cats are pets)",
    "• 'good' and 'cute' are both positive adjectives",
    "• 'I love cats' has different meaning than 'cats love I'",
):
    print(bullet)

# Test actual semantic understanding
print("\nSemantic understanding test:")
print("If the model understood semantics:")
for expectation in (
    "• After 'I' should prefer 'love' (subject-verb pattern)",
    "• After 'cats' should prefer 'are' (noun-verb pattern)",
    "• After 'good' should prefer nouns like 'pets'",
):
    print(expectation)

# Show what the model actually learned: for every word, how often each successor follows it.
print("\nWhat the model actually learned (frequency patterns):")
word_next_count = {}
for current, next_word in sequences:
    followers = word_next_count.setdefault(current, {})
    followers[next_word] = followers.get(next_word, 0) + 1

for word, next_words in word_next_count.items():
    print(f"  After '{word}': {next_words}")

closing_rule = "=" * 80

print("\n4. STATISTICAL PATTERN VS SEMANTIC UNDERSTANDING:")
print("-" * 50)
for line in (
    "The model learns statistical patterns from training data:",
    "• Which words appear after which words most frequently",
    "• But it doesn't understand WHY these patterns exist",
    "• It cannot generalize to new semantic relationships",
):
    print(line)

# Step 7: Compare with ideal semantic understanding
print("\n" + closing_rule)
print("STEP 7: WHAT SEMANTIC UNDERSTANDING WOULD LOOK LIKE")
print(closing_rule)

print("IDEAL SEMANTIC MODEL WOULD UNDERSTAND:")
print("-" * 45)
for line in (
    "✓ 'cats' and 'dogs' are both animals",
    "✓ 'good' and 'cute' are both positive descriptors",
    "✓ 'I love X' is a common pattern for expressing affection",
    "✓ Word order matters: 'I love cats' ≠ 'cats love I'",
    "✓ Context matters: 'good pets' vs 'pets good'",
):
    print(line)

print("\nOUR BOW MODEL LIMITATIONS:")
print("-" * 30)
for line in (
    "❌ Cannot understand word relationships",
    "❌ Cannot understand grammar or syntax",
    "❌ Cannot understand context or meaning",
    "❌ Only learns surface-level statistical patterns",
    "❌ Treats all words as independent tokens",
):
    print(line)

print("\nWHY BOW FAILS FOR SEMANTIC UNDERSTANDING:")
print("-" * 45)
for line in (
    "1. No positional information (word order lost)",
    "2. No contextual information (surrounding words ignored)",
    "3. No semantic relationships (words treated independently)",
    "4. No understanding of grammar or syntax",
    "5. Only frequency-based patterns, not meaning-based",
):
    print(line)

print("\n" + closing_rule)
print("CONCLUSION")
print(closing_rule)
for line in (
    "This demonstrates why modern NLP moved beyond BoW:",
    "• Word embeddings (Word2Vec, GloVe) capture semantic relationships",
    "• Contextual models (BERT, GPT) understand word order and context",
    "• Transformer architectures use attention to understand relationships",
    "• BoW is useful for simple tasks but fails at semantic understanding",
):
    print(line)
print(closing_rule)

SIMPLE NEURAL NETWORK FOR NEXT WORD PREDICTION USING BOW
DATASET:
--------------------
Document 0: 'I love cats cats are cute'
Document 1: 'dogs are good good pets'
Document 2: 'I love dogs and cats'
Document 3: 'pets are cute and good good'

STEP 1: DATA PREPARATION FOR NEURAL NETWORK
Vocabulary: ['and', 'are', 'cats', 'cute', 'dogs', 'good', 'i', 'love', 'pets']
Vocabulary size: 9
Word to index mapping: {'and': 0, 'are': 1, 'cats': 2, 'cute': 3, 'dogs': 4, 'good': 5, 'i': 6, 'love': 7, 'pets': 8}

CREATING TRAINING SEQUENCES:
------------------------------

Document 0: ['i', 'love', 'cats', 'cats', 'are', 'cute']
  'i' -> 'love'
  'love' -> 'cats'
  'cats' -> 'cats'
  'cats' -> 'are'
  'are' -> 'cute'

Document 1: ['dogs', 'are', 'good', 'good', 'pets']
  'dogs' -> 'are'
  'are' -> 'good'
  'good' -> 'good'
  'good' -> 'pets'

Document 2: ['i', 'love', 'dogs', 'and', 'cats']
  'i' -> 'love'
  'love' -> 'dogs'
  'dogs' -> 'and'
  'and' -> 'cats'

Document 3: ['pets', 'are', 'cute', 'a

In [1]:
import re
import numpy as np
from collections import Counter
import math
from sklearn.metrics.pairwise import cosine_similarity

# Input documents, keyed by a short document id.
documents = dict(
    D1='This movie star had dinner with my friend.',
    D2='I watch this movie on star movies.',
    D3='In this movie the opening was focused on the stars in the sky.',
)

heading_rule = "=" * 80
print(heading_rule)
print("BAG OF WORDS vs ONE-HOT ENCODING COMPARISON")
print(heading_rule)

print("\nINPUT DOCUMENTS:")
print("-" * 30)
for doc_id in documents:
    print(f"{doc_id}: '{documents[doc_id]}'")

part1_rule = "=" * 80
print("\n" + part1_rule)
print("PART 1: BAG OF WORDS (BOW) IMPLEMENTATION")
print(part1_rule)

# Step 1: Preprocessing and Tokenization
print("\nSTEP 1: PREPROCESSING AND TOKENIZATION")
print("-" * 50)
processed_docs = {}
all_tokens = []

# Matches every character that is neither a word character nor whitespace.
punctuation = re.compile(r'[^\w\s]')
for doc_id in documents:
    # Lowercase, strip punctuation, then split on whitespace.
    tokens = punctuation.sub('', documents[doc_id].lower()).split()
    processed_docs[doc_id] = tokens
    all_tokens += tokens
    print(f"{doc_id}: {tokens}")

print(f"\nAll tokens combined: {all_tokens}")
print(f"Total tokens: {len(all_tokens)}")

# Step 2: Create Global Vocabulary (sorted unique tokens across all documents)
print("\nSTEP 2: CREATE GLOBAL VOCABULARY")
print("-" * 40)
vocabulary = list(set(all_tokens))
vocabulary.sort()
vocab_size = len(vocabulary)
print(f"Vocabulary: {vocabulary}")
print(f"Vocabulary size: {vocab_size}")

# Create word to index mapping: each word maps to its position in the sorted vocabulary.
word_to_idx = dict(zip(vocabulary, range(vocab_size)))
print("\nWord to Index mapping:")
for term in word_to_idx:
    print(f"  '{term}' -> {word_to_idx[term]}")

# Step 3: Create BoW Vectors
print("\nSTEP 3: CREATE BOW VECTORS")
print("-" * 35)
bow_vectors = {}   # doc_id -> count vector ordered like `vocabulary`
bow_matrix = []    # the same vectors as rows, in document order

for doc_id, tokens in processed_docs.items():
    word_counts = Counter(tokens)
    # One count per vocabulary word; words missing from this document get 0.
    bow_vector = [word_counts.get(word, 0) for word in vocabulary]
    bow_vectors[doc_id] = bow_vector
    bow_matrix.append(bow_vector)

    print(f"\n{doc_id} word frequencies:")
    for word in vocabulary:
        count = word_counts.get(word, 0)
        if count > 0:
            print(f"  '{word}': {count}")
    print(f"{doc_id} BoW vector: {bow_vector}")

bow_matrix = np.array(bow_matrix)
print(f"\nBoW Matrix shape: {bow_matrix.shape}")
print("BoW Matrix:")
print("Docs\\Words", end="")
for word in vocabulary:
    print(f"{word:>8}", end="")
print()
# Row labels come from bow_vectors, which was filled in the same loop (and hence
# the same order) as the matrix rows, so the table tracks whatever documents
# were processed instead of a fixed list of ids.
for doc_id, row in zip(bow_vectors, bow_matrix):
    print(f"{doc_id:>9}", end="")
    for val in row:
        print(f"{val:>8}", end="")
    print()

print("\n" + "="*80)
print("PART 2: ONE-HOT ENCODING IMPLEMENTATION")
print("="*80)

print("\nSTEP 1: ONE-HOT ENCODING CONCEPT")
print("-" * 40)
print("One-Hot Encoding creates binary vectors where:")
print("- Each dimension represents a unique word")
print("- Value = 1 if word is present, 0 if absent")
print("- No frequency information, only presence/absence")

print("\nSTEP 2: CREATE ONE-HOT VECTORS")
print("-" * 38)
onehot_vectors = {}   # doc_id -> 0/1 presence vector ordered like `vocabulary`
onehot_matrix = []    # the same vectors as rows, in document order

for doc_id, tokens in processed_docs.items():
    # Presence only: repeated words collapse to a single membership test.
    unique_tokens = set(tokens)
    onehot_vector = [1 if word in unique_tokens else 0 for word in vocabulary]
    onehot_vectors[doc_id] = onehot_vector
    onehot_matrix.append(onehot_vector)

    print(f"\n{doc_id} unique words: {sorted(unique_tokens)}")
    print(f"{doc_id} One-Hot vector: {onehot_vector}")

onehot_matrix = np.array(onehot_matrix)
print(f"\nOne-Hot Matrix shape: {onehot_matrix.shape}")
print("One-Hot Matrix:")
print("Docs\\Words", end="")
for word in vocabulary:
    print(f"{word:>8}", end="")
print()
# Row labels come from onehot_vectors, which was filled in the same loop (and
# hence the same order) as the matrix rows, so the table tracks whatever
# documents were processed instead of a fixed list of ids.
for doc_id, row in zip(onehot_vectors, onehot_matrix):
    print(f"{doc_id:>9}", end="")
    for val in row:
        print(f"{val:>8}", end="")
    print()

print("\n" + "="*80)
print("PART 3: DETAILED COMPARISON")
print("="*80)

print("\nSIDE-BY-SIDE COMPARISON:")
print("-" * 30)
# Iterate the document dict itself so the comparison covers exactly the
# documents that were defined, in their original order.
for doc_id in documents:
    print(f"\n{doc_id}:")
    print(f"  Original: '{documents[doc_id]}'")
    print(f"  BoW:      {bow_vectors[doc_id]}")
    print(f"  One-Hot:  {onehot_vectors[doc_id]}")

print("\n" + "="*80)
print("PART 4: PERFORMANCE METRICS ANALYSIS")
print("="*80)

def calculate_metrics():
    """Print a report comparing the BoW and One-Hot document matrices.

    Reads the module-level ``vocab_size``, ``bow_vectors``, ``bow_matrix`` and
    ``onehot_matrix`` and prints six sections: dimensionality, per-document
    sparsity, information content, pairwise cosine similarity, memory use and
    complexity, and use-case recommendations. Returns nothing.
    """
    from itertools import combinations

    # Document ids in matrix-row order: bow_vectors is filled in the same loop
    # that appends the rows of bow_matrix.
    doc_ids = list(bow_vectors)

    # 1. Vector Dimensionality
    print("\n1. DIMENSIONALITY ANALYSIS:")
    print("-" * 30)
    print(f"Vector dimensions: {vocab_size}")
    print("Both BoW and One-Hot have same dimensionality")

    # 2. Sparsity Analysis: share of zero entries in each document vector.
    print("\n2. SPARSITY ANALYSIS:")
    print("-" * 25)
    bow_sparsity = []
    onehot_sparsity = []

    for i, doc_id in enumerate(doc_ids):
        bow_nonzero = np.count_nonzero(bow_matrix[i])
        onehot_nonzero = np.count_nonzero(onehot_matrix[i])
        bow_sparse = (vocab_size - bow_nonzero) / vocab_size * 100
        onehot_sparse = (vocab_size - onehot_nonzero) / vocab_size * 100

        bow_sparsity.append(bow_sparse)
        onehot_sparsity.append(onehot_sparse)

        print(f"{doc_id}:")
        print(f"  BoW non-zero elements: {bow_nonzero}/{vocab_size} ({100-bow_sparse:.1f}% dense)")
        print(f"  One-Hot non-zero: {onehot_nonzero}/{vocab_size} ({100-onehot_sparse:.1f}% dense)")
        print(f"  BoW sparsity: {bow_sparse:.1f}%")
        print(f"  One-Hot sparsity: {onehot_sparse:.1f}%")

    avg_bow_sparsity = np.mean(bow_sparsity)
    avg_onehot_sparsity = np.mean(onehot_sparsity)
    print(f"\nAverage Sparsity:")
    print(f"  BoW: {avg_bow_sparsity:.1f}%")
    print(f"  One-Hot: {avg_onehot_sparsity:.1f}%")

    # 3. Information Content
    print("\n3. INFORMATION CONTENT:")
    print("-" * 28)
    print("BoW advantages:")
    print("  ✓ Captures word frequency information")
    print("  ✓ Can distinguish between documents with same words but different frequencies")
    print("  ✓ Better for tasks where word importance varies with frequency")

    print("\nOne-Hot advantages:")
    print("  ✓ Binary representation (simpler)")
    print("  ✓ Less sensitive to word frequency variations")
    print("  ✓ Better for tasks where presence matters more than frequency")

    # 4. Document Similarity Analysis
    print("\n4. DOCUMENT SIMILARITY ANALYSIS:")
    print("-" * 35)

    # Full pairwise cosine-similarity matrices for both representations.
    bow_similarities = cosine_similarity(bow_matrix)
    onehot_similarities = cosine_similarity(onehot_matrix)

    print("Cosine Similarities:")
    # Every unordered pair of documents, in row order: (0, 1), (0, 2), (1, 2), ...
    for i, j in combinations(range(len(doc_ids)), 2):
        doc1, doc2 = doc_ids[i], doc_ids[j]
        bow_sim = bow_similarities[i][j]
        onehot_sim = onehot_similarities[i][j]
        print(f"  {doc1} vs {doc2}:")
        print(f"    BoW similarity: {bow_sim:.4f}")
        print(f"    One-Hot similarity: {onehot_sim:.4f}")
        print(f"    Difference: {abs(bow_sim - onehot_sim):.4f}")

    # 5. Memory and Computational Complexity
    print("\n5. COMPUTATIONAL COMPLEXITY:")
    print("-" * 32)
    print("Memory Usage:")
    bow_memory = bow_matrix.nbytes
    onehot_memory = onehot_matrix.nbytes
    print(f"  BoW matrix: {bow_memory} bytes")
    print(f"  One-Hot matrix: {onehot_memory} bytes")

    print("\nComputational Complexity:")
    print("  BoW: O(V) per document (V = vocabulary size)")
    print("  One-Hot: O(V) per document")
    print("  Both have same time complexity for creation")

    # 6. Use Case Analysis
    print("\n6. USE CASE RECOMMENDATIONS:")
    print("-" * 32)
    print("Use BoW when:")
    print("  • Word frequency matters (e.g., document classification)")
    print("  • Need to distinguish repetition importance")
    print("  • Working with longer documents")
    print("  • Term frequency is a meaningful signal")

    print("\nUse One-Hot when:")
    print("  • Only word presence/absence matters")
    print("  • Want to reduce noise from frequency variations")
    print("  • Memory efficiency is crucial (for sparse documents)")
    print("  • Working with short documents or keywords")

calculate_metrics()

part5_rule = "=" * 80
print("\n" + part5_rule)
print("PART 5: PRACTICAL EXAMPLE ANALYSIS")
print(part5_rule)

print("\nWHY DIFFERENT SIMILARITIES?")
print("-" * 30)
print("Let's analyze D1 vs D2 similarity:")
for doc_id in ('D1', 'D2'):
    print(f"{doc_id}: '{documents[doc_id]}'")

# Words that occur in both D1 and D2.
d1_terms = set(processed_docs['D1'])
d2_terms = set(processed_docs['D2'])
print(f"\nShared words: {d1_terms & d2_terms}")
for doc_id in ('D1', 'D2'):
    print(f"{doc_id} BoW vector: {bow_vectors[doc_id]}")
# Calculate manual cosine similarity
def manual_cosine(vec1, vec2):
    """Cosine similarity of two equal-length numeric sequences.

    Returns 0 when the product of the two norms is zero.
    """
    norm1 = math.sqrt(sum(a * a for a in vec1))
    norm2 = math.sqrt(sum(b * b for b in vec2))
    dot_product = sum(a * b for a, b in zip(vec1, vec2))

    denominator = norm1 * norm2
    # Guard against division by zero for all-zero vectors.
    if denominator == 0:
        return 0
    return dot_product / denominator

# Cosine similarity of D1 vs D2 under each representation, computed by hand.
bow_sim_manual = manual_cosine(bow_vectors['D1'], bow_vectors['D2'])
onehot_sim_manual = manual_cosine(onehot_vectors['D1'], onehot_vectors['D2'])

print("\nManual calculation:")
print(f"BoW cosine similarity: {bow_sim_manual:.4f}")
print(f"One-Hot cosine similarity: {onehot_sim_manual:.4f}")

print("\nKey Insight:")
for line in (
    "BoW considers word frequency, so 'movie' appearing once in each",
    "document contributes differently than if it appeared multiple times.",
    "One-Hot only cares about presence, making it more binary in nature.",
):
    print(line)

summary_rule = "=" * 80
print("\n" + summary_rule)
print("SUMMARY: BOW vs ONE-HOT ENCODING")
print(summary_rule)
print("\nChoose based on your specific use case:")
for line in (
    "• BoW for frequency-sensitive tasks",
    "• One-Hot for presence-based tasks",
    "• Both lose semantic meaning and word order",
    "• Consider advanced methods (TF-IDF, word embeddings) for better performance",
):
    print(line)

BAG OF WORDS vs ONE-HOT ENCODING COMPARISON

INPUT DOCUMENTS:
------------------------------
D1: 'This movie star had dinner with my friend.'
D2: 'I watch this movie on star movies.'
D3: 'In this movie the opening was focused on the stars in the sky.'

PART 1: BAG OF WORDS (BOW) IMPLEMENTATION

STEP 1: PREPROCESSING AND TOKENIZATION
--------------------------------------------------
D1: ['this', 'movie', 'star', 'had', 'dinner', 'with', 'my', 'friend']
D2: ['i', 'watch', 'this', 'movie', 'on', 'star', 'movies']
D3: ['in', 'this', 'movie', 'the', 'opening', 'was', 'focused', 'on', 'the', 'stars', 'in', 'the', 'sky']

All tokens combined: ['this', 'movie', 'star', 'had', 'dinner', 'with', 'my', 'friend', 'i', 'watch', 'this', 'movie', 'on', 'star', 'movies', 'in', 'this', 'movie', 'the', 'opening', 'was', 'focused', 'on', 'the', 'stars', 'in', 'the', 'sky']
Total tokens: 28

STEP 2: CREATE GLOBAL VOCABULARY
----------------------------------------
Vocabulary: ['dinner', 'focused', 'frien