In [2]:
import numpy as np

In [3]:
# Toy corpus: three short documents used to build the first vocabulary.
docs = [
    "I love NLP",
    "I love AI",
    "AI loves me",
]

### Step 1: Build Vocabulary

In [4]:
# Vocabulary: every distinct lower-cased, whitespace-separated token, in sorted order.
vocab = sorted({token.lower() for text in docs for token in text.split()})

In [5]:
# Inspect the sorted vocabulary built above.
print(vocab)

['ai', 'i', 'love', 'loves', 'me', 'nlp']


In [7]:
# Map each vocabulary word to its position in the sorted list; that position
# is the word's slot in a bag-of-words vector.
word_to_idx = dict(zip(vocab, range(len(vocab))))
word_to_idx

{'ai': 0, 'i': 1, 'love': 2, 'loves': 3, 'me': 4, 'nlp': 5}

### Step 2: Represent each doc as BoW Vector

In [8]:
def bow_vector(doc, word2id=None):
    """Return the bag-of-words count vector for ``doc``.

    Parameters
    ----------
    doc : str
        Text to vectorize; it is lower-cased and split on whitespace.
    word2id : dict, optional
        Mapping token -> vector position. When omitted, the notebook-level
        ``word_to_idx`` mapping is used.

    Returns
    -------
    numpy.ndarray
        1-D float array with one slot per entry in the mapping, holding how
        many times each known token occurs in ``doc``.

    Tokens that are not in the mapping are skipped instead of raising
    ``KeyError``, so text containing unseen words can be vectorized too.
    """
    index = word_to_idx if word2id is None else word2id
    vec = np.zeros(len(index))
    for word in doc.lower().split():
        # Out-of-vocabulary words have no slot; ignore them.
        if word in index:
            vec[index[word]] += 1
    return vec

In [9]:
# One count vector per document, collected into a single NumPy array.
x_vow = np.array(list(map(bow_vector, docs)))

In [10]:
# Print the vocabulary (the column order) followed by the count matrix,
# one row per document.
print("Vocabulary:", vocab)
print("Bow Matrix: \n", x_vow)

Vocabulary: ['ai', 'i', 'love', 'loves', 'me', 'nlp']
Bow Matrix: 
 [[0. 1. 1. 0. 0. 1.]
 [1. 1. 1. 0. 0. 0.]
 [1. 0. 0. 1. 1. 0.]]


In [27]:
# Second toy corpus: four short documents for the function-based version.
docs = ["I love NLP", "NLP is fun", "AI loves me", "I think NLP is great"]

In [28]:
def tokenize(text:str):
    """Lower-case ``text`` and split it on whitespace, returning the list of tokens."""
    lowered = text.lower()
    return lowered.split()

In [29]:
def build_vocab(docs):
    """Collect the distinct tokens of ``docs`` and give each one an index.

    Returns a ``(vocab, word2id)`` pair: ``vocab`` is the sorted list of
    unique tokens produced by ``tokenize`` and ``word2id`` maps each token
    to its position in that list.
    """
    unique_tokens = {tok for text in docs for tok in tokenize(text)}
    vocab = sorted(unique_tokens)
    word2id = dict(zip(vocab, range(len(vocab))))
    return vocab, word2id

# Build the vocabulary for the current corpus and show both the sorted
# token list and the token -> index mapping.
vocab, word2id = build_vocab(docs)
print(vocab)
print(word2id)

['ai', 'fun', 'great', 'i', 'is', 'love', 'loves', 'me', 'nlp', 'think']
{'ai': 0, 'fun': 1, 'great': 2, 'i': 3, 'is': 4, 'love': 5, 'loves': 6, 'me': 7, 'nlp': 8, 'think': 9}


### Vectorize a single document (raw counts)

In [30]:
def bow_vector(doc, word2id):
    """Count how often each vocabulary token occurs in ``doc``.

    Returns a list of ints with one slot per entry of ``word2id``; the slot
    for a token is the index ``word2id`` assigns to it. Tokens that are not
    in ``word2id`` are skipped.
    """
    counts = [0] * len(word2id)
    for token in tokenize(doc):
        if token not in word2id:
            continue  # out-of-vocabulary token: nothing to count
        counts[word2id[token]] += 1
    return counts

# Sanity check on one document: exactly the slots for its tokens should be 1.
print(bow_vector("I love NLP", word2id))

[0, 0, 0, 1, 0, 1, 0, 0, 1, 0]


### Build the full document-term matrix

In [31]:
def doc_term_matrix(docs, word2id):
    """Return one bag-of-words count vector per document, as a list of lists."""
    rows = [bow_vector(text, word2id) for text in docs]
    return rows
    


# Build the matrix for the corpus and print it one document (row) per line.
X = doc_term_matrix(docs, word2id)
for row in X:
    print(row)

[0, 0, 0, 1, 0, 1, 0, 0, 1, 0]
[0, 1, 0, 0, 1, 0, 0, 0, 1, 0]
[1, 0, 0, 0, 0, 0, 1, 1, 0, 0]
[0, 0, 1, 1, 1, 0, 0, 0, 1, 1]


In [32]:
def doc_term_matrix(docs, word2id):
    """Explicit-loop version: collect the count vector of every document.

    Returns a list with one ``bow_vector`` result per entry of ``docs``,
    in the same order.
    """
    rows = []
    for text in docs:
        row = bow_vector(text, word2id)
        rows.append(row)
    return rows

# Rebuild the matrix with the loop-based definition and print it row by row.
X= doc_term_matrix(docs, word2id)
for row in X:
    print(row)

[0, 0, 0, 1, 0, 1, 0, 0, 1, 0]
[0, 1, 0, 0, 1, 0, 0, 0, 1, 0]
[1, 0, 0, 0, 0, 0, 1, 1, 0, 0]
[0, 0, 1, 1, 1, 0, 0, 0, 1, 1]


#### If we need a NumPy 2D array for ML 

In [33]:
import numpy as np
import pandas as pd

def doc_term_matrix(docs, word2id):
    """NumPy version: return the per-document count vectors as one array."""
    rows = [bow_vector(text, word2id) for text in docs]
    return np.array(rows)

Creating the matrix


In [34]:
# Document-term matrix for the current corpus, as a NumPy array.
matrix = doc_term_matrix(docs, word2id)

In [35]:
# Wrap the matrix in a labelled table: rows are "Doc 1", "Doc 2", ...
# Column headers are ordered by each token's stored index (not by dict
# insertion order) so every header sits above its own column of counts.
df = pd.DataFrame(matrix,
                  columns=sorted(word2id, key=word2id.get),
                  index=[f"Doc {i+1}" for i in range(len(docs))])

In [36]:
# Plain-text rendering of the labelled document-term table.
print(df)

       ai  fun  great  i  is  love  loves  me  nlp  think
Dox 1   0    0      0  1   0     1      0   0    1      0
Dox 2   0    1      0  0   1     0      0   0    1      0
Dox 3   1    0      0  0   0     0      1   1    0      0
Dox 4   0    0      1  1   1     0      0   0    1      1


In [37]:
import pandas as pd
import numpy as np

# Hand-written vocabulary for this self-contained demo (token -> column index).
# Keys are case-sensitive. Note that "think" has no entry, so the "think"
# token in the third document below is not counted.
word2id = {
    "I": 0,
    "love": 1,
    "NLP": 2,
    "is": 3,
    "fun": 4,
    "great": 5,
}

# Example documents
docs = ["I love NLP", "NLP is fun", "I think NLP is great"]

# Same bag-of-words counter as before, with a plain whitespace split as tokenizer
def bow_vector(doc, word2id):
    """Return per-token counts for ``doc`` as a list with ``len(word2id)`` slots.

    ``doc`` is split on whitespace without changing case; tokens that are
    not keys of ``word2id`` are ignored.
    """
    counts = [0] * len(word2id)
    for token in doc.split():
        if token not in word2id:
            continue  # unknown token: skip it
        counts[word2id[token]] += 1
    return counts

# Stack the per-document count vectors into a NumPy array
def doc_term_matrix(docs, word2id):
    """Return ``np.array`` of the ``bow_vector`` result for each document."""
    rows = [bow_vector(text, word2id) for text in docs]
    return np.array(rows)

# Count matrix for the example documents
matrix = doc_term_matrix(docs, word2id)

# Labelled table: one row per document, one column per vocabulary token
df = pd.DataFrame(
    matrix,
    index=[f"Doc {i+1}" for i in range(len(docs))],
    columns=word2id.keys(),
)
print(df)


       I  love  NLP  is  fun  great
Doc 1  1     1    1   0    0      0
Doc 2  0     0    1   1    1      0
Doc 3  1     0    1   1    0      1


## Final version of Exercise 1: Bag-of-Words + Document-Term Matrix + Pretty Table

In [38]:
# Corpus for the final walkthrough: three short documents.
docs = ["I love NLP", "NLP is fun", "I think NLP is great"]


Step 1: Tiny tokenizer & vocab

In [None]:
from collections import OrderedDict
# OrderedDict = dictionary + remembers order + special order tools.

def tokenize(text):
    """Split ``text`` on whitespace; case and punctuation are left untouched."""
    # Deliberately minimal for now; to be upgraded later.
    tokens = text.split()
    return tokens

def build_vocab(docs):
    """Assign each distinct token a column index in order of first appearance.

    Returns an ``OrderedDict`` mapping token -> column index.
    """
    token_to_col = OrderedDict()
    for text in docs:
        for token in tokenize(text):
            # setdefault leaves the index of an already-seen token untouched.
            token_to_col.setdefault(token, len(token_to_col))
    return token_to_col

# Build the token -> column index mapping for the corpus and display it.
word2id = build_vocab(docs)
word2id


OrderedDict([('I', 0),
             ('love', 1),
             ('NLP', 2),
             ('is', 3),
             ('fun', 4),
             ('think', 5),
             ('great', 6)])

Step 2: Single-doc BoW vector

In [40]:
def bow_vector(doc, word2id):
    """Return the raw token counts of ``doc`` as a list of ``len(word2id)`` ints.

    Each token produced by ``tokenize`` increments the slot that ``word2id``
    assigns to it; tokens without an entry in ``word2id`` are ignored.
    """
    counts = [0] * len(word2id)
    known_tokens = (tok for tok in tokenize(doc) if tok in word2id)
    for tok in known_tokens:
        counts[word2id[tok]] += 1
    return counts

# Quick unit checks: one count per known token, landing in that token's own
# column (counting the 1s alone would not catch counts in the wrong columns).
_vec = bow_vector("I love NLP", word2id)
assert len(_vec) == len(word2id)
assert sum(_vec) == 3
assert all(_vec[word2id[tok]] == 1 for tok in ("I", "love", "NLP"))
# Repeated tokens accumulate; tokens missing from the vocabulary are ignored.
assert bow_vector("NLP NLP", word2id)[word2id["NLP"]] == 2
assert sum(bow_vector("unseen tokens", word2id)) == 0


Step 3: Full document-term matrix

In [41]:
def doc_term_matrix(docs, word2id):
    """Return the document-term matrix as a list of per-document count lists."""
    rows = [bow_vector(text, word2id) for text in docs]
    return rows

# Document-term matrix for the corpus (one inner list per document); display it.
dtm = doc_term_matrix(docs, word2id)
dtm


[[1, 1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 1, 0, 0], [1, 0, 1, 1, 0, 1, 1]]

Step 4: Pretty table with pandas (optional but recommended)

In [42]:
import pandas as pd

# Labelled view of the matrix: rows "Doc 1", "Doc 2", ...; columns are the
# vocabulary tokens in the order they are stored in word2id.
df_bow = pd.DataFrame(
    dtm,
    index=[f"Doc {i+1}" for i in range(len(docs))],
    columns=list(word2id),
)
df_bow


Unnamed: 0,I,love,NLP,is,fun,think,great
Doc 1,1,1,1,0,0,0,0
Doc 2,0,0,1,1,1,0,0
Doc 3,1,0,1,1,0,1,1
