In [1]:
from collections import defaultdict # specialised dict
import string # functions for working with string 

# Toy corpus: three short sentences that the rest of the notebook vectorises.
corpus = [
    "I hope Gen AI with LLM class will be easy and fun to learn.",
    "How to do well in Gen AI with LLM",
    "What time can I go home today",
]

In [2]:
def preprocess(text):
  """Normalise a sentence and split it into word tokens.

  The text is lowercased, every character listed in ``string.punctuation``
  is deleted, and what remains is split on whitespace (one word = one token).

  Args:
    text: the raw sentence as a string.

  Returns:
    A list of lowercase tokens with that punctuation removed.
  """
  # The third argument of str.maketrans lists characters that map to None,
  # i.e. characters translate() deletes. Punctuation is not needed to build
  # the vocabulary or to count word frequencies.
  strip_punctuation = str.maketrans("", "", string.punctuation)
  return text.lower().translate(strip_punctuation).split()

# Tokenise every sentence of the corpus; the result is one token list per sentence.
proc_corpus = list(map(preprocess, corpus))
print(proc_corpus)

[['i', 'hope', 'gen', 'ai', 'with', 'llm', 'class', 'will', 'be', 'easy', 'and', 'fun', 'to', 'learn'], ['how', 'to', 'do', 'well', 'in', 'gen', 'ai', 'with', 'llm'], ['what', 'time', 'can', 'i', 'go', 'home', 'today']]


In [3]:
# Build the vocabulary: the unique tokens across all processed sentences.
# The set comprehension removes duplicates and sorted() returns a list
# directly, so `vocab` is only ever bound to the final sorted list and each
# word gets a stable, alphabetical index.
vocab = sorted({word for sent in proc_corpus for word in sent})
print("Vocab:", vocab)
print("Vocab length is", len(vocab))

Vocab: ['ai', 'and', 'be', 'can', 'class', 'do', 'easy', 'fun', 'gen', 'go', 'home', 'hope', 'how', 'i', 'in', 'learn', 'llm', 'time', 'to', 'today', 'well', 'what', 'will', 'with']
Vocab length is 24


In [4]:
# calculate word frequencies and vectorise
def create_bow(sentence, vocab):
  """Return the bag-of-words count vector of a tokenised sentence.

  Args:
    sentence: iterable of tokens.
    vocab: sequence of vocabulary words; position ``i`` of the result
      counts the occurrences of ``vocab[i]`` in ``sentence``.

  Returns:
    A list of ``len(vocab)`` integers. Tokens that are not in ``vocab``
    are ignored.
  """
  # Map each word to its position once, so every token lookup is a single
  # dict access instead of a linear `in` scan followed by a linear `index`
  # scan. setdefault keeps the first position when a word is repeated in
  # vocab, which is the position list.index would return.
  position = {}
  for idx, word in enumerate(vocab):
    position.setdefault(word, idx)

  vector = [0] * len(vocab)  # one zero-initialised slot per vocab word
  for word in sentence:
    idx = position.get(word)
    if idx is not None:
      vector[idx] += 1  # increment the count at the word's index
  return vector

# Create BOW vector for each sentence in processed corpus
bow_vectors = [create_bow(sent, vocab) for sent in proc_corpus]
print("BOW Vectors:")
for vector in bow_vectors:
  print(vector)
# Every vector has one slot per vocab word, so report len(vocab) instead of
# reading the loop variable, which is never bound when bow_vectors is empty.
print(f"Vector length in {len(vocab)}") # same length since same vocab

BOW Vectors:
[1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1]
[1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1]
[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0]
Vector length in 24


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the BOW vectors (one vector per sentence).
cosine_sim_bow = cosine_similarity(bow_vectors)

# Show the heading and the similarity matrix on separate lines.
print("Cosine Similarity Matrix (BOW):", cosine_sim_bow, sep="\n")

Cosine Similarity Matrix (BOW):
[[1.         0.4454354  0.10101525]
 [0.4454354  1.         0.        ]
 [0.10101525 0.         1.        ]]


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit scikit-learn's TF-IDF vectoriser on the raw sentences; it does its own
# lowercasing and tokenisation. Its default token pattern only keeps tokens
# of two or more characters, which is why 'i' is absent from this vocabulary.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
# Densify the result for display: one row per sentence, one column per vocabulary word.
print("TF-IDF Representation:", X_tfidf.toarray(), sep="\n")

Vocabulary: ['ai' 'and' 'be' 'can' 'class' 'do' 'easy' 'fun' 'gen' 'go' 'home' 'hope'
 'how' 'in' 'learn' 'llm' 'time' 'to' 'today' 'well' 'what' 'will' 'with']
TF-IDF Representation:
[[0.23044123 0.30300252 0.30300252 0.         0.30300252 0.
  0.30300252 0.30300252 0.23044123 0.         0.         0.30300252
  0.         0.         0.30300252 0.23044123 0.         0.23044123
  0.         0.         0.         0.30300252 0.23044123]
 [0.28969526 0.         0.         0.         0.         0.38091445
  0.         0.         0.28969526 0.         0.         0.
  0.38091445 0.38091445 0.         0.28969526 0.         0.28969526
  0.         0.38091445 0.         0.         0.28969526]
 [0.         0.         0.         0.40824829 0.         0.
  0.         0.         0.         0.40824829 0.40824829 0.
  0.         0.         0.         0.         0.40824829 0.
  0.40824829 0.         0.40824829 0.         0.        ]]


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF rows (one row per sentence).
cosine_sim_tfidf = cosine_similarity(X_tfidf)

# Show the heading and the similarity matrix on separate lines.
print("Cosine Similarity Matrix (TF-IDF):", cosine_sim_tfidf, sep="\n")

Cosine Similarity Matrix (TF-IDF):
[[1.         0.33378865 0.        ]
 [0.33378865 1.         0.        ]
 [0.         0.         1.        ]]
