In [2]:
from collections import defaultdict # specialized dictionary
import string  # functions for working with strings

# Toy corpus used throughout the notebook: each string is one document.
# Replace these strings with your own text to experiment.
corpus = [
    "The Gen AI and LLM class is amazing and fun. I love it already!",
    "LLMs are fun and powerful.",
    "Learning about artificial intelligence is super useful for my career!",
]


In [3]:

def preprocess(text):
    """Normalize a raw string and split it into word tokens.

    The text is lowercased, every character listed in ``string.punctuation``
    is deleted (not replaced by a space), and the remainder is split on runs
    of whitespace. One word equals one token.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase tokens with ASCII punctuation removed.
    """
    # The third argument of maketrans lists the characters to delete;
    # punctuation is not needed for building the vocab or counting words.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return text.lower().translate(strip_punctuation).split()

# Tokenize every document in the corpus with the same preprocessing steps.
proc_corpus = list(map(preprocess, corpus))
print(proc_corpus)

[['the', 'gen', 'ai', 'and', 'llm', 'class', 'is', 'amazing', 'and', 'fun', 'i', 'love', 'it', 'already'], ['llms', 'are', 'fun', 'and', 'powerful'], ['learning', 'about', 'artificial', 'intelligence', 'is', 'super', 'useful', 'for', 'my', 'career']]


In [4]:
# Build vocabulary: collect every distinct token across all documents, then
# sort alphabetically so each word gets a stable position in the list.
vocab = sorted(set().union(*proc_corpus))
print("Vocab:", vocab)
print("Vocab length is", len(vocab))

Vocab: ['about', 'ai', 'already', 'amazing', 'and', 'are', 'artificial', 'career', 'class', 'for', 'fun', 'gen', 'i', 'intelligence', 'is', 'it', 'learning', 'llm', 'llms', 'love', 'my', 'powerful', 'super', 'the', 'useful']
Vocab length is 25


In [5]:
# Calculate Word Frequencies and Vectorize
def create_bow(sentence, vocab):
    """Return the bag-of-words count vector of a tokenized sentence.

    Args:
        sentence: Iterable of tokens (e.g. the output of ``preprocess``).
        vocab: Sequence of vocabulary words; position ``i`` of the result
            counts the occurrences of ``vocab[i]`` in ``sentence``.

    Returns:
        A list of ints with ``len(vocab)`` entries. Tokens that are not in
        ``vocab`` are ignored.
    """
    # Map each word to its position once, instead of scanning the vocab list
    # twice per token (``in`` followed by ``.index``). ``setdefault`` keeps
    # the first position if a word is repeated in ``vocab``, which matches
    # what ``list.index`` would have returned.
    index_of = {}
    for position, word in enumerate(vocab):
        index_of.setdefault(word, position)

    vector = [0] * len(vocab)  # one zero-initialized slot per vocab word
    for word in sentence:
        position = index_of.get(word)
        if position is not None:
            vector[position] += 1
    return vector

# Create BOW vector for each sentence in processed corpus
bow_vectors = [create_bow(sent, vocab) for sent in proc_corpus]
print("BOW Vectors:")
for vector in bow_vectors:
    print(vector)
# Every vector has one slot per vocabulary word, so report len(vocab) instead
# of relying on the loop variable, which is undefined when the corpus is empty.
print("Vector length is", len(vocab))

BOW Vectors:
[0, 1, 1, 1, 2, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1]
Vector length is 25


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Let scikit-learn learn the vocabulary and build the document-term count
# matrix in one call; X is sparse, so densify it only for display.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print("Vocabulary:", vectorizer.get_feature_names_out())
print("BoW Representation:", X.toarray(), sep="\n")
# Number of columns == number of vocabulary terms == length of each row vector.
print("Vector length is", X.shape[1])

Vocabulary: ['about' 'ai' 'already' 'amazing' 'and' 'are' 'artificial' 'career'
 'class' 'for' 'fun' 'gen' 'intelligence' 'is' 'it' 'learning' 'llm'
 'llms' 'love' 'my' 'powerful' 'super' 'the' 'useful']
BoW Representation:
[[0 1 1 1 2 0 0 0 1 0 1 1 0 1 1 0 1 0 1 0 0 0 1 0]
 [0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0]
 [1 0 0 0 0 0 1 1 0 1 0 0 1 1 0 1 0 0 0 1 0 1 0 1]]
Vector length is 24


In [7]:
# Compare vocabularies
vocab_manual = set(vocab)
vocab_sklearn = set(vectorizer.get_feature_names_out())

# Sort the difference: iteration order of a set of strings can vary between
# kernel runs, so a bare list() would make this cell's output non-reproducible.
# NOTE: CountVectorizer's default token_pattern only keeps tokens of two or
# more word characters, which is why single-letter words can show up here.
missing_in_sklearn = sorted(vocab_manual - vocab_sklearn)

print("Tokens present in the manual vocabulary but missing in the scikit-learn vocabulary:")
print(missing_in_sklearn)

Tokens present in the manual vocabulary but missing in the scikit-learn vocabulary:
['i']


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same workflow as the count vectorizer above, but each term count is
# re-weighted by TF-IDF before being returned.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Representation:", X_tfidf.toarray(), sep="\n")

Vocabulary: ['about' 'ai' 'already' 'amazing' 'and' 'are' 'artificial' 'career'
 'class' 'for' 'fun' 'gen' 'intelligence' 'is' 'it' 'learning' 'llm'
 'llms' 'love' 'my' 'powerful' 'super' 'the' 'useful']
TF-IDF Representation:
[[0.         0.28317823 0.28317823 0.28317823 0.43072869 0.
  0.         0.         0.28317823 0.         0.21536434 0.28317823
  0.         0.21536434 0.28317823 0.         0.28317823 0.
  0.28317823 0.         0.         0.         0.28317823 0.        ]
 [0.         0.         0.         0.         0.37302199 0.49047908
  0.         0.         0.         0.         0.37302199 0.
  0.         0.         0.         0.         0.         0.49047908
  0.         0.         0.49047908 0.         0.         0.        ]
 [0.32311233 0.         0.         0.         0.         0.
  0.32311233 0.32311233 0.         0.32311233 0.         0.
  0.32311233 0.24573525 0.         0.32311233 0.         0.
  0.         0.32311233 0.         0.32311233 0.         0.32311233]]


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the hand-built BOW count vectors;
# entry [i][j] compares document i with document j.
cosine_sim_bow = cosine_similarity(bow_vectors)
print("Cosine Similarity Matrix (BOW):", cosine_sim_bow, sep="\n")

Cosine Similarity Matrix (BOW):
[[1.         0.3354102  0.07905694]
 [0.3354102  1.         0.        ]
 [0.07905694 0.         1.        ]]


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF document vectors;
# entry [i][j] compares document i with document j.
cosine_sim_tfidf = cosine_similarity(X_tfidf)
print("Cosine Similarity Matrix (TF-IDF):", cosine_sim_tfidf, sep="\n")

Cosine Similarity Matrix (TF-IDF):
[[1.         0.24100691 0.05292261]
 [0.24100691 1.         0.        ]
 [0.05292261 0.         1.        ]]
