Practical No: 2

Perform the bag-of-words approach (count occurrence and
normalized count occurrence) and TF-IDF on the data. Create
embeddings using Word2Vec.

In [3]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

In [5]:
# Toy corpus of four short documents; every vectorizer and the Word2Vec model
# below are fitted on this list.
# The first and last documents use the same five words (they differ only in
# word order, capitalization and punctuation).
data = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

Bag-of-Words Approach: Count Occurrence

In [6]:
# Bag-of-words: learn the vocabulary from the corpus, then count how often
# each vocabulary term occurs in each document.
count_vectorizer = CountVectorizer().fit(data)
count_matrix = count_vectorizer.transform(data)

# Dense view of the counts: one row per document, one column per term, with
# columns in the order given by count_feature_names.
count_array = count_matrix.toarray()
count_feature_names = count_vectorizer.get_feature_names_out()

In [7]:
# Show the heading, the vocabulary, and the raw count matrix, one per line.
print("Count Occurrence:", count_feature_names, count_array, sep="\n")

Count Occurrence:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


Bag-of-Words Approach: Normalized Count Occurrence

In [8]:
# Normalized bag-of-words: divide each document's counts by that document's
# total token count so every non-empty row sums to 1.
row_totals = count_array.sum(axis=1, keepdims=True)

# A document with no in-vocabulary tokens has a total of 0; dividing by it
# would produce NaN (and a RuntimeWarning). Counts are non-negative integers,
# so clipping the divisor to a minimum of 1 only affects those empty rows and
# leaves them as all zeros.
normalized_count_array = count_array / row_totals.clip(min=1)

print("\nNormalized Count Occurrence:")
print(normalized_count_array)


Normalized Count Occurrence:
[[0.         0.2        0.2        0.2        0.         0.
  0.2        0.         0.2       ]
 [0.         0.33333333 0.         0.16666667 0.         0.16666667
  0.16666667 0.         0.16666667]
 [0.16666667 0.         0.         0.16666667 0.16666667 0.
  0.16666667 0.16666667 0.16666667]
 [0.         0.2        0.2        0.2        0.         0.
  0.2        0.         0.2       ]]


TF-IDF: Term Frequency - Inverse Document Frequency

In [10]:
# TF-IDF with the vectorizer's default settings: fit on the corpus and
# transform it in a single step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data)

# Dense weights (one row per document) and the term that labels each column.
tfidf_array, tfidf_feature_names = (
    tfidf_matrix.toarray(),
    tfidf_vectorizer.get_feature_names_out(),
)

In [11]:
# Show the heading, the vocabulary, and the TF-IDF weight matrix, one per line.
print("\nTF-IDF:", tfidf_feature_names, tfidf_array, sep="\n")


TF-IDF:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


Word Embeddings using Word2Vec

In [13]:
# Lower-case each document and split it into tokens with NLTK.
# NOTE(review): word_tokenize needs NLTK's tokenizer data (e.g. "punkt") to be
# available locally -- confirm it is downloaded before running on a fresh
# environment.
tokenized_data = [word_tokenize(sentence.lower()) for sentence in data]

# Train a 100-dimensional Word2Vec model on the tokenized corpus.
# - min_count=1 keeps every token in the vocabulary, which the lookup below
#   relies on (it indexes the model with every token in the corpus).
# - workers=1 with an explicit seed makes training repeatable: with several
#   worker threads the order of updates depends on OS thread scheduling.
#   seed=1 is gensim's default, stated explicitly here.
# NOTE(review): gensim's docs also mention fixing PYTHONHASHSEED for
# reproducibility across interpreter launches -- confirm if that matters here.
word2vec_model = Word2Vec(
    sentences=tokenized_data,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

# One vector per token occurrence, in corpus order; a token that appears
# several times contributes its vector once per appearance.
word_vectors = [word2vec_model.wv[word] for sentence in tokenized_data for word in sentence]

In [14]:
# Show the heading followed by the full list of per-token embedding vectors.
print("\nWord Embeddings using Word2Vec:", word_vectors, sep="\n")


Word Embeddings using Word2Vec:
[array([-8.2432879e-03,  9.2997188e-03, -1.9789736e-04, -1.9665232e-03,
        4.6039713e-03, -4.0952349e-03,  2.7434404e-03,  6.9401748e-03,
        6.0646827e-03, -7.5105582e-03,  9.3826018e-03,  4.6713995e-03,
        3.9668484e-03, -6.2435791e-03,  8.4605524e-03, -2.1501766e-03,
        8.8252556e-03, -5.3621130e-03, -8.1300512e-03,  6.8239011e-03,
        1.6711351e-03, -2.1979925e-03,  9.5138699e-03,  9.4933100e-03,
       -9.7733382e-03,  2.5049641e-03,  6.1566979e-03,  3.8725948e-03,
        2.0224224e-03,  4.2999926e-04,  6.7419501e-04, -3.8210566e-03,
       -7.1399929e-03, -2.0883142e-03,  3.9232722e-03,  8.8190036e-03,
        9.2592677e-03, -5.9755249e-03, -9.4025033e-03,  9.7639700e-03,
        3.4294038e-03,  5.1667099e-03,  6.2821540e-03, -2.8043089e-03,
        7.3228502e-03,  2.8303931e-03,  2.8708240e-03, -2.3799157e-03,
       -3.1284206e-03, -2.3699815e-03,  4.2763930e-03,  7.6372569e-05,
       -9.5847175e-03, -9.6658710e-03, -6.1