Apply the bag-of-words approach (raw count occurrence and normalized count occurrence) and
TF-IDF to the data, then create word embeddings using Word2Vec.

In [None]:
!pip install nltk gensim scikit-learn



In [None]:
import nltk

# Fetch the tokenizer data used by nltk.tokenize.word_tokenize up front.
# 'punkt' is the package this cell always fetched; 'punkt_tab' is the variant the
# Word2Vec section downloads right before tokenizing (presumably the form newer
# NLTK releases look for), so both are fetched here in the setup step.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Toy corpus shared by every section below: one short sentence per document.
documents = [
    "The sun rises in the east",
    "The sun sets in the west",
    "It is the most important natural resource",
    "Life of earth would not have been possible without the sun",
]

# Show the corpus as a 1-based numbered list.
print("Dataset:")
print("\n".join(f"{idx}. {text}" for idx, text in enumerate(documents, start=1)))

Dataset:
1. The sun rises in the east
2. The sun sets in the west
3. It is the most important natural resource
4. Life of earth would not have been possible without the sun


# Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and build the document-term count matrix:
# one row per document, one column per vocabulary term.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)

# Column order of the matrix follows the order of the printed vocabulary.
print("\nVocabulary:", vectorizer.get_feature_names_out(), sep="\n")

# Densify only for display; bow_matrix itself stays sparse for later cells.
print("\nBag of Words Matrix (Count Occurrence):", bow_matrix.toarray(), sep="\n")


Vocabulary:
['been' 'earth' 'east' 'have' 'important' 'in' 'is' 'it' 'life' 'most'
 'natural' 'not' 'of' 'possible' 'resource' 'rises' 'sets' 'sun' 'the'
 'west' 'without' 'would']

Bag of Words Matrix (Count Occurrence):
[[0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 2 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 2 1 0 0]
 [0 0 0 0 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0]
 [1 1 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0 1 1]]


# Normalized Bag of Words

In [None]:
from sklearn.preprocessing import normalize

# L1-normalize each document row so its term counts sum to 1, turning raw
# counts into relative frequencies within the document.
normalized_bow = normalize(bow_matrix, norm='l1', axis=1)

# Densify only for display.
print("\nNormalized Bag of Words Matrix:", normalized_bow.toarray(), sep="\n")


Normalized Bag of Words Matrix:
[[0.         0.         0.16666667 0.         0.         0.16666667
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.16666667 0.         0.16666667
  0.33333333 0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.16666667
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.16666667 0.16666667
  0.33333333 0.16666667 0.         0.        ]
 [0.         0.         0.         0.         0.14285714 0.
  0.14285714 0.14285714 0.         0.14285714 0.14285714 0.
  0.         0.         0.14285714 0.         0.         0.
  0.14285714 0.         0.         0.        ]
 [0.09090909 0.09090909 0.         0.09090909 0.         0.
  0.         0.         0.09090909 0.         0.         0.09090909
  0.09090909 0.09090909 0.         0.         0.         0.09090909
  0.09090909 0.         0.09090909 0.09090909]]


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the corpus and transform it in one step; the result
# has one row per document and one column per vocabulary term.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Column order of the matrix follows the order of the printed vocabulary.
print("\nTF-IDF Vocabulary:", tfidf_vectorizer.get_feature_names_out(), sep="\n")

# Densify only for display; tfidf_matrix itself stays sparse.
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")


TF-IDF Vocabulary:
['been' 'earth' 'east' 'have' 'important' 'in' 'is' 'it' 'life' 'most'
 'natural' 'not' 'of' 'possible' 'resource' 'rises' 'sets' 'sun' 'the'
 'west' 'without' 'would']

TF-IDF Matrix:
[[0.         0.         0.49276768 0.         0.         0.3885037
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.49276768 0.         0.31452723
  0.51429323 0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.3885037
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.49276768 0.31452723
  0.51429323 0.49276768 0.         0.        ]
 [0.         0.         0.         0.         0.39928771 0.
  0.39928771 0.39928771 0.         0.39928771 0.39928771 0.
  0.         0.         0.39928771 0.         0.         0.
  0.20836489 0.         0.         0.        ]
 [0.32141667 0.32141667 0.         0.32141667 0.         0.
  0.         0.         

# Word2Vec Embeddings

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Tokenizer data used by word_tokenize below; fetched here so this cell works
# even when no earlier cell has downloaded it.
nltk.download('punkt_tab')

# Tokenize sentences (lower-cased first so "The" and "the" map to one token)
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

# Train Word2Vec model.
# - min_count=1 keeps every token; most words in this tiny corpus occur once.
# - seed is pinned and workers=1 so repeated runs give the same embeddings:
#   multi-threaded training depends on OS thread scheduling and is therefore
#   not reproducible. NOTE(review): gensim's docs also mention PYTHONHASHSEED
#   for reproducibility across interpreter launches — set it if exact vectors
#   must match between sessions.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

print("\nVocabulary in Word2Vec:")
print(w2v_model.wv.index_to_key)


Vocabulary in Word2Vec:
['the', 'sun', 'in', 'without', 'possible', 'been', 'have', 'not', 'would', 'earth', 'of', 'life', 'resource', 'natural', 'important', 'most', 'is', 'it', 'west', 'sets', 'east', 'rises']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Look up the learned embedding for a single vocabulary word.
word = "sun"
vector = w2v_model.wv[word]

# Two-argument print: header, a separating space, then the array itself.
embedding_header = f"\nWord2Vec Embedding for '{word}':\n"
print(embedding_header, vector)


Word2Vec Embedding for 'sun':
 [-8.6212046e-03  3.6664882e-03  5.1911823e-03  5.7407771e-03
  7.4679959e-03 -6.1686146e-03  1.1053376e-03  6.0487166e-03
 -2.8406929e-03 -6.1742957e-03 -4.1213547e-04 -8.3713057e-03
 -5.6016780e-03  7.1048136e-03  3.3528409e-03  7.2255796e-03
  6.7999526e-03  7.5316946e-03 -3.7881434e-03 -5.6530442e-04
  2.3505895e-03 -4.5175911e-03  8.3898595e-03 -9.8570567e-03
  6.7651444e-03  2.9169703e-03 -4.9345442e-03  4.3962048e-03
 -1.7400645e-03  6.7118951e-03  9.9650640e-03 -4.3621953e-03
 -5.9824984e-04 -5.6966757e-03  3.8516975e-03  2.7874780e-03
  6.8893526e-03  6.1014909e-03  9.5378626e-03  9.2714839e-03
  7.8986809e-03 -6.9913771e-03 -9.1560846e-03 -3.5371701e-04
 -3.0998415e-03  7.8926133e-03  5.9367991e-03 -1.5476035e-03
  1.5121385e-03  1.7903202e-03  7.8177862e-03 -9.5107406e-03
 -2.0507160e-04  3.4692646e-03 -9.3755248e-04  8.3820252e-03
  9.0114186e-03  6.5351953e-03 -7.1143097e-04  7.7126590e-03
 -8.5349493e-03  3.2068391e-03 -4.6372004e-03 -5.0872

In [None]:
# Nearest neighbours of "sun" in the embedding space, as (word, score) pairs.
# With a corpus this small the scores are close to noise; this only
# demonstrates the API.
similar_words = w2v_model.wv.most_similar("sun")

print("\nWords similar to 'sun':")
for neighbor, similarity in similar_words:
    print(neighbor, ":", similarity)


Words similar to 'sun':
important : 0.16075554490089417
west : 0.1592675745487213
natural : 0.13736103475093842
most : 0.12309923022985458
it : 0.08568026125431061
without : 0.06793666630983353
would : 0.03365974500775337
is : 0.022437766194343567
been : 0.009397289715707302
life : 0.00835457257926464
