In [1]:
# Install the third-party packages this notebook imports. The explicit `%pip`
# magic installs into the environment of the running kernel and does not rely
# on IPython's automagic being enabled, unlike a bare `pip install` line.
%pip install nltk scikit-learn gensim

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk

# `word_tokenize` needs the Punkt tokenizer data. Older NLTK releases load the
# "punkt" package, while recent releases (3.9 and later) look for "punkt_tab"
# instead and raise LookupError if it is missing, so fetch both. Packages that
# are already present are reported as up-to-date and not downloaded again.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to C:\Users\attar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from nltk.tokenize import word_tokenize
from gensim.models import word2vec
import numpy as np

In [4]:
# Toy corpus: three short English sentences. Every vectorization example in
# this notebook (tokenization, bag of words, TF-IDF) is fitted on this list.
documents = [
    "Natural language processing is fascinating.",
    "Language processing includes syntax and semantics.",
    "Text data needs preprocessing for NLP models."
]

In [18]:
# Lowercase every document and split it into NLTK word tokens; punctuation is
# kept as its own token. This cell must run before the Word2Vec cells, which
# consume `tokenized_docs`.
tokenized_docs = [word_tokenize(text.lower()) for text in documents]
tokenized_docs

[['natural', 'language', 'processing', 'is', 'fascinating', '.'],
 ['language', 'processing', 'includes', 'syntax', 'and', 'semantics', '.'],
 ['text', 'data', 'needs', 'preprocessing', 'for', 'nlp', 'models', '.']]

In [7]:
# 1. BAG OF WORDS - RAW COUNTS
# Fit a CountVectorizer (default settings) on the raw document strings and
# keep both the fitted vectorizer and the resulting sparse count matrix, which
# later cells reuse.
print("\n1. Bag of Words - CountVectorizer:")
count_vectorizer = CountVectorizer()
bow_count = count_vectorizer.fit_transform(documents)
# Printing the sparse matrix lists one "(row, column)  count" entry per
# non-zero cell rather than a dense table.
print(bow_count)


1. Bag of Words - CountVectorizer:
  (0, 8)	1
  (0, 6)	1
  (0, 12)	1
  (0, 5)	1
  (0, 2)	1
  (1, 6)	1
  (1, 12)	1
  (1, 4)	1
  (1, 14)	1
  (1, 0)	1
  (1, 13)	1
  (2, 15)	1
  (2, 1)	1
  (2, 9)	1
  (2, 11)	1
  (2, 3)	1
  (2, 10)	1
  (2, 7)	1


In [9]:
# `vocabulary_` maps each term to its column index in the count matrix.
print("Vocabulary:", count_vectorizer.vocabulary_)
# `.toarray()` densifies the sparse matrix: one row per document, one column per term.
print("BoW Matrix (Raw Counts):\n", bow_count.toarray())

Vocabulary: {'natural': 8, 'language': 6, 'processing': 12, 'is': 5, 'fascinating': 2, 'includes': 4, 'syntax': 14, 'and': 0, 'semantics': 13, 'text': 15, 'data': 1, 'needs': 9, 'preprocessing': 11, 'for': 3, 'nlp': 10, 'models': 7}
BoW Matrix (Raw Counts):
 [[0 0 1 0 0 1 1 0 1 0 0 0 1 0 0 0]
 [1 0 0 0 1 0 1 0 0 0 0 0 1 1 1 0]
 [0 1 0 1 0 0 0 1 0 1 1 1 0 0 0 1]]


In [10]:
# 2. BAG OF WORDS - NORMALIZED COUNTS (Term Frequency)
# Turn each document's raw counts into proportions, so every non-empty row sums to 1.
print("\n2. Normalized Count (Term Frequency):")
raw_counts = bow_count.toarray().astype(float)
# keepdims=True keeps the totals as a column vector so the division below
# broadcasts across each row.
row_totals = raw_counts.sum(axis=1, keepdims=True)
# A document with no in-vocabulary tokens has a total of 0. Dividing by 1
# instead leaves that row all zeros rather than producing NaN values and a
# runtime warning.
row_totals[row_totals == 0] = 1.0
normalized_bow = raw_counts / row_totals
print("BoW Matrix (Normalized):\n", normalized_bow)


2. Normalized Count (Term Frequency):
BoW Matrix (Normalized):
 [[0.         0.         0.2        0.         0.         0.2
  0.2        0.         0.2        0.         0.         0.
  0.2        0.         0.         0.        ]
 [0.16666667 0.         0.         0.         0.16666667 0.
  0.16666667 0.         0.         0.         0.         0.
  0.16666667 0.16666667 0.16666667 0.        ]
 [0.         0.14285714 0.         0.14285714 0.         0.
  0.         0.14285714 0.         0.14285714 0.14285714 0.14285714
  0.         0.         0.         0.14285714]]


In [11]:
# 3. TF-IDF
# Fit a TfidfVectorizer (default settings) on the same raw documents. In the
# printed matrix, terms shared by several documents ("language", "processing")
# receive lower weights than terms that occur in only one document.
print("\n3. TF-IDF Vectorizer:")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
# Term -> column index mapping, followed by the dense document-by-term weight matrix.
print("Vocabulary" , tfidf_vectorizer.vocabulary_)
print("Matrix ", tfidf_matrix.toarray())


3. TF-IDF Vectorizer:
Vocabulary {'natural': 8, 'language': 6, 'processing': 12, 'is': 5, 'fascinating': 2, 'includes': 4, 'syntax': 14, 'and': 0, 'semantics': 13, 'text': 15, 'data': 1, 'needs': 9, 'preprocessing': 11, 'for': 3, 'nlp': 10, 'models': 7}
Matrix  [[0.         0.         0.49047908 0.         0.         0.49047908
  0.37302199 0.         0.49047908 0.         0.         0.
  0.37302199 0.         0.         0.        ]
 [0.44036207 0.         0.         0.         0.44036207 0.
  0.3349067  0.         0.         0.         0.         0.
  0.3349067  0.44036207 0.44036207 0.        ]
 [0.         0.37796447 0.         0.37796447 0.         0.
  0.         0.37796447 0.         0.37796447 0.37796447 0.37796447
  0.         0.         0.         0.37796447]]


In [12]:
# 4. WORD2VEC EMBEDDINGS
# Section banner only; the model is trained and queried in the cells that follow.
print("\n4. Word2Vec Embeddings:")


4. Word2Vec Embeddings:


In [14]:
from gensim.models import Word2Vec

In [15]:
# Train a small Word2Vec model on the tokenized corpus.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=50,  # length of each word embedding
    window=2,        # maximum distance between a word and its context words
    min_count=1,     # keep every token, even those seen only once
    workers=1,       # train on a single worker thread
    sg=1,            # 1 selects the skip-gram training algorithm
)

In [16]:
# Look up a single word's embedding, reporting it cleanly when the word is
# not in the trained vocabulary.
word = "language"
if word not in w2v_model.wv:
    print(f"'{word}' not in vocabulary.")
else:
    print(f"Embedding for '{word}':\n", w2v_model.wv[word])

Embedding for 'language':
 [-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.01879702 -0.00315531  0.00064274
 -0.00828126 -0.01536538 -0.00301602  0.00493959 -0.00177605  0.01106732
 -0.00548595  0.00452013  0.01091159  0.01669191 -0.00290748 -0.01841629
  0.0087411   0.00114357  0.01488382 -0.00162657 -0.00527683 -0.01750602
 -0.00171311  0.00565313  0.01080286  0.01410531 -0.01140624  0.00371764
  0.01217773 -0.0095961  -0.00621452  0.01359526  0.00326295  0.00037983
  0.00694727  0.00043555  0.01923765  0.01012121 -0.01783478 -0.01408312
  0.00180291  0.01278507]


In [17]:
# Average the Word2Vec word vectors of each tokenized document into a single
# document-level vector.
print("\nAverage Word2Vec per Document:")
def document_vector(doc, model=None):
    """Return the mean word embedding of a tokenized document.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document. Tokens missing from the model's vocabulary
        are ignored.
    model : optional
        Object exposing `.wv` (supports `in` and list indexing) and
        `.vector_size`, such as a trained gensim Word2Vec model. Defaults to
        the notebook-level `w2v_model`, so existing one-argument calls keep
        working.

    Returns
    -------
    numpy.ndarray
        1-D array of length `model.vector_size`: the element-wise mean of the
        known tokens' vectors, or all zeros when no token is known.
    """
    if model is None:
        # Resolved at call time so the function can be defined before the
        # notebook-level model exists.
        model = w2v_model
    keyed_vectors = model.wv
    known_words = [word for word in doc if word in keyed_vectors]
    if not known_words:
        # float32 matches the precision of the embeddings printed in this
        # notebook, so empty and non-empty documents yield the same dtype.
        return np.zeros(model.vector_size, dtype=np.float32)
    # Indexing with a list of words yields one row per word; average over rows.
    return np.mean(keyed_vectors[known_words], axis=0)

# One averaged embedding per tokenized document, in corpus order.
avg_vectors = list(map(document_vector, tokenized_docs))
# Report each vector with a 1-based document number and its shape.
for i, vec in enumerate(avg_vectors):
    print(f"Doc {i+1} vector (shape {vec.shape}):\n", vec)


Average Word2Vec per Document:
Doc 1 vector (shape (50,)):
 [-0.01123268  0.00799814  0.00101659  0.00501263  0.00292931 -0.00828414
  0.0075939   0.00214159 -0.00620084 -0.00251304  0.00759771 -0.00477096
  0.0025906  -0.00076397  0.00258381  0.0053326   0.00630225  0.00407015
 -0.01239089  0.00063905  0.00727931  0.00712224  0.01227268 -0.00448857
  0.00267234  0.00440516  0.0049474   0.00259646 -0.00389557 -0.0045686
  0.00326978 -0.00684832  0.00150382  0.00051723  0.00027171  0.00343717
  0.00792728 -0.0035297   0.00311415  0.00406422 -0.00118404  0.00423477
  0.00030699 -0.00421173  0.00690528  0.00639761 -0.00042833  0.00110828
  0.00274217  0.00139764]
Doc 2 vector (shape (50,)):
 [-8.2321512e-03  4.2626252e-03 -5.2243657e-03 -2.0097645e-03
  7.0426852e-04 -1.7454776e-03  4.5792903e-03  7.4597066e-03
 -7.0715835e-03  4.9250026e-04 -2.9334484e-03 -1.5505851e-03
 -3.6388363e-03 -5.3067890e-04  3.0730970e-04 -4.7470122e-03
  1.1409502e-03  9.1627985e-03 -6.3393340e-03 -8.3525945e