In [1]:
!pip install gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import pandas as pd


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
# Toy corpus: three short sentences reused by the vectorisation cells.
docs = [
    "I love NLP",
    "NLP loves me",
    "I enjoy learning about NLP and AI",
]

# Echo the corpus with 1-based labels (Doc1, Doc2, ...), one document per line.
print("Documents:")
print("\n".join(f"Doc{idx}: {text}" for idx, text in enumerate(docs, start=1)))


Documents:
Doc1: I love NLP
Doc2: NLP loves me
Doc3: I enjoy learning about NLP and AI


In [3]:
# Bag-of-Words: learn a vocabulary from the corpus and count how often each
# term occurs in each document. CountVectorizer runs with its default settings;
# as the printed columns show, the vocabulary is lowercased and contains no
# entry for the one-letter word "I".
count_vec = CountVectorizer()
bow = count_vec.fit_transform(docs)

# Wrap the dense count matrix in a DataFrame whose columns are the learned
# terms, so each row reads as one document.
bow_vocab = count_vec.get_feature_names_out()
bow_df = pd.DataFrame(data=bow.toarray(), columns=bow_vocab)

print("Bag-of-Words (Count Occurrence):")
print(bow_df)


Bag-of-Words (Count Occurrence):
   about  ai  and  enjoy  learning  love  loves  me  nlp
0      0   0    0      0         0     1      0   0    1
1      0   0    0      0         0     0      1   1    1
2      1   1    1      1         1     0      0   0    1


In [4]:
# Convert raw counts into per-document relative frequencies:
# every row is divided by that row's own total, so each row sums to 1.
row_totals = bow_df.sum(axis=1)
normalized_bow_df = bow_df.div(row_totals, axis=0)

print("Normalized Bag-of-Words:")
print(normalized_bow_df)


Normalized Bag-of-Words:
      about        ai       and     enjoy  learning  love     loves        me  \
0  0.000000  0.000000  0.000000  0.000000  0.000000   0.5  0.000000  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.000000   0.0  0.333333  0.333333   
2  0.166667  0.166667  0.166667  0.166667  0.166667   0.0  0.000000  0.000000   

        nlp  
0  0.500000  
1  0.333333  
2  0.166667  


In [5]:
# TF-IDF: same corpus, but each term is weighted by TfidfVectorizer (default
# settings) instead of being counted.
tfidf_vec = TfidfVectorizer()
tfidf = tfidf_vec.fit_transform(docs)

# Label the dense weight matrix with the learned terms: one row per document,
# one column per term.
tfidf_vocab = tfidf_vec.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf.toarray(), columns=tfidf_vocab)

print("TF-IDF:")
print(tfidf_df)


TF-IDF:
      about        ai       and     enjoy  learning      love     loves  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.861037  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.652491   
2  0.432385  0.432385  0.432385  0.432385  0.432385  0.000000  0.000000   

         me       nlp  
0  0.000000  0.508542  
1  0.652491  0.385372  
2  0.000000  0.255374  


In [6]:
# Tokenise each document on whitespace. Unlike the vectorizer outputs earlier
# in this notebook, this keeps the original casing and one-letter tokens, so
# the lookup key used for the model is 'NLP' (not 'nlp').
sentences = [doc.split() for doc in docs]

# Train a skip-gram (sg=1) Word2Vec model with 50-dimensional vectors, a context
# window of 3 tokens, and min_count=1 so that no word of this tiny corpus is
# dropped from the vocabulary.
# Reproducibility: the seed is stated explicitly (1 is gensim's default) and
# training is limited to a single worker thread, which the gensim docs require
# for deterministic runs because thread scheduling can reorder updates.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for cross-interpreter
# reproducibility - confirm whether that still applies to the installed version.
w2v_model = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

print("Embedding for 'NLP':")
print(w2v_model.wv['NLP'])


Embedding for 'NLP':
[-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.2917745e-02  1.7945977e-02
 -1.0030856e-02 -7.5267432e-03  1.4761009e-02 -3.0669428e-03
 -9.0732267e-03  1.3108104e-02 -9.7203208e-03 -3.6320353e-03
  5.7531595e-03  1.9837476e-03 -1.6570430e-02 -1.8897636e-02
  1.4623532e-02  1.0140524e-02  1.3515387e-02  1.5257311e-03
  1.2701781e-02 -6.8107317e-03 -1.8928028e-03  1.1537147e-02
 -1.5043275e-02 -7.8722071e-03 -1.5023164e-02 -1.8600845e-03
  1.9076237e-02 -1.4638334e-02 -4.6675373e-03 -3.8754821e-03
  1.6154874e-02 -1.1861792e-02  9.0324880e-05 -9.5074680e-03
 -1.9207101e-02  1.0014586e-02 -1.7519170e-02 -8.7836506e-03
 -7.0199967e-05 -5.9236289e-04 -1.5322480e-02  1.9229487e-02
  9.9641159e-03  1.8466286e-02]


In [7]:
# Rank the other vocabulary words by similarity to 'NLP'. The result is a list
# of (word, similarity score) pairs, highest score first.
# NOTE(review): the scores recorded for this three-sentence corpus are all close
# to zero, so the ranking is presumably illustrative rather than meaningful.
query_word = 'NLP'
nearest_words = w2v_model.wv.most_similar(query_word)

print("Words most similar to 'NLP':")
print(nearest_words)


Words most similar to 'NLP':
[('me', 0.13204392790794373), ('AI', 0.126700758934021), ('love', 0.0998455360531807), ('I', 0.042373016476631165), ('learning', 0.012442179024219513), ('and', -0.01447527389973402), ('loves', -0.0560765340924263), ('enjoy', -0.05974648892879486), ('about', -0.11821366101503372)]
