In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
!pip install gensim
from gensim.models import Word2Vec

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# Sample Data
# Toy corpus: three short, lowercase sentences. Every vectorizer in this
# notebook is fitted on exactly this list, one document per entry.
data: list[str] = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are great pets",
]

In [None]:
# --- 1. Bag of Words (Count Occurrence) ---
# Learn the vocabulary from the corpus and count term occurrences per document.
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(data)

# Convert the count matrix to a dense array and label each column with its
# vocabulary term so the table is readable: one row per sentence, one column
# per term.
df_bow = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

In [None]:
# --- 2. TF-IDF ---
# Same corpus as the Bag-of-Words cell, but each term is weighted by TF-IDF
# instead of a raw count.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data)

# Dense, labelled view of the weights: one row per sentence, one column per term.
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [None]:
# --- 3. Word2Vec (Embeddings) ---
# Word2Vec requires tokenized sentences: split each sentence on whitespace.
tokenized_data = [sentence.split() for sentence in data]

# Word2Vec training is stochastic. With several worker threads the order of
# updates depends on OS thread scheduling, so the learned vectors can differ
# between runs even with a fixed seed. Use a single worker and an explicit
# seed (1 is gensim's default, so the seeding itself is unchanged) to make the
# embeddings repeatable under Restart & Run All.
# NOTE: full determinism across interpreter launches on Python 3 additionally
# requires PYTHONHASHSEED to be set before the kernel starts.
w2v_model = Word2Vec(
    sentences=tokenized_data,
    vector_size=10,  # tiny embedding size; enough for a three-sentence demo
    window=5,
    min_count=1,     # keep every word, even those that appear only once
    seed=1,
    workers=1,
)

In [None]:
# Fetch the trained embedding for a single vocabulary word from the model's
# keyed vectors.
vector_cat = w2v_model.wv.get_vector('cat')

In [None]:
# --- Display Results ---
# Each call prints a section header followed by its value on the next line
# (sep="\n" puts the two arguments on separate lines).

# Raw term counts per sentence.
print("--- Bag of Words (Count) ---", df_bow, sep="\n")

# TF-IDF weights, rounded to three decimals for readability.
print("\n--- TF-IDF ---", df_tfidf.round(3), sep="\n")

# Only the first five components of the 'cat' embedding are shown.
print("\n--- Word2Vec Vector for 'cat' (First 5 dims) ---", vector_cat[:5], sep="\n")

--- Bag of Words (Count) ---
   and  are  cat  cats  dog  dogs  great  log  mat  on  pets  sat  the
0    0    0    1     0    0     0      0    0    1   1     0    1    2
1    0    0    0     0    1     0      0    1    0   1     0    1    2
2    1    1    0     1    0     1      1    0    0   0     1    0    0

--- TF-IDF ---
     and    are    cat   cats    dog   dogs  great    log    mat     on  \
0  0.000  0.000  0.428  0.000  0.000  0.000  0.000  0.000  0.428  0.325   
1  0.000  0.000  0.000  0.000  0.428  0.000  0.000  0.428  0.000  0.325   
2  0.408  0.408  0.000  0.408  0.000  0.408  0.408  0.000  0.000  0.000   

    pets    sat   the  
0  0.000  0.325  0.65  
1  0.000  0.325  0.65  
2  0.408  0.000  0.00  

--- Word2Vec Vector for 'cat' (First 5 dims) ---
[ 0.02348376 -0.04519032  0.08388732 -0.09858163  0.06764641]
