In [1]:
# %pip install scikit-learn gensim


In [2]:
# -------------------------------
# FEATURE EXTRACTION FOR ML
# -------------------------------

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
from gensim.models import Word2Vec

# -------------------------------
# 1. Load Data
# -------------------------------
# Toy corpus: three short sentences held in a single 'text' column.
data = pd.Series(
    [
        "Natural Language Processing is amazing",
        "Text preprocessing cleans and prepares data",
        "Feature extraction turns text into numbers",
    ],
    name='text',
).to_frame()


In [3]:
# -------------------------------
# 2. Bag of Words
# -------------------------------
# Fit a CountVectorizer on the corpus and get the document-term matrix,
# then wrap it in a DataFrame with one column per vocabulary term.
cv = CountVectorizer()
bow = cv.fit_transform(raw_documents=data['text'])
bow_df = pd.DataFrame(
    data=bow.toarray(),                  # densify the sparse matrix for display
    columns=cv.get_feature_names_out(),  # term labels from the fitted vectorizer
)
print("Bag of Words:\n", bow_df)


Bag of Words:
    amazing  and  cleans  data  extraction  feature  into  is  language  \
0        1    0       0     0           0        0     0   1         1   
1        0    1       1     1           0        0     0   0         0   
2        0    0       0     0           1        1     1   0         0   

   natural  numbers  prepares  preprocessing  processing  text  turns  
0        1        0         0              0           1     0      0  
1        0        0         1              1           0     1      0  
2        0        1         0              0           0     1      1  


In [4]:
# -------------------------------
# 3. TF-IDF
# -------------------------------
# Fit a TfidfVectorizer on the same corpus and show the weighted
# document-term matrix as a DataFrame labelled by vocabulary term.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(raw_documents=data['text'])
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),            # densify the sparse matrix for display
    columns=tfidf.get_feature_names_out(),  # term labels from the fitted vectorizer
)
print("TF-IDF:\n", tfidf_df)

TF-IDF:
     amazing       and    cleans      data  extraction   feature      into  \
0  0.447214  0.000000  0.000000  0.000000    0.000000  0.000000  0.000000   
1  0.000000  0.423394  0.423394  0.423394    0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.000000  0.000000    0.423394  0.423394  0.423394   

         is  language   natural   numbers  prepares  preprocessing  \
0  0.447214  0.447214  0.447214  0.000000  0.000000       0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.423394       0.423394   
2  0.000000  0.000000  0.000000  0.423394  0.000000       0.000000   

   processing      text     turns  
0    0.447214  0.000000  0.000000  
1    0.000000  0.322002  0.000000  
2    0.000000  0.322002  0.423394  


In [5]:
# -------------------------------
# 4. Word2Vec
# -------------------------------
# Lower-case and whitespace-split each sentence into a list of tokens.
tokenized_data = [sentence.lower().split() for sentence in data['text']]

# Word2Vec training is stochastic. An explicit seed plus a single worker
# thread makes the printed vector repeatable on re-run; multi-threaded
# training introduces ordering jitter from OS thread scheduling.
# NOTE: reproducibility across interpreter launches additionally requires
# setting the PYTHONHASHSEED environment variable before starting the kernel.
w2v_model = Word2Vec(
    sentences=tokenized_data,
    vector_size=50,   # dimensionality of each word vector
    window=3,         # context window size
    min_count=1,      # keep every token; the corpus is tiny
    workers=1,        # single thread for deterministic training
    seed=42,          # explicit RNG seed
)
print("Vector for 'text':", w2v_model.wv['text'])

Vector for 'text': [-1.07247464e-03  4.72871558e-04  1.02068903e-02  1.80188827e-02
 -1.86062474e-02 -1.42338844e-02  1.29179871e-02  1.79463141e-02
 -1.00310445e-02 -7.52688432e-03  1.47612859e-02 -3.06700030e-03
 -9.07339714e-03  1.31083494e-02 -9.72050335e-03 -3.63210333e-03
  5.75326756e-03  1.98378484e-03 -1.65707413e-02 -1.88979898e-02
  1.46238059e-02  1.01407142e-02  1.35156401e-02  1.52575970e-03
  1.27020190e-02 -6.81085931e-03 -1.89283828e-03  1.15373628e-02
 -1.50435576e-02 -7.87235424e-03 -1.50234457e-02 -1.86011929e-03
  1.90765951e-02 -1.46386083e-02 -4.66762483e-03 -3.87555477e-03
  1.61551777e-02 -1.18620144e-02  9.03265754e-05 -9.50764585e-03
 -1.92074608e-02  1.00147743e-02 -1.75194982e-02 -8.78381543e-03
 -7.02012840e-05 -5.92374010e-04 -1.53227672e-02  1.92298461e-02
  9.96430311e-03  1.84666328e-02]
