### One Hot Encoding

In [1]:
corpus = ["I love NLP","I teach GenAI","I am working with euron"]

In [2]:
vocabulary = list(set(" ".join(corpus).split()))

In [3]:
vocabulary

['am', 'with', 'NLP', 'teach', 'love', 'working', 'GenAI', 'euron', 'I']

In [9]:
word_to_index = {}
for i,word in enumerate(vocabulary):
    word_to_index[word]=i

In [10]:
word_to_index

{'am': 0,
 'with': 1,
 'NLP': 2,
 'teach': 3,
 'love': 4,
 'working': 5,
 'GenAI': 6,
 'euron': 7,
 'I': 8}

In [12]:
one_hot_vector = []

for sentence in corpus:
    sentence_vector = []
    for word in sentence.split():
        vector = [0]*len(vocabulary)
        vector[word_to_index[word]] = 1
        sentence_vector.append(vector)
    one_hot_vector.append(sentence_vector)

one_hot_vector

[[[0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 0, 0, 1, 0, 0, 0, 0],
  [0, 0, 1, 0, 0, 0, 0, 0, 0]],
 [[0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 0, 1, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 1, 0, 0]],
 [[0, 0, 0, 0, 0, 0, 0, 0, 1],
  [1, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 1, 0]]]

### Bag of Words

In [14]:
!pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/a1/a6/c5b78606743a1f28eae8f11973de6613a5ee87366796583fb74c67d54939/scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting numpy>=1.19.5 (from scikit-learn)
  Obtaining dependency information for numpy>=1.19.5 from https://files.pythonhosted.org/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata
  Downloading numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.8 kB 320.0 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/60.8 kB 518.5 kB/s eta 0:00:01
     -------------------------------------- 60.8/60


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
corpus

['I love NLP', 'I teach GenAI', 'I am working with euron']

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [35]:
X.toarray()

array([[0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 1, 1]])

In [36]:
vectorizer.get_feature_names_out()

array(['am', 'euron', 'genai', 'love', 'nlp', 'teach', 'with', 'working'],
      dtype=object)

### TF-IDF (Term Frequency -  Inverse Document Frequency)

 Term Frequency = (Number of times word repeated in Document)/(Total number of words in Document)

 Inverse Document Frequency = log(Total number of documents/No of documents with that word)

 

In [37]:
corpus

['I love NLP', 'I teach GenAI', 'I am working with euron']

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect_tf_idf = TfidfVectorizer()



In [39]:
Y = vect_tf_idf.fit_transform(corpus)

In [40]:
Y.toarray()

array([[0.        , 0.        , 0.        , 0.70710678, 0.70710678,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.70710678, 0.        , 0.        ],
       [0.5       , 0.5       , 0.        , 0.        , 0.        ,
        0.        , 0.5       , 0.5       ]])