# Basic Vectorization Approaches

## Text Representation Schemes

### One-Hot Encoding

In [1]:
count = 0
vocab = {}
processed_docs = [
    "It is a long stablished fact that is", 
    "Lorem Ipsum is simply"
    ]

for doc in processed_docs:
  for word in doc.split():
    if word not in vocab:
      count += 1
    vocab[word] = count

print(vocab) 
print(len(vocab))

def get_onehot_vector(somestring):
  onehot_encoded = []
  for word in somestring.split():
    temp = [0]*len(vocab)
    if word in vocab:
      temp[vocab[word]-1]=1 # -1 is to take care of the fact indexing in array starts from 0 and not 1
    onehot_encoded.append(temp)
  return onehot_encoded

get_onehot_vector(processed_docs[1])

{'It': 1, 'is': 9, 'a': 3, 'long': 4, 'stablished': 5, 'fact': 6, 'that': 7, 'Lorem': 8, 'Ipsum': 9, 'simply': 10}
10


[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]

### Bag of Words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

# Build a BOW representation for the corpus
bow_rep = count_vect.fit_transform(processed_docs)

# Look at the vocabulary mapping
print("Our vocabulary: ", count_vect.vocabulary)

# See the BOW representation for first 2 documents
print("BoW representation for 'It is a long stablished fact that is': ", bow_rep[0].toarray())
print("BoW representation for 'Loren Ipsum is simply': ", bow_rep[1].toarray())

# Get the representation using this vocabulary, for a new text
temp = count_vect.transform(["long stablished Loren is is long"])
print("long stablished Loren is", temp.toarray())

Our vocabulary:  None
BoW representation for 'It is a long stablished fact that is':  [[1 0 2 1 1 0 0 1 1]]
BoW representation for 'Loren Ipsum is simply':  [[0 1 1 0 0 1 1 0 0]]
long stablished Loren is [[0 0 2 0 2 0 0 1 0]]


### Bag of N-grams

In [3]:
# quick pre-process step
print(processed_docs)
processed_docs2 = [doc.lower().replace(".", "") for doc in processed_docs]

print(processed_docs2)

['It is a long stablished fact that is', 'Lorem Ipsum is simply']
['it is a long stablished fact that is', 'lorem ipsum is simply']


In [4]:
# Instance with uni, bi, and trigrams
count_vect = CountVectorizer(ngram_range=(1, 3))

# build a bow representation of the corpus
bow_rep = count_vect.fit_transform(processed_docs2)

# Look at the vocabulary mapping
print("Our vocabulary", count_vect.vocabulary_)

# get the representation using this vocabulary, for a new text
temp = count_vect.transform(["it is not so long ago that ipsum gone"])
print("Bow representation for 'it is not so long ago that ipsum gone'", temp.toarray())
# quick pre-process step
print(processed_docs)
processed_docs2 = [doc.lower().replace(".", "") for doc in processed_docs]

print(processed_docs2)

Our vocabulary {'it': 10, 'is': 6, 'long': 13, 'stablished': 20, 'fact': 0, 'that': 23, 'it is': 11, 'is long': 7, 'long stablished': 14, 'stablished fact': 21, 'fact that': 1, 'that is': 24, 'it is long': 12, 'is long stablished': 8, 'long stablished fact': 15, 'stablished fact that': 22, 'fact that is': 2, 'lorem': 16, 'ipsum': 3, 'simply': 19, 'lorem ipsum': 17, 'ipsum is': 4, 'is simply': 9, 'lorem ipsum is': 18, 'ipsum is simply': 5}
Bow representation for 'it is not so long ago that ipsum gone' [[0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0]]
['It is a long stablished fact that is', 'Lorem Ipsum is simply']
['it is a long stablished fact that is', 'lorem ipsum is simply']


### TF-IDF

In [5]:
processed_documents = ["boys are humans", "girls are humans", "boy and girls are people", "boys love dogs", "dogs bark people"]

from sklearn.feature_extraction.text import TfidfVectorizer

# Convert a collection of raw documents into a matrix of TF-IDF features
tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_documents)

# IDF for all words in the vocabulary
print("IDF for all words in the vocabulary\n", tfidf.idf_)

# All words int the vocabulary
print("All words int the vocabulary\n", tfidf.get_feature_names_out())

temp = tfidf.transform(["boys and girls love dogs"])
print("Tfidf representation for 'boys and girls love dogs'", temp.toarray())

IDF for all words in the vocabulary
 [2.09861229 1.40546511 2.09861229 2.09861229 1.69314718 1.69314718
 1.69314718 1.69314718 2.09861229 1.69314718]
All words int the vocabulary
 ['and' 'are' 'bark' 'boy' 'boys' 'dogs' 'girls' 'humans' 'love' 'people']
Tfidf representation for 'boys and girls love dogs' [[0.50297966 0.         0.         0.         0.40580082 0.40580082
  0.40580082 0.         0.50297966 0.        ]]
