# Basic Vectorization Approaches

## Text Representation Schemes

### One-Hot Encoding

In [None]:
# Build a word -> index vocabulary over the whole corpus.
# Indices start at 1 and are handed out in order of first appearance.
count = 0
vocab = {}
processed_docs = [
    "It is a long stablished fact that is",
    "Lorem Ipsum is simply"
    ]

for doc in processed_docs:
  for word in doc.split():
    if word not in vocab:
      count += 1
      # Assign only when the word is first seen: re-assigning on every
      # occurrence would overwrite a repeated word's index with the current
      # counter and make it collide with another word's index.
      vocab[word] = count

print(vocab)
print(len(vocab))

def get_onehot_vector(somestring, vocabulary=None):
  """Return one one-hot vector per whitespace-separated word of `somestring`.

  Args:
    somestring: text to encode; it is split on whitespace.
    vocabulary: optional mapping word -> 1-based index. When omitted, the
      notebook-level `vocab` is used.

  Returns:
    A list with one entry per word. Each entry is a list of ints of length
    len(vocabulary) holding a single 1 at position index-1; a word that is
    not in the vocabulary yields an all-zero list.
  """
  if vocabulary is None:
    vocabulary = vocab  # fall back to the notebook-level mapping
  size = len(vocabulary)  # same for every word, so compute it once
  onehot_encoded = []
  for word in somestring.split():
    temp = [0] * size
    if word in vocabulary:
      # -1 because vocabulary indices start at 1 while list positions start at 0
      temp[vocabulary[word] - 1] = 1
    onehot_encoded.append(temp)
  return onehot_encoded

# One-hot encode each word of the second document; as the last expression
# of the cell, the resulting list of vectors is displayed as the cell output.
get_onehot_vector(processed_docs[1])

{'It': 1, 'is': 9, 'a': 3, 'long': 4, 'stablished': 5, 'fact': 6, 'that': 7, 'Lorem': 8, 'Ipsum': 9, 'simply': 10}
10


[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]

### Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

# Build a BOW representation for the corpus
bow_rep = count_vect.fit_transform(processed_docs)

# Look at the vocabulary mapping.
# The fitted term -> column mapping is stored in `vocabulary_` (trailing
# underscore); `vocabulary` is only the constructor argument and is None here.
print("Our vocabulary: ", count_vect.vocabulary_)

# See the BOW representation for first 2 documents
print("BoW representation for 'It is a long stablished fact that is': ", bow_rep[0].toarray())
print("BoW representation for 'Lorem Ipsum is simply': ", bow_rep[1].toarray())

# Get the representation using this vocabulary, for a new text.
# NOTE(review): 'Loren' does not match the corpus word 'Lorem', so it is not
# counted in any column — confirm this out-of-vocabulary word is intentional.
temp = count_vect.transform(["long stablished Loren is is long"])
print("long stablished Loren is is long", temp.toarray())

Our vocabulary:  None
BoW representation for 'It is a long stablished fact that is':  [[1 0 2 1 1 0 0 1 1]]
BoW representation for 'Loren Ipsum is simply':  [[0 1 1 0 0 1 1 0 0]]
long stablished Loren is [[0 0 2 0 2 0 0 1 0]]


### Bag of N-grams

In [None]:
# Quick pre-processing pass: lowercase every document and drop full stops,
# showing the corpus before and after.
print(processed_docs)
processed_docs2 = list(
    text.lower().replace(".", "") for text in processed_docs
)

print(processed_docs2)

['It is a long stablished fact that is', 'Lorem Ipsum is simply']
['it is a long stablished fact that is', 'lorem ipsum is simply']


In [None]:
# Instance with uni, bi, and trigrams
count_vect = CountVectorizer(ngram_range=(1, 3))

# build a bow representation of the corpus
# (processed_docs2 is the lowercased corpus built in the pre-process cell)
bow_rep = count_vect.fit_transform(processed_docs2)

# Look at the vocabulary mapping
print("Our vocabulary", count_vect.vocabulary_)

# get the representation using this vocabulary, for a new text
temp = count_vect.transform(["it is not so long ago that ipsum gone"])
print("Bow representation for 'it is not so long ago that ipsum gone'", temp.toarray())

Our vocabulary {'it': 10, 'is': 6, 'long': 13, 'stablished': 20, 'fact': 0, 'that': 23, 'it is': 11, 'is long': 7, 'long stablished': 14, 'stablished fact': 21, 'fact that': 1, 'that is': 24, 'it is long': 12, 'is long stablished': 8, 'long stablished fact': 15, 'stablished fact that': 22, 'fact that is': 2, 'lorem': 16, 'ipsum': 3, 'simply': 19, 'lorem ipsum': 17, 'ipsum is': 4, 'is simply': 9, 'lorem ipsum is': 18, 'ipsum is simply': 5}
Bow representation for 'it is not so long ago that ipsum gone' [[0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0]]
['It is a long stablished fact that is', 'Lorem Ipsum is simply']
['it is a long stablished fact that is', 'lorem ipsum is simply']


### TF-IDF

In [None]:
# Toy corpus used to illustrate TF-IDF weighting.
processed_documents = [
    "boys are humans",
    "girls are humans",
    "boy and girls are people",
    "boys love dogs",
    "dogs bark people",
]

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the corpus and keep the resulting TF-IDF matrix
tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_documents)

# IDF weight of every term in the fitted vocabulary
print("IDF for all words in the vocabulary\n", tfidf.idf_)

# All terms in the fitted vocabulary
print("All words int the vocabulary\n", tfidf.get_feature_names_out())

# Represent a new sentence using the fitted vocabulary and weights
temp = tfidf.transform(["boys and girls love dogs"])
print("Tfidf representation for 'boys and girls love dogs'", temp.toarray())

IDF for all words in the vocabulary
 [2.09861229 1.40546511 2.09861229 2.09861229 1.69314718 1.69314718
 1.69314718 1.69314718 2.09861229 1.69314718]
All words int the vocabulary
 ['and' 'are' 'bark' 'boy' 'boys' 'dogs' 'girls' 'humans' 'love' 'people']
Tfidf representation for 'boys and girls love dogs' [[0.50297966 0.         0.         0.         0.40580082 0.40580082
  0.40580082 0.         0.50297966 0.        ]]


# Distributed Representations

## Word Embeddings

In [None]:
from google.colab import files
# Upload a file from the local machine into the Colab session; the recorded
# output shows a file named kaggle.json being saved.
# NOTE(review): kaggle.json presumably holds the Kaggle API credentials —
# confirm, and never commit it or leave it in saved outputs.
files.upload()

# Move the uploaded kaggle.json into ~/.kaggle and restrict it to
# owner read/write only (mode 600).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
import kaggle

# Kaggle dataset identifier and the local directory the files are downloaded to.
dataset_name = 'leadbest/googlenewsvectorsnegative300'
destination_path = 'archive'

# Download the dataset into destination_path and unzip it (unzip=True).
# NOTE(review): this runs on every execution of the cell; the archive is
# presumably large — consider skipping it when the files already exist.
kaggle.api.dataset_download_files(dataset_name, path=destination_path, unzip=True)

In [None]:
# https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#7-methods-like-most_similar-wmdistance-doesnt_match-similarity--others-moved-to-keyedvectors
from gensim.models import Word2Vec, KeyedVectors

# Path of the pre-trained vectors file.
# NOTE(review): absolute path; assumes the download cell's 'archive' folder
# resolves to /content/archive (i.e. the working directory is /content) — confirm.
pretrainedPath = "/content/archive/GoogleNews-vectors-negative300.bin"
# Load the vectors as KeyedVectors; binary=True selects the binary word2vec format.
w2v_model = KeyedVectors.load_word2vec_format(pretrainedPath, binary=True)

print("done loading Word2Vec")
print(len(w2v_model.key_to_index)) # Number of words in the vocabulary.

done loading Word2Vec
3000000


In [None]:
# Words most similar to 'effort' in the loaded model; the recorded output is a
# list of (word, similarity score) pairs in descending score order.
w2v_model.most_similar('effort')

[('efforts', 0.7569211721420288),
 ('attempt', 0.6435876488685608),
 ('ef_fort', 0.5995086431503296),
 ('concerted_effort', 0.5943657755851746),
 ('endeavor', 0.5759328007698059),
 ('eff_ort', 0.5580997467041016),
 ('initiative', 0.5448550581932068),
 ('eff_orts', 0.5061603784561157),
 ('Effort', 0.5019460320472717),
 ('attempts', 0.4948119819164276)]

In [None]:
# vector representation for a word
# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly; as a bare expression its value was silently discarded.
print(w2v_model['effort'].shape)
w2v_model['effort']

array([ 1.56250000e-01,  2.02148438e-01, -8.30078125e-02,  7.32421875e-02,
       -1.70898438e-02,  1.27563477e-02,  1.92382812e-01, -2.04101562e-01,
       -1.63574219e-02,  6.05468750e-02, -1.03027344e-01, -8.74023438e-02,
        1.98974609e-02, -1.78710938e-01, -1.01562500e-01, -1.15722656e-01,
        9.61914062e-02,  9.61914062e-02,  1.71875000e-01, -3.39355469e-02,
        6.13403320e-03,  1.45507812e-01,  4.15039062e-02, -9.47265625e-02,
       -1.13769531e-01, -7.17773438e-02, -1.01562500e-01, -5.98144531e-02,
       -8.10546875e-02,  2.51464844e-02,  1.03515625e-01, -1.77734375e-01,
        2.05078125e-01, -1.04492188e-01,  2.36328125e-01,  2.23632812e-01,
        2.65625000e-01, -6.40869141e-03,  2.18750000e-01,  2.19726562e-01,
        2.14843750e-01,  5.88378906e-02, -9.32617188e-02, -6.34765625e-02,
       -1.50390625e-01, -9.13085938e-02, -2.12890625e-01, -4.83398438e-02,
       -1.25976562e-01,  2.38037109e-02,  1.91406250e-01,  1.74804688e-01,
        9.66796875e-02,  