## Text Vectorization

Question: What is text vectorization?

Answer: **The process of transforming text data into numerical vectors**

In [2]:
from collections import Counter
import spacy

## toy corpus: four short sentences, each terminated by a period
text = "dog bites man. man bites dog. dog eats meat. man eats food."

## load the `en_core_web_sm` spaCy pipeline and run it over the corpus
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

## Bag of Words (BoW) or Term-Document Matrix (TDM)

- Bag-of-Words (BoW) or TDM is a matrix whose **rows are sentences** (documents) and whose **columns are the unique words** seen across all of the sentences

In [3]:
## tokens whose text appears in this list are left out of the counts
stop_word = ['.']


def count_words(sentence):
    """Tally how often each token text occurs in ``sentence``.

    ``sentence`` is any iterable of objects exposing a ``.text`` attribute
    (in this notebook: the tokens of a spaCy sentence). Tokens whose text is
    listed in the module-level ``stop_word`` list are skipped.

    Returns a ``collections.Counter`` mapping token text -> occurrence count.
    """
    kept_texts = (token.text for token in sentence if token.text not in stop_word)
    return Counter(kept_texts)

In [82]:
## per-sentence word counts, keyed by the sentence's position in the text
all_doc_frequencies = {
    sent_idx: count_words(sent) for sent_idx, sent in enumerate(doc.sents)
}

## the 'master' vocabulary: every distinct word seen in at least one sentence
all_words = set()
for sent_counts in all_doc_frequencies.values():
    all_words.update(sent_counts)

In [97]:
all_doc_frequencies

{0: Counter({'dog': 1, 'bites': 1, 'man': 1}),
 1: Counter({'man': 1, 'bites': 1, 'dog': 1}),
 2: Counter({'dog': 1, 'eats': 1, 'meat': 1}),
 3: Counter({'man': 1, 'eats': 1, 'food': 1})}

In [96]:
import numpy as np

## freeze the vocabulary into an alphabetical list so each word owns one column
all_words = sorted(all_words)
## one row per sentence, one column per vocabulary word, initially all zeros
TDM = np.zeros((len(all_doc_frequencies), len(all_words)))
## a Counter yields 0 for words it has not seen, so every cell of a row can be
## filled straight from that sentence's counts
for doc_id, counts in all_doc_frequencies.items():
    TDM[doc_id, :] = [counts[word] for word in all_words]

In [84]:
all_words

['bites', 'dog', 'eats', 'food', 'man', 'meat']

In [55]:
TDM

array([[1., 1., 0., 0., 1., 0.],
       [1., 1., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 1.],
       [0., 0., 1., 1., 1., 0.]])

In [94]:
all_words.index('food')

3

In [93]:
print(TDM[:, all_words.index('food')])

[0. 0. 0. 1.]


# TF in Sklearn

In [98]:
from sklearn.feature_extraction.text import CountVectorizer

## the same four sentences as above, one string per document
documents = [
    'Dog bites man.',
    ' Man bites dog.',
    'Dog eats meat.',
    'Man eats food.',
]

## learn the vocabulary and encode every document as a row of raw term counts
count_vect = CountVectorizer()
tf_matrix = count_vect.fit_transform(documents)

In [99]:
tf_matrix.toarray()

array([[1, 1, 0, 0, 1, 0],
       [1, 1, 0, 0, 1, 0],
       [0, 1, 1, 0, 0, 1],
       [0, 0, 1, 1, 1, 0]])

In [100]:
## get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
## returns an ndarray, so .tolist() keeps the printed output a plain list of str
print(count_vect.get_feature_names_out().tolist())

['bites', 'dog', 'eats', 'food', 'man', 'meat']


## What is TF-IDF Vectorizer?

- Word counts are a good starting point, but are very basic

An alternative is to calculate word frequencies, and by far the most popular method is called TF-IDF. 

**Term Frequency (TF)**: This summarizes how often a given word appears within a document

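**Inverse Document Frequency (IDF)**: This downscales words that appear a lot across documents. The code below uses the smoothed form $\text{idf}(t) = \ln\frac{1 + n}{1 + \text{df}(t)} + 1$, where $n$ is the number of documents and $\text{df}(t)$ is the number of documents that contain the term $t$.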

<img src="TFIDF.png" width="600" height="600">

In [85]:
num_docs = TDM.shape[0]

## TFIDF starts as an all-zero matrix shaped like TDM and is filled column by column
TFIDF = np.zeros(TDM.shape)
for col, word in enumerate(all_words):
    ## document frequency: number of documents in which this word occurs
    doc_freq = np.count_nonzero(TDM[:, col])
    print((word, doc_freq))
    ## smoothed inverse document frequency of this word
    IDF = -np.log((doc_freq + 1) / (num_docs + 1)) + 1
    ## scale this word's column of raw counts by its IDF
    TFIDF[:, col] = IDF * TDM[:, col]

('bites', 2)
('dog', 3)
('eats', 2)
('food', 1)
('man', 3)
('meat', 1)


In [86]:
num_docs

4

In [88]:
TFIDF

array([[1.51082562, 1.22314355, 0.        , 0.        , 1.22314355,
        0.        ],
       [1.51082562, 1.22314355, 0.        , 0.        , 1.22314355,
        0.        ],
       [0.        , 1.22314355, 1.51082562, 0.        , 0.        ,
        1.91629073],
       [0.        , 0.        , 1.51082562, 1.91629073, 1.22314355,
        0.        ]])

In [89]:
# norm of each row in TFIDF
np.apply_along_axis(np.linalg.norm, 1, TFIDF)

array([2.29668334, 2.29668334, 2.72962349, 2.72962349])

In [90]:
## L2-normalise every row of TFIDF in place so each document vector has unit length.
## The row norms are computed once, up front, instead of recomputing all of them
## on every loop iteration.
row_norms = np.linalg.norm(TFIDF, axis=1, keepdims=True)
## an all-zero row (a document with no vocabulary words) has norm 0; divide it by 1
## so it stays all-zero instead of turning into NaN
row_norms[row_norms == 0] = 1.0
TFIDF /= row_norms
TFIDF

array([[0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.        , 0.44809973, 0.55349232, 0.        , 0.        ,
        0.70203482],
       [0.        , 0.        , 0.55349232, 0.70203482, 0.44809973,
        0.        ]])

## TFIDF in Sklearn

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

## the same four one-sentence documents used for the count vectorizer
documents = [
    'Dog bites man.',
    ' Man bites dog.',
    'Dog eats meat.',
    'Man eats food.',
]

## learn vocabulary and IDF weights, then encode each document as a TF-IDF row
tfidf_vect = TfidfVectorizer()
tfidf_matrix = tfidf_vect.fit_transform(documents)

In [60]:
tfidf_matrix.toarray()

array([[0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.        , 0.44809973, 0.55349232, 0.        , 0.        ,
        0.70203482],
       [0.        , 0.        , 0.55349232, 0.70203482, 0.44809973,
        0.        ]])

In [92]:
## get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
## returns an ndarray, so .tolist() keeps the printed output a plain list of str
print(tfidf_vect.get_feature_names_out().tolist())

['bites', 'dog', 'eats', 'food', 'man', 'meat']


## Activity: Obtain the keywords from TF-IDF

1- First obtain the TF-IDF matrix for given corpus

2- Sum each column of the matrix (column-wise addition) to get one score per word

3- Sort the scores from highest to lowest

4- Return the associated words based on step 3

Hint: You can sort the items of a dictionary by value and return the associated keys, e.g. for D = {'bright': 0.7, 'blue':0.86, 'sun' : 0.75}

In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

def keyword_sklearn(docs, k):
    """Return the ``k`` highest-scoring keywords of ``docs`` ranked by TF-IDF.

    A word's score is the sum of its TF-IDF weights over all documents, i.e.
    the column sum of the TF-IDF matrix. The vectorizer is configured with
    ``stop_words='english'``.

    Parameters
    ----------
    docs : iterable of str
        The corpus, one string per document.
    k : int
        Number of keywords to return.

    Returns
    -------
    list of (str, float)
        ``(word, score)`` pairs sorted from highest to lowest score,
        truncated to the first ``k`` entries.

    The TF-IDF matrix and the vocabulary are also printed along the way.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(docs)
    ## get_feature_names() was removed in scikit-learn 1.2; the *_out variant
    ## returns an ndarray, so convert it to a plain list of str
    feature_names = vectorizer.get_feature_names_out().tolist()
    print(tfidf_matrix.toarray())
    print(feature_names)
    ## column-wise sum -> one score per word; flatten the result to 1-D and
    ## convert to plain Python floats so the returned pairs print cleanly
    tfidf_scores = np.ravel(np.sum(tfidf_matrix, axis=0)).tolist()
    ## feature names are unique, so the (word, score) pairs can be ranked directly
    ranked = sorted(zip(feature_names, tfidf_scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:k]

documents = [
    'The sky is blue',
    'The sun is bright',
    'The sun in the sky is bright',
    'we can see the shining sun, the bright sun',
]

## the three words with the largest summed TF-IDF score
top_keywords = keyword_sklearn(documents, 3)
print(top_keywords)

[[0.78528828 0.         0.         0.6191303  0.        ]
 [0.         0.70710678 0.         0.         0.70710678]
 [0.         0.53256952 0.         0.65782931 0.53256952]
 [0.         0.36626037 0.57381765 0.         0.73252075]]
['blue', 'bright', 'shining', 'sky', 'sun']
[('sun', 1.9721970507561841), ('bright', 1.605936677684143), ('sky', 1.27695960978985)]


# References:

- https://medium.com/analytics-vidhya/demonstrating-calculation-of-tf-idf-from-sklearn-4f9526e7e78b
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html