In [1]:
!pip install nltk



In [2]:
!pip install gensim



In [3]:
!pip install scikit-learn



In [4]:
corpus = ['the sky is blue', 'sky is blue and sky is beautiful', 'the beautiful sky is so blue', 'i love blue cheese']
new_doc = ['loving this blue sky today']

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# BAG OF VECTOR MODEL - BOW
def bow_extractor(corpus, ngram_range=(1,1)):
  vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
  features = vectorizer.fit_transform(corpus)
  return vectorizer,features

In [7]:
# build bow vectorizer and get features
# ROWS REPRESENT DOCUMENTS COLUMNS REPRESENT WORDS
bow_vectorizer,bow_features = bow_extractor(corpus)
features = bow_features.todense()
print(features)

[[0 0 1 0 1 0 1 0 1]
 [1 1 1 0 2 0 2 0 0]
 [0 1 1 0 1 0 1 1 1]
 [0 0 1 1 0 1 0 0 0]]


In [8]:
# extract features from new document using built vectorizer
new_doc_features = bow_vectorizer.transform(new_doc)
new_doc_features = new_doc_features.todense()
print(new_doc_features)


[[0 0 1 0 0 0 1 0 0]]


In [9]:
# print the feature names
feature_names = bow_vectorizer.get_feature_names_out()
print(feature_names)


['and' 'beautiful' 'blue' 'cheese' 'is' 'love' 'sky' 'so' 'the']


In [10]:
# Understanding the feature vectorization with detailed representation
import pandas as pd
def display_features(features, feature_names):
  df = pd.DataFrame(data=features,columns=feature_names)
  print(df)
display_features(features, feature_names)
display_features(new_doc_features, feature_names)


   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   1     0    1   0    1
1    1          1     1       0   2     0    2   0    0
2    0          1     1       0   1     0    1   1    1
3    0          0     1       1   0     1    0   0    0
   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   0     0    1   0    0


In [11]:
# TF - DF MODEL
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',smooth_idf=True,use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
feature_names = bow_vectorizer.get_feature_names_out()

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
  transformer = TfidfTransformer(norm='l2',smooth_idf=True,use_idf=True)
  tfidf_matrix = transformer.fit_transform(bow_matrix)
  return transformer, tfidf_matrix

In [14]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
feature_names = bow_vectorizer.get_feature_names_out()

In [15]:
# build tfidf transformer and show train corpus tfidf features
tfidf_trans, tdidf_features = tfidf_transformer(bow_features)
features = np.round(tdidf_features.todense(), 2)
display_features(features, feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [16]:
# show tfidf features for new_doc using built tfidf transformer
new_doc_features = np.asarray(new_doc_features)
nd_tfidf = tfidf_trans.transform(new_doc_features)
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


In [17]:
# Understanding how it works in the background
import scipy.sparse as sp
from numpy.linalg import norm
feature_names = bow_vectorizer.get_feature_names_out()
# compute term frequency
tf = bow_features.todense()
tf = np.array(tf, dtype='float64')
# show term frequencies
display_features(tf, feature_names)

   and  beautiful  blue  cheese   is  love  sky   so  the
0  0.0        0.0   1.0     0.0  1.0   0.0  1.0  0.0  1.0
1  1.0        1.0   1.0     0.0  2.0   0.0  2.0  0.0  0.0
2  0.0        1.0   1.0     0.0  1.0   0.0  1.0  1.0  1.0
3  0.0        0.0   1.0     1.0  0.0   1.0  0.0  0.0  0.0


In [18]:
# show tfidf features for new_doc using built tfidf transformer
nd_tfidf = tfidf_trans.transform(new_doc_features)
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


In [19]:
# build the document frequency matrix
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
df = 1 + df # to smoothen idf later
# show document frequencies
display_features([df], feature_names)

   and  beautiful  blue  cheese  is  love  sky  so  the
0    2          3     5       2   4     2    4   2    3


In [20]:
# compute inverse document frequencies
total_docs = 1 + len(corpus)
idf = 1.0 + np.log(float(total_docs) / df)
# show inverse document frequencies
display_features([np.round(idf, 2)], feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  1.92       1.51   1.0    1.92  1.22  1.92  1.22  1.92  1.51


In [21]:
# compute idf diagonal matrix
total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
idf = idf_diag.todense()
# print the idf diagonal matrix
print(np.round(idf, 2))


[[1.92 0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   1.51 0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   1.92 0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   1.22 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   1.92 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   1.22 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   1.92 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   1.51]]


In [22]:
# Compute the tf idf feature matrix using matrix multiplication.
tfidf = tf * idf
# show tfidf feature matrix
display_features(np.round(tfidf, 2), feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00   1.0    0.00  1.22  0.00  1.22  0.00  1.51
1  1.92       1.51   1.0    0.00  2.45  0.00  2.45  0.00  0.00
2  0.00       1.51   1.0    0.00  1.22  0.00  1.22  1.92  1.51
3  0.00       0.00   1.0    1.92  0.00  1.92  0.00  0.00  0.00


#Computes the tfidf norms for each document and then divides the tfidf weights with the norm as per the formula to give us the final desired tfidf matrix

In [23]:
# compute L2 norms
norms = norm(tfidf, axis=1)
# print norms for each document
print(np.round(norms, 2))

[2.5  4.35 3.5  2.89]


In [24]:
# compute normalized tfidf
norm_tfidf = tfidf / norms[:, None]
# show final tfidf feature matrix
display_features(np.round(norm_tfidf, 2), feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


# Compare tfidf feature matrix for the documents in CORPUS to the feature matrix obtained using TfidfTransformer earlier. They are same, which means the mathematical formula appied to compute is correct.

In [25]:
# compute the tfidfbased feature matrix for new document new_doc.
# compute new doc term freqs from bow freqs
nd_tf = new_doc_features
nd_tf = np.array(nd_tf, dtype='float64')

In [26]:
# compute tfidf using idf matrix from train corpus
nd_tfidf = nd_tf*idf
nd_norms = norm(nd_tfidf, axis=1)
norm_nd_tfidf = nd_tfidf / nd_norms[:, None]

In [27]:
# show new_doc tfidf feature vector
display_features(np.round(norm_nd_tfidf, 2), feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


# Observe again this matrix is same as earlier.
# We now implement a generic function that can directly compute the tfidf-based feature vectors for documents from the raw documents themselves.
# This generic function makes use of the TfidfVectorizer, which directly computes the tfidf vectors by taking the raw documents themselves as input and internally computing the term frequencies as well as the inverse document frequencies, eliminating the need to use the CountVectorizer for computing the term frequencies based on the Bag of Words model.

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_extractor(corpus, ngram_range=(1,1)):
  vectorizer = TfidfVectorizer(min_df=1,norm='l2',smooth_idf=True,use_idf=True,ngram_range=ngram_range)
  features = vectorizer.fit_transform(corpus)
  return vectorizer, features

In [29]:
# build tfidf vectorizer and get training corpus feature vectors
tfidf_vectorizer, tdidf_features = tfidf_extractor(corpus)
display_features(np.round(tdidf_features.todense(), 2),feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


# Observe that it is again same as above for corpus as well as new_doc

In [30]:
# get tfidf feature vector for the new document
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), 2), feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


# Modern Word Vector Models - Advanced Word Vectorization Models :

In [31]:
# Word 2 Vector Model
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [32]:
import gensim

In [33]:
# Recall
corpus = ['the sky is blue', 'sky is blue and sky is beautiful', 'the beautiful sky is so blue', 'i love blue cheese']
new_doc = ['loving this blue sky today']

In [34]:
corpus

['the sky is blue',
 'sky is blue and sky is beautiful',
 'the beautiful sky is so blue',
 'i love blue cheese']

In [35]:
new_doc

['loving this blue sky today']

In [36]:
# tokenize corpora
TOKENIZED_CORPUS = [nltk.word_tokenize(sentence) for sentence in corpus]
print(TOKENIZED_CORPUS)
tokenized_new_doc = [nltk.word_tokenize(sentence) for sentence in new_doc]
print(tokenized_new_doc)

[['the', 'sky', 'is', 'blue'], ['sky', 'is', 'blue', 'and', 'sky', 'is', 'beautiful'], ['the', 'beautiful', 'sky', 'is', 'so', 'blue'], ['i', 'love', 'blue', 'cheese']]
[['loving', 'this', 'blue', 'sky', 'today']]


In [37]:
# build the word2vec model on our training corpus
model = gensim.models.Word2Vec(TOKENIZED_CORPUS,vector_size=10,window=10,min_count=2,sample=1e-3)

In [38]:
# model creates a vector representation for each word in the vocabulary
# Check whether model of Word2Vector type is been created
print(type(model))

<class 'gensim.models.word2vec.Word2Vec'>


In [39]:
# The Word2Vec model stores word vectors in a separate attribute called wv.
# It is a dictionary
print(model.wv.key_to_index)

{'blue': 0, 'is': 1, 'sky': 2, 'beautiful': 3, 'the': 4}


In [40]:
word = "sky"
vector = model.wv[word]
print(vector)

[ 0.07311766  0.05070262  0.06757693  0.00762866  0.06350891 -0.03405366
 -0.00946401  0.05768573 -0.07521638 -0.03936104]


In [41]:
word = "blue"
vector = model.wv[word]
print(vector)

[-0.00536227  0.00236431  0.0510335   0.09009273 -0.0930295  -0.07116809
  0.06458873  0.08972988 -0.05015428 -0.03763372]


# Add all the word vectors and divide the result by the total number of words matched in the vocabulary to get a final resulting averaged word vector representation for the text document.

In [42]:
import numpy as np
# define function to average word vectors for a text document
def average_word_vectors(words, model, vocabulary, num_features):
  feature_vector = np.zeros((num_features,),dtype="float64")
  nwords = 0
  for word in words:
    if word in vocabulary:
      nwords = nwords + 1
      feature_vector = np.add(feature_vector, model.wv[word])
    if nwords:
      feature_vector = np.divide(feature_vector,nwords)
  return feature_vector

In [43]:
# generalize above function for a corpus of documents
def averaged_word_vectorizer(corpus, model, num_features):
  vocabulary = set(model.wv.index_to_key)
  features = [average_word_vectors(tokenized_sentence, model, vocabulary,num_features)for tokenized_sentence in corpus]
  return np.array(features)

In [44]:
import numpy as np
from gensim.models import Word2Vec

In [45]:
# Get averaged word vectors for our training CORPUS
avg_word_vec_features = averaged_word_vectorizer(corpus=TOKENIZED_CORPUS, model=model, num_features=10)
print(np.round(avg_word_vec_features, 3))

[[ 0.004  0.004  0.008  0.026 -0.025 -0.021  0.015  0.03  -0.021 -0.015]
 [-0.009 -0.002  0.015 -0.01  -0.005 -0.004  0.014 -0.009 -0.003 -0.011]
 [-0.     0.001  0.01   0.019 -0.019 -0.015  0.013  0.018 -0.011 -0.009]
 [-0.005  0.002  0.051  0.09  -0.093 -0.071  0.065  0.09  -0.05  -0.038]]


In [46]:
# Get averaged word vectors for our training CORPUS
avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_new_doc, model=model, num_features=10)
print(np.round(avg_word_vec_features, 3))

[[ 0.017  0.013  0.03   0.024 -0.007 -0.026  0.014  0.037 -0.031 -0.019]]


# From the above outputs, we can see that we have uniformly sized averaged word vectors for each document in the corpus, and these feature vectors will be used later for classification by feeding it to the ML algorithms