In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import re

In [3]:
# Fetch the English stopword list once, then show its size and its contents.
english_stopwords = stopwords.words("english")
print(len(english_stopwords))
print(english_stopwords)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

****

## Calculating a sentence embedding from the word embeddings of its words

***In this function, we take the individual word vectors of all tokens
in a given sentence, sum them, and normalize the result to unit length.
This provides us with a single sentence vector.***

In [8]:
from nltk import word_tokenize
import numpy as np

In [4]:

def sentence_vec(text, embedding_dict, stopwords, tokenizer=word_tokenize,
                 embedding_dim=None, verbose=False):
    """
    Build a unit-length sentence embedding by summing the embeddings of its words.

    The text is lower-cased, tokenized, stripped of stopwords and of tokens that
    are not purely alphabetic. The vectors of the remaining tokens that are keys
    of ``embedding_dict`` are summed and the sum is scaled to unit L2 norm.

    : param text : any input sentence (converted with ``str()`` first)
    : param embedding_dict : dict {word:vector}; the text is lower-cased before
                             lookup, so only lower-case keys can match
    : param stopwords : collection of stopwords to discard
    : param tokenizer : a tokenization func mapping a string to a list of tokens
    : param embedding_dim : length of the zero vector returned when no token has
                            an embedding; when None it is inferred from the first
                            vector in ``embedding_dict`` (300 if that is impossible)
    : param verbose : if True, print the matrix of collected word vectors
    : return : 1-D numpy array; all zeros when no token has an embedding or when
               the summed vector has zero norm
    """
    # Lower-case first, then tokenize.
    tokens = tokenizer(str(text).lower())

    # Set membership is O(1); a plain list would be scanned once per token.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    # Drop stopwords, keep purely alphabetic tokens (str.isalpha rejects any
    # token containing digits or punctuation), and look up those with a vector.
    vectors = [
        embedding_dict[token]
        for token in tokens
        if token not in stopwords and token.isalpha() and token in embedding_dict
    ]

    if not vectors:
        if embedding_dim is None:
            # Match the dimensionality of the supplied embeddings so that the
            # fallback has the same shape as a regular result.
            try:
                embedding_dim = len(next(iter(embedding_dict.values())))
            except (AttributeError, StopIteration, TypeError):
                embedding_dim = 300
        return np.zeros(embedding_dim)

    M = np.array(vectors)

    if verbose:
        print("Array storing the embeddings", M)

    # Sum over tokens: one value per embedding dimension.
    v = M.sum(axis=0)

    # Scale to unit L2 norm; a zero-norm sum (vectors cancelling out) would
    # otherwise divide by zero and produce NaNs.
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        return np.zeros(v.shape)
    return v / norm

### Understanding the above function

In [52]:
# Toy embedding table: every word maps to a 3-dimensional vector
# (columns X1, X2, X3; a full-size table would continue up to X300).
embedding_dict = {
    "Hello": [0.98, 0.82, 0.76],
    "sam": [0.77, 0.63, 0.98],
    "junior": [-0.34, -.21, -0.11],
}

word = " Hello sam junior"

# Whitespace tokenization; split() without arguments ignores the leading space.
words = word.split()

# Collect the vector of every token that has an entry in the table.
M = []
for word in words:
    if word not in embedding_dict:
        continue
    M.append(embedding_dict[word])

M = np.array(M)
print("the array--\n", M)

# Column-wise sum: X1 over all words, X2 over all words, ...
v = np.sum(M, axis=0)
print("array sum along the axis i.e. along same dimensions --", v)

# Unit-normalize: divide by sqrt(sum of squared components).
v = v / np.sqrt(np.square(v).sum())
print("Normalized v--", v)

the array--
 [[ 0.98  0.82  0.76]
 [ 0.77  0.63  0.98]
 [-0.34 -0.21 -0.11]]
array sum along the axis i.e. along same dimensions -- [1.41 1.24 1.63]
Normalized v-- [0.56706591 0.49869626 0.65554428]


In [53]:
# Rebuild the un-normalized column-wise sum, then add up its components
# (a plain sum of the entries, not a norm).
v = np.sum(M, axis=0)
v.sum()

4.279999999999999

In [54]:
# L2 (Euclidean) norm of v: square each component, sum, then take the square root.
np.sqrt((v**2).sum())

2.4864834606327064

In [55]:
# Square root of the plain component sum (4.28) for comparison: it differs from
# the L2 norm, which sums the *squared* components before taking the root.
np.sqrt(4.279999999999999)

2.06881608655772

In [59]:
# L2 norm of v again: square root of the sum of squared components.
np.sqrt((v**2).sum())

2.4864834606327064