## Natural Language Processing Example

### Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
try:
    from gensim.models import word2vec
except:
    !pip install gensim
    from gensim.models import word2vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OLEKSANDRRomanko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import data

We have 5 examples of documents (tweets).

In [2]:
# Five short example documents used throughout the notebook.
corpus = [
    "All bears are lovely",
    "Our tea was bad",
    "That bear drinks with bear",
    "The bear drinks tea",
    "We love bears",
]

In [12]:
# Show every raw document together with its 1-based position in the corpus.
for position, document in enumerate(corpus, start=1):
    print('Example #{0:d}: "{1:s}"'.format(position, document))

Example #1: "All bears are lovely"
Example #2: "Our tea was bad"
Example #3: "That bear drinks with bear"
Example #4: "The bear drinks tea"
Example #5: "We love bears"


### Clean Data

Convert to lower case, remove stop words, stem words, etc.

In [13]:
# The tokenizer, stop-word collection and stemmer do not depend on the document
# being processed, so build them once. Previously stopwords.words('english')
# was re-evaluated for every single token; a set also makes the membership
# test a constant-time lookup.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

s1 = ' '            # separator used to re-join the cleaned tokens
list_lc = []        # lower-cased version of every document
corpus_clean = []   # cleaned documents: lower-cased, stop words removed, stemmed
for line in corpus:
    lower_case = line.lower()
    list_lc.append(lower_case)
    b = tokenizer.tokenize(lower_case)  # keep runs of word characters only
    words_rmStop = [word for word in b if word not in stop_words]
    words_stem = [ps.stem(word) for word in words_rmStop]
    corpus_clean.append(s1.join(words_stem))

In [15]:
# Iterate over corpus_clean itself: the loop previously took its bound from
# `corpus` while indexing `corpus_clean`, which only works while both lists
# happen to have the same length.
for i, cleaned_doc in enumerate(corpus_clean):
    print('Cleaned example #{0:d}: "{1:s}"'.format(i + 1, cleaned_doc))

Cleaned example #1: "bear love"
Cleaned example #2: "tea bad"
Cleaned example #3: "bear drink bear"
Cleaned example #4: "bear drink tea"
Cleaned example #5: "love bear"


### Word Frequency (WF)

The "word frequency" (WF) method records the number of times each term occurs in a document.

In [85]:
vectorizer = CountVectorizer()
# Rows of array_WF are documents, columns are vocabulary terms.
array_WF = vectorizer.fit_transform(corpus_clean).toarray()
# vocabulary_ maps each term to its column index, so sorting the terms by that
# index yields the column labels. get_feature_names() was removed in
# scikit-learn 1.2, whereas vocabulary_ is available in every version.
feature_names_WF = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
print(feature_names_WF)
print(array_WF)

['bad', 'bear', 'drink', 'love', 'tea']
[[0 1 0 1 0]
 [1 0 0 0 1]
 [0 2 1 0 0]
 [0 1 1 0 1]
 [0 1 0 1 0]]


Print results for the "bag of words" (WF) representation.

In [86]:
# Column labels in column order (vocabulary_ maps term -> column index).
# get_feature_names() was removed in scikit-learn 1.2.
sorted_voc = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Header row: one 7-character column per vocabulary term.
print('\t\t', end = '')
for term in sorted_voc:
    print('{0:7s}'.format(term), end = '')
print('\t')

# One row per document, one cell per term. The loops previously used the
# vocabulary size as the number of rows and the document count as the number
# of columns, which only worked because this example has 5 of each.
n_docs, n_terms = array_WF.shape
for j in range(n_docs):
    print('Example #{0:d}'.format(j+1), end = '')
    for i in range(n_terms):
        print('{0:7d}'.format(array_WF[j][i]), end = '')
    print('         "{0:s}"\t'.format(corpus_clean[j]))

		bad    bear   drink  love   tea    	
Example #1      0      1      0      1      0         "bear love"	
Example #2      1      0      0      0      1         "tea bad"	
Example #3      0      2      1      0      0         "bear drink bear"	
Example #4      0      1      1      0      1         "bear drink tea"	
Example #5      0      1      0      1      0         "love bear"	


### Term Frequency (TF)


The term frequency (TF) method is used to reduce the influence of document length.

The way to calculate it: $\frac{\rm Word ~ Frequency}{\rm total ~ number ~ of ~ words ~ in ~ the ~ document}$.

In [83]:
# Divide every row of counts by the number of words in that document.
row_totals = array_WF.sum(axis=1, keepdims=True)
# A document with no remaining words has a row total of 0; leave its row at
# zero instead of producing NaN through a division by zero.
array_TF = np.divide(
    array_WF,
    row_totals,
    out=np.zeros(array_WF.shape, dtype=float),
    where=row_totals != 0,
)
# vocabulary_ maps term -> column index; get_feature_names() was removed in
# scikit-learn 1.2.
print(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))
print(array_TF)

['bad', 'bear', 'drink', 'love', 'tea']
[[0.         0.5        0.         0.5        0.        ]
 [0.5        0.         0.         0.         0.5       ]
 [0.         0.66666667 0.33333333 0.         0.        ]
 [0.         0.33333333 0.33333333 0.         0.33333333]
 [0.         0.5        0.         0.5        0.        ]]


You get the same results using `TfidfVectorizer`.

In [82]:
# With IDF weighting switched off and L1 normalisation, each row holds the
# term counts divided by the row's total.
vectorizer2 = TfidfVectorizer(use_idf=False, norm="l1")
array_TF1 = vectorizer2.fit_transform(corpus_clean).toarray()
# vocabulary_ maps term -> column index; get_feature_names() was removed in
# scikit-learn 1.2.
print(sorted(vectorizer2.vocabulary_, key=vectorizer2.vocabulary_.get))
print( array_TF1 )

['bad', 'bear', 'drink', 'love', 'tea']
[[0.         0.5        0.         0.5        0.        ]
 [0.5        0.         0.         0.         0.5       ]
 [0.         0.66666667 0.33333333 0.         0.        ]
 [0.         0.33333333 0.33333333 0.         0.33333333]
 [0.         0.5        0.         0.5        0.        ]]


Print results for the "term frequency" (TF) representation.

In [88]:
# Column labels in column order (vocabulary_ maps term -> column index).
# get_feature_names() was removed in scikit-learn 1.2.
sorted_voc = sorted(vectorizer2.vocabulary_, key=vectorizer2.vocabulary_.get)

# Header row: one 7-character column per vocabulary term.
print('\t     ', end = '')
for term in sorted_voc:
    print('{0:7s}'.format(term), end = '')
print('\t')

# One row per document, one cell per term. The loops previously used the
# vocabulary size as the number of rows and the document count as the number
# of columns, which only worked because this example has 5 of each.
n_docs, n_terms = array_TF1.shape
for j in range(n_docs):
    print('Example #{0:d}'.format(j+1), end = '')
    for i in range(n_terms):
        print('{0:7.2f}'.format(array_TF1[j][i]), end = '')
    print('         "{0:s}"\t'.format(corpus_clean[j]))

	     bad    bear   drink  love   tea    	
Example #1   0.00   0.50   0.00   0.50   0.00         "bear love"	
Example #2   0.50   0.00   0.00   0.00   0.50         "tea bad"	
Example #3   0.00   0.67   0.33   0.00   0.00         "bear drink bear"	
Example #4   0.00   0.33   0.33   0.00   0.33         "bear drink tea"	
Example #5   0.00   0.50   0.00   0.50   0.00         "love bear"	


### Term Frequency–Inverse Document Frequency (TF-IDF)

The formula used to compute the $\mbox{tf-idf}$ of term $t$ in document $d$ is

$\mbox{tf-idf}(t, d) = \mbox{tf}(t, d) \cdot \mbox{idf}(t)$

There are a number of ways to calculate $\mbox{tf}$ and $\mbox{idf}$. According to the `TfidfVectorizer` documentation,

$\mbox{tf}(t, d)$ here is the word frequency (the number of times term $t$ occurs in document $d$),

if `smooth_idf=False`, 
$\mbox{idf}$ is computed as $\mbox{idf}(t) = \log \left[ \frac{n}{{\rm df}(t)} \right] + 1$,

if `smooth_idf=True`, 
$\mbox{idf}$ is computed as $\mbox{idf}(t) = \log \left[ \frac{ 1+n }{ 1+{\rm df}(t) } \right] + 1$,

where $\log$ is the natural logarithm, $n$ is the total number of documents and $\mbox{df}(t)$ is the document frequency, i.e. the number of documents that contain term $t$.

In [90]:
# Smoothed IDF weighting with no row normalisation (norm=None).
vectorizer3 = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
array_TFIDF = vectorizer3.fit_transform(corpus_clean).toarray()
# vocabulary_ maps term -> column index; get_feature_names() was removed in
# scikit-learn 1.2.
print( sorted(vectorizer3.vocabulary_, key=vectorizer3.vocabulary_.get) )
print( array_TFIDF )

['bad', 'bear', 'drink', 'love', 'tea']
[[0.         1.18232156 0.         1.69314718 0.        ]
 [2.09861229 0.         0.         0.         1.69314718]
 [0.         2.36464311 1.69314718 0.         0.        ]
 [0.         1.18232156 1.69314718 0.         1.69314718]
 [0.         1.18232156 0.         1.69314718 0.        ]]


Print results for the "term frequency - inverse document frequency" (TF-IDF) representation.

In [89]:
# Column labels in column order (vocabulary_ maps term -> column index).
# get_feature_names() was removed in scikit-learn 1.2.
sorted_voc = sorted(vectorizer3.vocabulary_, key=vectorizer3.vocabulary_.get)

# Header row: one 7-character column per vocabulary term.
print('\t     ', end = '')
for term in sorted_voc:
    print('{0:7s}'.format(term), end = '')
print('\t')

# One row per document, one cell per term. The loops previously used the
# vocabulary size as the number of rows and the document count as the number
# of columns, which only worked because this example has 5 of each.
n_docs, n_terms = array_TFIDF.shape
for j in range(n_docs):
    print('Example #{0:d}'.format(j+1), end = '')
    for i in range(n_terms):
        print('{0:7.2f}'.format(array_TFIDF[j][i]), end = '')
    print('         "{0:s}"\t'.format(corpus_clean[j]))

	     bad    bear   drink  love   tea    	
Example #1   0.00   1.18   0.00   1.69   0.00         "bear love"	
Example #2   2.10   0.00   0.00   0.00   1.69         "tea bad"	
Example #3   0.00   2.36   1.69   0.00   0.00         "bear drink bear"	
Example #4   0.00   1.18   1.69   0.00   1.69         "bear drink tea"	
Example #5   0.00   1.18   0.00   1.69   0.00         "love bear"	


### Word Embedding

https://towardsdatascience.com/word-embeddings-exploration-explanation-and-exploitation-with-code-in-python-5dac99d5d795

Here we use `Word2Vec` as an example.
There are many ways to use word embeddings as features; here we combine the vectors of the words in a sentence by averaging them.

In [92]:
# Word2Vec expects each sentence as a list of tokens.
tokenized_sentences = [sentence.split() for sentence in corpus_clean]

EMBEDDING_SIZE = 100  # dimensionality of the learned word vectors
# gensim 4 renamed the `size` argument to `vector_size`. The install at the top
# of the notebook is not pinned, so accept either spelling: an unknown keyword
# raises TypeError before any training starts.
try:
    model = word2vec.Word2Vec(tokenized_sentences, vector_size=EMBEDDING_SIZE, min_count=1)
except TypeError:
    model = word2vec.Word2Vec(tokenized_sentences, size=EMBEDDING_SIZE, min_count=1)



In [93]:
# Look the vector up through model.wv: indexing the Word2Vec object directly
# is deprecated in gensim 3.x and no longer supported in gensim 4.
model.wv['love']

  """Entry point for launching an IPython kernel.



array([ 0.00201971,  0.0044131 , -0.00188085,  0.00446221, -0.00101235,
        0.00336833,  0.00483887, -0.00391704, -0.00100387, -0.00132857,
        0.00479649, -0.0023378 , -0.00184719,  0.00059363, -0.00248506,
       -0.00480535, -0.00199169, -0.00101469, -0.00466638, -0.00298863,
        0.00348795, -0.00208034,  0.00323432,  0.00089631, -0.00158379,
        0.00470077, -0.00437122, -0.00127288,  0.00344179,  0.00313251,
       -0.00368983,  0.00277271,  0.00407635,  0.00464457, -0.00481355,
        0.00304931,  0.00108448,  0.00011703, -0.00156844, -0.00179355,
        0.00135169, -0.00398491, -0.00296204, -0.00027267,  0.00409198,
       -0.00015581, -0.00379952,  0.00221578,  0.0015273 ,  0.00309993,
        0.00468623, -0.00387594, -0.00100491, -0.00405672, -0.00273765,
       -0.00189837,  0.00187329, -0.00186427,  0.00172515,  0.00349746,
       -0.00321778, -0.00095087,  0.00438536,  0.00034652,  0.00437058,
        0.00460702,  0.00441507, -0.00016692, -0.00429769, -0.00

In [94]:
# Similarity queries live on the keyed vectors (model.wv); calling them on the
# Word2Vec object is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar(['love'])

  """Entry point for launching an IPython kernel.



[('tea', 0.04989343136548996),
 ('bear', -0.03948143869638443),
 ('bad', -0.04141325503587723),
 ('drink', -0.06769246608018875)]

In [95]:
def buildWordVector(text, size, keyed_vectors=None):
    """Average the embedding vectors of the words in ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens (one cleaned document).
    size : int
        Dimensionality of the embedding vectors.
    keyed_vectors : mapping, optional
        Word -> vector lookup that raises ``KeyError`` for unknown words.
        Defaults to ``model.wv`` of the notebook-level Word2Vec model.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)`` holding the mean of the vectors of the
        words that have an embedding; all zeros when none of them has one
        (for example when ``text`` is empty).
    """
    if keyed_vectors is None:
        # Indexing the Word2Vec object itself is deprecated; use its .wv.
        keyed_vectors = model.wv

    vec = np.zeros((1, size))
    count = 0
    # split() with no argument returns [] for an empty string, whereas
    # split(' ') returns [''], which made the zero-count guard below
    # unreachable and raised KeyError for empty documents.
    for word in text.split():
        try:
            word_vec = keyed_vectors[word]
        except KeyError:
            # Word has no embedding: leave it out of the average.
            continue
        vec += np.asarray(word_vec).reshape((1, size))
        count += 1
    if count:
        vec /= count
    return vec

In [96]:
# Build one averaged (1, 100) vector per cleaned document, then stack them
# row-wise into a single matrix.
doc_vectors = []
for cleaned_doc in corpus_clean:
    doc_vectors.append(buildWordVector(cleaned_doc, 100))
array_wordEmbedding = np.concatenate(doc_vectors, axis=0)

  



In [97]:
# Display the document-embedding matrix (one row per cleaned document).
print(array_wordEmbedding)

[[ 2.41971167e-04  8.54137121e-04  1.17415149e-03  4.63191746e-03
  -6.03307562e-05  3.10892600e-03  3.60774726e-03  3.79402656e-04
   9.75768664e-04  5.12669038e-04  3.91771831e-03 -1.50340647e-03
   1.08108588e-03 -6.18075137e-06 -2.76957871e-04 -3.28093220e-03
  -1.82390388e-04  1.84900156e-03  1.20697077e-04 -9.53201263e-04
   2.09853752e-05 -3.18862859e-03  1.17352116e-04  6.92134694e-04
   1.05383032e-03  4.00270883e-03 -3.78950220e-03 -3.52097355e-04
   3.03843187e-03  1.44166926e-03 -7.86845689e-04 -1.00596051e-03
   1.44461612e-03  4.47982270e-03 -2.16282866e-03 -9.43378545e-05
   1.94498088e-03  5.97301045e-04 -1.16750022e-03 -2.96385260e-05
   8.67735580e-04 -1.90259558e-03  7.20671378e-04 -1.97848966e-03
   2.38357892e-03 -2.53623130e-03 -4.39201796e-03 -3.58355232e-04
  -1.50532206e-03  2.74961535e-03  4.13743942e-03  5.60019631e-04
  -1.88610528e-03 -2.96315702e-03 -2.15398712e-03 -1.10591236e-03
   2.45791767e-03  1.34875003e-03  1.08548875e-03  1.22183905e-03
  -2.16459