In [261]:
from collections import OrderedDict
from nltk.tokenize import word_tokenize
import numpy as np
from math import log2
from nltk import FreqDist
from numpy.linalg import svd
import matplotlib.pyplot as plt

In [262]:
document = ['This is the first book','This book is the second book','And this is the third one', 'Is this the first book']

tokenized_words = [word_tokenize(sent) for sent in document]
print("Tokenized words\n")
print(tokenized_words)

Tokenized words

[['This', 'is', 'the', 'first', 'book'], ['This', 'book', 'is', 'the', 'second', 'book'], ['And', 'this', 'is', 'the', 'third', 'one'], ['Is', 'this', 'the', 'first', 'book']]


In [263]:
lowered_words = [[w.lower() for w in word] for word in tokenized_words]
print("Lower cased words\n")
print(lowered_words)

Lower cased words

[['this', 'is', 'the', 'first', 'book'], ['this', 'book', 'is', 'the', 'second', 'book'], ['and', 'this', 'is', 'the', 'third', 'one'], ['is', 'this', 'the', 'first', 'book']]


In [264]:
tokens = [w for words in lowered_words for w in words]
tokens

['this',
 'is',
 'the',
 'first',
 'book',
 'this',
 'book',
 'is',
 'the',
 'second',
 'book',
 'and',
 'this',
 'is',
 'the',
 'third',
 'one',
 'is',
 'this',
 'the',
 'first',
 'book']

In [265]:
#vocab = list(sorted(set(tokens)))
vocab = list(set(tokens))
print(vocab)

['first', 'is', 'the', 'one', 'this', 'and', 'third', 'book', 'second']


In [266]:
occurrences = OrderedDict((name, OrderedDict((name, 0) for name in vocab)) for name in vocab)

for l in lowered_words:
    for i in range(len(l)):
        for item in l[:i] + l[i + 1:]:
                occurrences[l[i]][item] += 1

In [267]:
document

['This is the first book',
 'This book is the second book',
 'And this is the third one',
 'Is this the first book']

In [268]:
print('w/c\t|', '\t'.join(occurrences.keys()))
print("--------------------------------------------------------------------------------------")
co_occurances = []
for name, values in occurrences.items():
    co_occurances.append(values.values())
    #print(name, ' '.join(str(i) for i in values.values()))
    print(name + "\t|",'\t'.join(str(i) for i in values.values()),)

w/c	| first	is	the	one	this	and	third	book	second
--------------------------------------------------------------------------------------
first	| 0	2	2	0	2	0	0	2	0
is	| 2	0	4	1	4	1	1	4	1
the	| 2	4	0	1	4	1	1	4	1
one	| 0	1	1	0	1	1	1	0	0
this	| 2	4	4	1	0	1	1	4	1
and	| 0	1	1	1	1	0	1	0	0
third	| 0	1	1	1	1	1	0	0	0
book	| 2	4	4	0	4	0	0	2	2
second	| 0	1	1	0	1	0	0	2	0


In [269]:
co_words = list(occurrences.keys())
print(co_words)

['first', 'is', 'the', 'one', 'this', 'and', 'third', 'book', 'second']


In [270]:
document

['This is the first book',
 'This book is the second book',
 'And this is the third one',
 'Is this the first book']

In [271]:
a = []
for name, values in occurrences.items():
    for i in range(len(co_words)):
        a.append(values.popitem())

In [272]:
mat = []
for i in range(len(a)):
    mat.append(a[i][1])

In [273]:
mat = np.reshape(b,(9,9))
print(mat.T)

[[0 1 1 0 1 0 0 2 0]
 [2 4 4 0 4 0 0 2 2]
 [0 1 1 1 1 1 0 0 0]
 [0 1 1 1 1 0 1 0 0]
 [2 4 4 1 0 1 1 4 1]
 [0 1 1 0 1 1 1 0 0]
 [2 4 0 1 4 1 1 4 1]
 [2 0 4 1 4 1 1 4 1]
 [0 2 2 0 2 0 0 2 0]]


In [274]:
freqDist = FreqDist(tokens)

In [275]:
freqDist

FreqDist({'this': 4, 'is': 4, 'the': 4, 'book': 4, 'first': 2, 'second': 1, 'and': 1, 'third': 1, 'one': 1})

In [276]:
m,n = np.shape(mat)

pmi_mat = np.zeros((m,n))
for i in range(m):
    for j in range(n):
        if mat[i][j] == 0:
            pmi_mat[i][j] = 0
        else:
            w = co_words[i]
            c = co_words[j]
            #print(freqDist[w]*freqDist[c])
            pmi = log2((mat[i][j]*m)/(freqDist[w]*freqDist[c]))
            #print(int(mat[i][j]))
            if pmi <= 0:
                pmi_mat[i][j] = 0
            else:
                pmi_mat[i][j] = pmi

In [277]:
pmi_mat

array([[0.      , 1.169925, 0.      , 0.      , 1.169925, 0.      ,
        3.169925, 1.169925, 0.      ],
       [0.169925, 1.169925, 0.      , 1.169925, 1.169925, 1.169925,
        3.169925, 0.      , 2.169925],
       [0.169925, 1.169925, 0.      , 1.169925, 1.169925, 1.169925,
        0.      , 1.169925, 2.169925],
       [0.      , 0.      , 1.169925, 3.169925, 1.169925, 0.      ,
        3.169925, 1.169925, 0.      ],
       [0.169925, 1.169925, 0.      , 1.169925, 0.      , 1.169925,
        3.169925, 1.169925, 2.169925],
       [0.      , 0.      , 1.169925, 0.      , 1.169925, 3.169925,
        3.169925, 1.169925, 0.      ],
       [0.      , 0.      , 0.      , 3.169925, 1.169925, 3.169925,
        3.169925, 1.169925, 0.      ],
       [1.169925, 0.169925, 0.      , 0.      , 1.169925, 0.      ,
        3.169925, 1.169925, 2.169925],
       [0.      , 2.169925, 0.      , 0.      , 1.169925, 0.      ,
        3.169925, 1.169925, 0.      ]])

In [278]:
u,s,v = svd(pmi_mat)
word_mat = np.matmul(u,np.diag(s))
print(word_mat)

[[-3.23661990e+00  1.15916690e+00  1.43898218e+00 -1.16999633e-16
   3.94278509e-01  1.33175386e-01 -1.10404602e-15 -2.29549020e-01
   1.96641933e-01]
 [-4.15870847e+00  8.20790224e-01 -9.85420005e-01  1.50491764e-15
  -1.39166567e-01 -7.45321610e-01 -8.27261902e-01  2.48260470e-01
   1.72940166e-02]
 [-2.05397762e+00  2.40159111e-01 -2.32118515e+00 -6.26794622e-17
   1.03145148e+00  9.20987643e-01  2.83200771e-16  6.25360573e-02
   2.54845385e-02]
 [-4.08962236e+00 -1.17276404e+00  7.55502194e-01 -2.24147546e+00
  -3.44290246e-01  4.44375202e-01  1.97654273e-15  4.37706589e-01
  -6.93133590e-03]
 [-4.15870847e+00  8.20790224e-01 -9.85420005e-01 -4.21662479e-16
  -1.39166567e-01 -7.45321610e-01  8.27261902e-01  2.48260470e-01
   1.72940166e-02]
 [-4.08962236e+00 -1.17276404e+00  7.55502194e-01  2.24147546e+00
  -3.44290246e-01  4.44375202e-01  2.33373409e-15  4.37706589e-01
  -6.93133590e-03]
 [-4.98517809e+00 -2.71109327e+00 -2.83027789e-01  1.52897147e-15
   2.47231321e-01 -3.3037634

In [279]:
s = np.diag(s)
word_mat = np.matmul(u,s[:,0:10])
print(word_mat)

[[-3.23661990e+00  1.15916690e+00  1.43898218e+00 -1.16999633e-16
   3.94278509e-01  1.33175386e-01 -1.10404602e-15 -2.29549020e-01
   1.96641933e-01]
 [-4.15870847e+00  8.20790224e-01 -9.85420005e-01  1.50491764e-15
  -1.39166567e-01 -7.45321610e-01 -8.27261902e-01  2.48260470e-01
   1.72940166e-02]
 [-2.05397762e+00  2.40159111e-01 -2.32118515e+00 -6.26794622e-17
   1.03145148e+00  9.20987643e-01  2.83200771e-16  6.25360573e-02
   2.54845385e-02]
 [-4.08962236e+00 -1.17276404e+00  7.55502194e-01 -2.24147546e+00
  -3.44290246e-01  4.44375202e-01  1.97654273e-15  4.37706589e-01
  -6.93133590e-03]
 [-4.15870847e+00  8.20790224e-01 -9.85420005e-01 -4.21662479e-16
  -1.39166567e-01 -7.45321610e-01  8.27261902e-01  2.48260470e-01
   1.72940166e-02]
 [-4.08962236e+00 -1.17276404e+00  7.55502194e-01  2.24147546e+00
  -3.44290246e-01  4.44375202e-01  2.33373409e-15  4.37706589e-01
  -6.93133590e-03]
 [-4.98517809e+00 -2.71109327e+00 -2.83027789e-01  1.52897147e-15
   2.47231321e-01 -3.3037634