In [7]:
pip install scipy

Collecting scipy
  Downloading scipy-1.7.1-cp38-cp38-win_amd64.whl (33.7 MB)
Installing collected packages: scipy
Successfully installed scipy-1.7.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import sys
sys.path.append('..')
from Vocabulary import *
import math
import scipy

# Toy corpus: a whitespace-tokenised sentence that mixes words with
# punctuation-only tokens ('.', ',', '!').
sentence = '. , a b c d . , ! and then he was getting better . . , a'.split()

# Build the vocabulary from the token list using the project's Vocabulary class.
vocab = Vocabulary()
vocab.build_from_text(sentence)
# Presumably drops tokens consisting only of symbols; the frequencies printed
# below contain no punctuation entries -- confirm against Vocabulary.
vocab.filter_just_symbol_tokens()
print(vocab.word_frequency )
# shuffle=False: presumably keeps id assignment deterministic -- confirm
# against Vocabulary.assignIds.
vocab.assignIds(shuffle=False)

{'a': 1, 'b': 1, 'c': 1, 'd': 1, 'and': 1, 'then': 1, 'he': 1, 'was': 1, 'getting': 1, 'better': 1}


In [2]:
class Co_Occurence_Capturer:
    """Accumulate distance-weighted co-occurrence counts between word ids.

    Counts are kept in ``self.co_occurences``, a dict mapping
    ``(focus_id, context_id)`` tuples to a float. Every time a context word is
    seen inside the window of a focus word, ``1.0 / dist`` is added to the
    corresponding entry.
    """

    def __init__(self):
        # (focus_id, context_id) -> accumulated 1/dist weight
        self.co_occurences = {}

    def _assign_entrys(self, word_ids, context_ids, dist):
        """Add ``1.0 / dist`` to every (word_id, context_id) combination.

        Args:
            word_ids: iterable of ids for the focus word.
            context_ids: iterable of ids for the context word.
            dist: non-zero distance between focus and context word.
        """
        for word_id in word_ids:
            for context_id in context_ids:
                key = (word_id, context_id)
                self.co_occurences[key] = self.co_occurences.get(key, 0.0) + 1.0 / dist

    @staticmethod
    def _window_contexts(text, vocab, focus_index, window_length):
        """Return the ``(context_word, dist)`` pairs around ``text[focus_index]``.

        Only tokens present in ``vocab.word2Id`` are collected; other tokens
        are skipped and do not count towards the window size. ``dist`` is the
        rank among the collected words (1 = nearest), not the raw token offset.
        Left contexts come first (farthest to nearest), followed by the right
        contexts (nearest to farthest).
        """
        # Scan leftwards; `left` is filled nearest-first.
        left = []
        position = focus_index - 1
        while len(left) < window_length and position >= 0:
            word = text[position]
            if word in vocab.word2Id:  # skip filtered-out words
                left.append(word)
            position -= 1
        contexts = [(word, rank + 1) for rank, word in enumerate(left)]
        contexts.reverse()  # farthest first, i.e. original text order

        # Scan rightwards; distances grow with every accepted word.
        accepted = 0
        position = focus_index + 1
        while accepted < window_length and position < len(text):
            word = text[position]
            if word in vocab.word2Id:
                accepted += 1
                contexts.append((word, accepted))
            position += 1
        return contexts

    # Window length is the one-sided length.
    # The window is applied on the left and the right.
    # A window size of 0 means just the focus word.
    def capture_co_occurences(self, text, vocab, window_length, block_length):
        """Scan ``text`` and accumulate weighted co-occurrences.

        The vocabulary is processed in ``ceil(vocab.get_size() / block_length)``
        blocks per axis; for every (focus block, context block) pair the whole
        text is scanned once.

        Args:
            text: sequence of tokens.
            vocab: object providing ``get_size()``, ``setBlock_parms()``,
                ``get_contrained_ids_text(word, block)`` and ``word2Id``.
                ``get_contrained_ids_text`` presumably returns the ids of
                ``word`` that fall into the given block -- confirm against
                Vocabulary.
            window_length: number of in-vocabulary words considered on each
                side of the focus word.
            block_length: block size used to split the vocabulary; must be
                positive.

        Returns:
            The ``self.co_occurences`` dict (the same object, not a copy).

        Raises:
            ValueError: if ``block_length`` is not positive.
        """
        if block_length <= 0:
            raise ValueError("block_length must be a positive number")

        amount_split = int(math.ceil(vocab.get_size() / float(block_length)))
        vocab.setBlock_parms(block_length)

        for x in range(amount_split):
            for y in range(amount_split):
                for focus_index, focus_word in enumerate(text):
                    focus_ids = vocab.get_contrained_ids_text(focus_word, x)
                    contexts = self._window_contexts(
                        text, vocab, focus_index, window_length
                    )
                    for context_word, dist in contexts:
                        context_ids = vocab.get_contrained_ids_text(context_word, y)
                        self._assign_entrys(focus_ids, context_ids, dist)

        return self.co_occurences

    # NOTE(review): `cloudpickle` is not imported in the visible notebook
    # cells; presumably it is provided by `from Vocabulary import *` -- confirm.
    def save_coocurrences(self, file_name):
        """Pickle the counts to ``file_name`` and reset the in-memory dict."""
        with open(file_name, 'wb') as file:
            cloudpickle.dump(self.co_occurences, file)
        self.co_occurences = {}

    def load_co_occurence(self, name):
        """Replace the in-memory counts with the dict pickled in ``name``.

        Unpickling can execute arbitrary code; only load files you trust.
        """
        # Read-only mode: loading must not require write permission.
        with open(name, 'rb') as file:
            self.co_occurences = cloudpickle.load(file)

In [3]:
# Show the vocabulary size reported by the Vocabulary object.
print(vocab.get_size())

10


In [4]:
# Capture co-occurrences with a one-sided window of 3 words. The block length
# (200) is larger than the vocabulary size printed above, so the vocabulary is
# processed as a single block.
capturer = Co_Occurence_Capturer()
c = capturer.capture_co_occurences(
    text=sentence,
    vocab=vocab,
    window_length=3,
    block_length=200,
)

In [5]:
# Print the word -> id mapping next to the captured dictionary so the
# (focus_id, context_id) keys can be read back as words.
print(vocab.word2Id)
print(c)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'and': 4, 'then': 5, 'he': 6, 'was': 7, 'getting': 8, 'better': 9}
{(0, 1): 1.0, (0, 2): 0.5, (0, 3): 0.3333333333333333, (1, 0): 1.0, (1, 2): 1.0, (1, 3): 0.5, (1, 4): 0.3333333333333333, (2, 0): 0.5, (2, 1): 1.0, (2, 3): 1.0, (2, 4): 0.5, (2, 5): 0.3333333333333333, (3, 0): 0.3333333333333333, (3, 1): 0.5, (3, 2): 1.0, (3, 4): 1.0, (3, 5): 0.5, (3, 6): 0.3333333333333333, (4, 1): 0.3333333333333333, (4, 2): 0.5, (4, 3): 1.0, (4, 5): 1.0, (4, 6): 0.5, (4, 7): 0.3333333333333333, (5, 2): 0.3333333333333333, (5, 3): 0.5, (5, 4): 1.0, (5, 6): 1.0, (5, 7): 0.5, (5, 8): 0.3333333333333333, (6, 3): 0.3333333333333333, (6, 4): 0.5, (6, 5): 1.0, (6, 7): 1.0, (6, 8): 0.5, (6, 9): 0.3333333333333333, (7, 4): 0.3333333333333333, (7, 5): 0.5, (7, 6): 1.0, (7, 8): 1.0, (7, 9): 0.5, (8, 5): 0.3333333333333333, (8, 6): 0.5, (8, 7): 1.0, (8, 9): 1.0, (9, 6): 0.3333333333333333, (9, 7): 0.5, (9, 8): 1.0}


In [6]:
# numpy is imported here because this cell needs `np.float32`; previously the
# cell only worked if a later cell's `import numpy as np` had already run.
import numpy as np
from scipy.sparse import dok_matrix

# Square (vocabulary x vocabulary) sparse matrix of co-occurrence weights.
S = dok_matrix((vocab.get_size(), vocab.get_size()), dtype=np.float32)
# Fill the matrix through public item assignment rather than the private
# `_update` hook, which is not part of SciPy's documented API.
for (focus_id, context_id), weight in c.items():
    S[focus_id, context_id] = weight
print(S.toarray())


[[0.         1.         0.5        0.33333334 0.         0.
  0.         0.         0.         0.        ]
 [1.         0.         1.         0.5        0.33333334 0.
  0.         0.         0.         0.        ]
 [0.5        1.         0.         1.         0.5        0.33333334
  0.         0.         0.         0.        ]
 [0.33333334 0.5        1.         0.         1.         0.5
  0.33333334 0.         0.         0.        ]
 [0.         0.33333334 0.5        1.         0.         1.
  0.5        0.33333334 0.         0.        ]
 [0.         0.         0.33333334 0.5        1.         0.
  1.         0.5        0.33333334 0.        ]
 [0.         0.         0.         0.33333334 0.5        1.
  0.         1.         0.5        0.33333334]
 [0.         0.         0.         0.         0.33333334 0.5
  1.         0.         1.         0.5       ]
 [0.         0.         0.         0.         0.         0.33333334
  0.5        1.         0.         1.        ]
 [0.         0.    

In [7]:
import numpy as np

In [8]:
# Dense view of the co-occurrence matrix, rounded to two decimals for display.
np.round(S.toarray(), decimals=2)

array([[0.  , 1.  , 0.5 , 0.33, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [1.  , 0.  , 1.  , 0.5 , 0.33, 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.5 , 1.  , 0.  , 1.  , 0.5 , 0.33, 0.  , 0.  , 0.  , 0.  ],
       [0.33, 0.5 , 1.  , 0.  , 1.  , 0.5 , 0.33, 0.  , 0.  , 0.  ],
       [0.  , 0.33, 0.5 , 1.  , 0.  , 1.  , 0.5 , 0.33, 0.  , 0.  ],
       [0.  , 0.  , 0.33, 0.5 , 1.  , 0.  , 1.  , 0.5 , 0.33, 0.  ],
       [0.  , 0.  , 0.  , 0.33, 0.5 , 1.  , 0.  , 1.  , 0.5 , 0.33],
       [0.  , 0.  , 0.  , 0.  , 0.33, 0.5 , 1.  , 0.  , 1.  , 0.5 ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.33, 0.5 , 1.  , 0.  , 1.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.33, 0.5 , 1.  , 0.  ]],
      dtype=float32)