In [15]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import itertools
from collections import Counter

## Create coocurence matrix

**Step1**: preprocessing 

Here, we use a single sentence as our corpus to illustrate the computation of word co-occurence. The creation of a vocabulary and a set of index is necessary for the following calculation.

In [16]:
#create corpus
corpus_test = [["your", "model", "is", "only", "as", "good", "as", "your", "data"]]

#build vocabulary
vocab = set()
for s in corpus_test:
  vocab = vocab.union(set(s))

#get all words
words = [item for sublist in corpus_test for item in sublist]

#count the occurence of each word
counter = Counter(words)

#word occurence in the corpus
print(Counter(words))

Counter({'your': 2, 'as': 2, 'model': 1, 'is': 1, 'only': 1, 'good': 1, 'data': 1})


In [10]:
#create index for the vocabulary
vocab_index = {word: i for i, word in enumerate(vocab)}

print(vocab_index)

{'as': 0, 'model': 1, 'good': 2, 'is': 3, 'your': 4, 'data': 5, 'only': 6}

**Step2**: co-ocurrence matrix of all words

To count the co-occurence of the words, we need to select a center word and a context word in each iteration, thus, a combination of two loops is needed. 

In [12]:
co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))

for s in corpus_test:
  for i in range(len(s)):
    word_i = s[i]
    pos_i = vocab_index[word_i]
    for j in range(max(i-5,0), min(i+6,len(s))):
      if j!= i:
        word_j = s[j]
        pos_j = vocab_index[word_j]
        co_occurrence_matrix[pos_i][pos_j] = co_occurrence_matrix[pos_i][pos_j] + 0.5/abs(i-j)
        co_occurrence_matrix[pos_j][pos_i] = co_occurrence_matrix[pos_i][pos_j]

co_occurrence_matrix = np.matrix(co_occurrence_matrix)
co_occurrence_matrix

matrix([[0.5       , 0.53333333, 2.        , 0.75      , 1.58333333,
         0.75      , 1.33333333],
        [0.53333333, 0.        , 0.25      , 1.        , 1.        ,
         0.        , 0.5       ],
        [2.        , 0.25      , 0.        , 0.33333333, 0.7       ,
         0.33333333, 0.5       ],
        [0.75      , 1.        , 0.33333333, 0.        , 0.7       ,
         0.        , 1.        ],
        [1.58333333, 1.        , 0.7       , 0.7       , 0.        ,
         1.        , 0.58333333],
        [0.75      , 0.        , 0.33333333, 0.        , 1.        ,
         0.        , 0.2       ],
        [1.33333333, 0.5       , 0.5       , 1.        , 0.58333333,
         0.2       , 0.        ]])