# <font color='#2ecc71'>Co-Occurrence Matrix<br>Frequency-based Word Embedding</font>

In [None]:
import numpy as np
import nltk
from nltk import bigrams
import itertools
import pandas as pd

### <font color='#2ecc71'>Define a function that loops through the bigrams, recording the previous and current word of each pair.<br>Then record the number of occurrences of each bigram.<br>[NLTK probability module](http://www.nltk.org/api/nltk.html?highlight=freqdist)</font>

In [None]:
# Build a set with set() from a sequence of three fruit names.
x = set(["apple", "banana", "cherry"])

# Show the set; the order of the printed items is not guaranteed.
print(x)

The set() function creates a set object.

The items in a set are unordered, so they may appear in a different order each time the set is printed.

In [None]:
def generate_co_occurrence_matrix(corpus):
    """Build a bigram co-occurrence matrix for a sequence of tokens.

    Parameters
    ----------
    corpus : iterable of hashable
        The tokens in reading order. It is copied into a list first, so a
        one-shot iterator/generator is accepted as well as a list.

    Returns
    -------
    co_occurrence_matrix : numpy.matrix
        Square float matrix with one row and one column per distinct token.
        Entry ``[vocab_index[current], vocab_index[previous]]`` holds the
        number of times ``previous`` is immediately followed by ``current``.
    vocab_index : dict
        Maps each distinct token to its row/column position. Tokens are
        numbered in order of first appearance in ``corpus``.
    """
    # Materialise once: the tokens are traversed twice below (vocabulary and
    # bigrams), which would silently exhaust a generator after the first pass.
    tokens = list(corpus)

    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # matrix layout is identical on every run. Iterating a set of strings is
    # not stable across interpreter sessions because of hash randomisation.
    vocab_index = {word: i for i, word in enumerate(dict.fromkeys(tokens))}

    # Frequency distribution of bigrams: (previous, current) -> num_occurrences
    bigram_freq = nltk.FreqDist(bigrams(tokens))

    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current, previous]
    size = len(vocab_index)
    co_occurrence_matrix = np.zeros((size, size))

    # Each distinct bigram appears exactly once in the distribution, so a plain
    # assignment (rather than +=) is sufficient; no sorting is needed either.
    for (previous, current), count in bigram_freq.items():
        co_occurrence_matrix[vocab_index[current], vocab_index[previous]] = count

    # np.matrix is kept so existing callers see the same return type.
    # NOTE(review): NumPy's docs discourage np.matrix in new code; consider
    # returning the plain ndarray once callers have been checked.
    return np.matrix(co_occurrence_matrix), vocab_index
 

The FreqDist class is used to encode “frequency distributions”, which count the number of times that each outcome of an experiment occurs.

# <font color='#2ecc71'>Create DataFrame & Calculate Similarity Scores Between 2 Words - Pass Corpus Into Function Defined Above.</font>

In [None]:
# Two toy sentences, each already split into word tokens.
corpus = [
    ['penny', 'wise', 'penny', 'foolish'],
    ['a', 'penny', 'saved', 'is', 'a', 'penny', 'earned'],
]

# Flatten the sentences into a single token list for the matrix builder.
# NOTE(review): flattening also pairs the last word of the first sentence with
# the first word of the second ('foolish' -> 'a'), so that cross-sentence
# bigram is counted too -- confirm this is intended.
mylist = list(itertools.chain(*corpus))
matrix, vocab_index = generate_co_occurrence_matrix(mylist)

# Label both axes with the vocabulary words (the keys of vocab_index, in
# index order) so each cell can be looked up by word.
co_matrix = pd.DataFrame(
    matrix,
    index=list(vocab_index),
    columns=list(vocab_index),
)
print(co_matrix)

         earned  penny  saved   is    a  wise  foolish
earned      0.0    1.0    0.0  0.0  0.0   0.0      0.0
penny       0.0    0.0    0.0  0.0  2.0   1.0      0.0
saved       0.0    1.0    0.0  0.0  0.0   0.0      0.0
is          0.0    0.0    1.0  0.0  0.0   0.0      0.0
a           0.0    0.0    0.0  1.0  0.0   0.0      1.0
wise        0.0    1.0    0.0  0.0  0.0   0.0      0.0
foolish     0.0    1.0    0.0  0.0  0.0   0.0      0.0
