In [46]:
import numpy as np
import pandas as pd
from pprint import pprint

In [47]:
# Function defs

def tokenize(corpus: "list[str] | str") -> list:
    """Split each sentence of a corpus into whitespace-separated tokens.

    Args:
        corpus: An iterable of sentence strings. A single bare string is
            accepted as well and is treated as a one-sentence corpus;
            without that guard, iterating a string would yield one
            "sentence" per character.

    Returns:
        A list with one inner list of tokens per sentence, in input order.
    """
    if isinstance(corpus, str):
        corpus = [corpus]
    return [sentence.split() for sentence in corpus]

In [48]:
def word_to_index(tokens):
    """Map every distinct token to an integer index.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        tokens: An iterable of sentences, each an iterable of hashable
            tokens (the tokens become dictionary keys).

    Returns:
        A dict ``{token: index}``.
    """
    word2idx = {}
    for sentence in tokens:
        for token in sentence:
            # setdefault only inserts unseen tokens; len(word2idx) is the
            # next free index. This replaces a linear list-membership scan
            # per token with a constant-time dict lookup.
            word2idx.setdefault(token, len(word2idx))
    return word2idx


In [49]:
def index_to_word(tokens):
    """Map integer indices to distinct tokens.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        tokens: An iterable of sentences, each an iterable of hashable
            tokens.

    Returns:
        A dict ``{index: token}``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, avoiding a
    # linear list-membership scan for every token.
    ordered_vocab = dict.fromkeys(
        token for sentence in tokens for token in sentence
    )
    return dict(enumerate(ordered_vocab))

In [50]:
def generate_center_context_pair(tokens, window: int) -> dict:
    """Collect, for each center word, the words seen around it.

    Args:
        tokens: A sequence of tokenized sentences (each an indexable
            sequence of words).
        window: How many positions to the left and to the right of the
            center word count as context.

    Returns:
        A dict mapping each center word to the list of context words found
        within ``window`` positions of any of its occurrences. Repeats are
        kept, and the list follows the order in which occurrences were
        visited.
    """
    pairs = {}
    for sentence in tokens:
        sentence_len = len(sentence)
        for position, center_word in enumerate(sentence):
            # Register the word even when it ends up with no neighbours.
            contexts = pairs.setdefault(center_word, [])
            # Clamp the window to the sentence boundaries.
            first = max(position - window, 0)
            stop = min(position + window + 1, sentence_len)
            contexts.extend(
                sentence[j] for j in range(first, stop) if j != position
            )
    return pairs

In [51]:
def generate_jdt(cc_pair: dict) -> list:
    """Flatten a center -> contexts mapping into ``[center, context]`` rows.

    Args:
        cc_pair: A dict mapping each center word to a list of its context
            words.

    Returns:
        A list with one two-element list ``[center, context]`` per context
        word, following the dict's iteration order. Centers with no
        contexts contribute no rows.
    """
    return [
        [center, context]
        for center, contexts in cc_pair.items()
        for context in contexts
    ]

In [52]:
def all_p_of_context_given_center(joint_distrib_table: pd.DataFrame):
    """Count each (center, context) pair alongside its center's total.

    Args:
        joint_distrib_table: A DataFrame with a ``'center'`` and a
            ``'context'`` column, one row per observed pair.

    Returns:
        A dict keyed by ``(center, context)`` whose value is the two-element
        list ``[pair_count, center_total]``: the numerator and denominator
        of the conditional frequency of the context given the center. The
        ratio itself is not computed here.
    """
    # Numerators: number of rows for each (center, context) combination.
    counts = joint_distrib_table.groupby(['center', 'context']).size().to_dict()

    # Denominators: number of rows for each center.
    total = joint_distrib_table.groupby('center').size().to_dict()

    # Look the denominator up by value. Matching centers with `is` depends
    # on object identity and can leave entries as bare counts.
    return {
        pair: [pair_count, total[pair[0]]]
        for pair, pair_count in counts.items()
    }

In [53]:
# Toy corpus: one lowercase, whitespace-separated sentence per entry.
corpus = [
    # person sentences
    "he is a king",
    "she is a queen",
    "he is a man",
    "she is a woman",
    # capital-city sentences
    "warsaw is poland capital",
    "berlin is germany capital",
    "paris is france capital",
    # disabled extra sentence:
    # "Sxi este juna kaj bela",
]

In [54]:
def main():
    """Run the demo pipeline on the module-level ``corpus`` and print each stage.

    Steps: tokenize the corpus, collect center/context words with a window
    of 2, build the joint distribution table, compute the pair counts, and
    print both vocabulary mappings.

    Side effect: rebinds the module-level name ``jdt`` to the joint
    distribution DataFrame (columns ``'center'`` and ``'context'``).
    """
    pprint(corpus)

    tokens = tokenize(corpus)
    cc_pair = generate_center_context_pair(tokens, 2)

    pprint(cc_pair)

    # NOTE(review): kept as a global, presumably so other cells can inspect
    # the table after main() runs; confirm before making it local.
    global jdt
    # Build the frame straight from the [center, context] rows. Routing them
    # through np.asarray and slicing [:, 0] raises IndexError when there are
    # no pairs, because the empty array is one-dimensional.
    jdt = pd.DataFrame(generate_jdt(cc_pair), columns=['center', 'context'])
    print("Joint Distribution Table")
    print(jdt[:10])

    cc_pair_counts = all_p_of_context_given_center(jdt)
    pprint(cc_pair_counts)
    wordtoindex = word_to_index(tokens)
    indextoword = index_to_word(tokens)
    print(wordtoindex)
    print(indextoword)


if __name__ == "__main__":
    main()


['he is a king',
 'she is a queen',
 'he is a man',
 'she is a woman',
 'warsaw is poland capital',
 'berlin is germany capital',
 'paris is france capital']
{'a': ['he',
       'is',
       'king',
       'she',
       'is',
       'queen',
       'he',
       'is',
       'man',
       'she',
       'is',
       'woman'],
 'berlin': ['is', 'germany'],
 'capital': ['is', 'poland', 'is', 'germany', 'is', 'france'],
 'france': ['paris', 'is', 'capital'],
 'germany': ['berlin', 'is', 'capital'],
 'he': ['is', 'a', 'is', 'a'],
 'is': ['he',
        'a',
        'king',
        'she',
        'a',
        'queen',
        'he',
        'a',
        'man',
        'she',
        'a',
        'woman',
        'warsaw',
        'poland',
        'capital',
        'berlin',
        'germany',
        'capital',
        'paris',
        'france',
        'capital'],
 'king': ['is', 'a'],
 'man': ['is', 'a'],
 'paris': ['is', 'france'],
 'poland': ['warsaw', 'is', 'capital'],
 'queen': ['is', '