In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

In [2]:
def tokenize(corpus: "list[str] | str") -> "list[list[str]]":
    """Split every sentence of a corpus into whitespace-delimited tokens.

    Args:
        corpus: An iterable of sentence strings. A single bare string is
            treated as a corpus containing exactly one sentence, rather than
            being iterated character by character.

    Returns:
        One list of tokens per sentence, in corpus order. An empty corpus
        yields an empty list.
    """
    # A str is itself iterable; without this guard each *character* would be
    # treated as a sentence and the result would be a list of 0/1-item lists.
    if isinstance(corpus, str):
        corpus = [corpus]
    return [sentence.split() for sentence in corpus]

In [3]:
def generate_center_context_pair(tokens, window: int) -> dict:
    """Collect, for every word, the words seen within ``window`` positions of it.

    Args:
        tokens: Sequence of tokenized sentences (each a sequence of words).
        window: Maximum distance, in tokens, on either side of the center
            word for a neighbour to count as context.

    Returns:
        Dict mapping each center word to the list of its context words.
        Contexts are accumulated across all occurrences of the word and all
        sentences, so duplicates are kept. A word without any neighbour in
        range still gets an (empty) entry.
    """
    contexts_by_center = {}
    for sentence in tokens:
        for pos, center in enumerate(sentence):
            # Clamp the window to the sentence boundaries up front instead of
            # filtering out-of-range indices one by one.
            first = max(0, pos - window)
            stop = min(len(sentence), pos + window + 1)
            neighbours = [sentence[j] for j in range(first, stop) if j != pos]
            contexts_by_center.setdefault(center, []).extend(neighbours)
    return contexts_by_center


In [4]:
def generate_jdd(cc_pair: dict) -> list:
    """Flatten a center -> contexts mapping into ``[context, center]`` rows.

    Args:
        cc_pair: Dict mapping each center word to a list of its context words.

    Returns:
        A list of two-item lists ``[context, center]``, one per context
        occurrence, following the dict's iteration order and each context
        list's order.
    """
    return [
        [context, center]
        for center, contexts in cc_pair.items()
        for context in contexts
    ]

In [52]:
# Toy corpus: one whitespace-delimited sentence per entry.
corpus = [
    "he is karim",
    "I know him",
    "he loves you",
    "he loves me",
    # Alternative sentences, currently disabled; uncomment to experiment:
    #   "he is a king",
    #   "she is a queen",
    #   "he is a man",
    #   "she is a woman",
    #   "warsaw is poland capital",
    #   "berlin is germany capital",
    #   "paris is france capital",
    #   "Sxi este juna kaj bela",
]

In [57]:
def main(center_word: str = "he", context_words=("is", "karim", "loves", "you")):
    """Print the center/context table of ``corpus`` and probabilities for one center word.

    Reads the module-level ``corpus``, tokenizes it, builds the
    (center, context) table with a window of 2, and prints:

    * the corpus and the full table,
    * the number of rows (the sample space),
    * the per-center row counts as returned by ``value_counts()``,
    * ``P(center_word)`` and ``P(context|center_word)`` for every word in
      ``context_words``.

    All probabilities are derived from the table itself, so they stay correct
    when the corpus changes.

    Args:
        center_word: Center word whose probabilities are reported.
        context_words: Context words for which the conditional probability
            given ``center_word`` is reported, in print order.

    Returns:
        None. Everything is written to stdout. A probability whose
        denominator is zero (empty table, or ``center_word`` never occurs as
        a center) is printed as ``nan``.
    """
    pprint(corpus)

    tokens = tokenize(corpus)
    cc_pair = generate_center_context_pair(tokens, 2)

    # generate_jdd yields [context, center] rows. Naming the columns up front
    # (instead of slicing a NumPy array by position) keeps this working when
    # there are no rows at all.
    jdd = pd.DataFrame(generate_jdd(cc_pair), columns=['context', 'center'])
    jdd = jdd[['center', 'context']]
    print("Joint Distribution Table")
    print(jdd)

    print("Total number of row ")
    sample_space = jdd.shape[0]
    print(sample_space)
    print("\n")

    print("Number of occurance word in center")
    center_counts = jdd['center'].value_counts()
    print(np.asarray(center_counts))

    # Look the center word up by label: the first entry of value_counts() is
    # merely the most frequent center, which need not be `center_word`.
    center_total = int(center_counts.get(center_word, 0))
    undefined = float("nan")

    print("\n")
    print(f"P({center_word})")
    prob = center_total / sample_space if sample_space else undefined
    print(round(prob, 2))

    # Rows whose center is `center_word`; the joint counts come from here
    # rather than from hand-counted constants.
    contexts_of_center = jdd.loc[jdd['center'] == center_word, 'context']
    for context_word in context_words:
        print(f"P({context_word}|{center_word})")
        joint_count = int((contexts_of_center == context_word).sum())
        cond_prob = joint_count / center_total if center_total else undefined
        print(round(cond_prob, 2))
    
        

# Run the demo only when this code executes as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


['he is karim', 'I know him', 'he loves you', 'he loves me']
Joint Distribution Table
   center context
0      he      is
1      he   karim
2      he   loves
3      he     you
4      he   loves
5      he      me
6      is      he
7      is   karim
8   karim      he
9   karim      is
10      I    know
11      I     him
12   know       I
13   know     him
14    him       I
15    him    know
16  loves      he
17  loves     you
18  loves      he
19  loves      me
20    you      he
21    you   loves
22     me      he
23     me   loves
Total number of row 
24


Number of occurance word in center
[6 4 2 2 2 2 2 2 2]


P(he)
0.25
P(is|he)
0.17
P(karim|he)
0.17
P(loves|he)
0.33
P(you|he)
0.17
