# N-gram Language Modeling

## Data

In [35]:
# Toy corpus of whitespace-separated Korean sentences. Several sentences are
# repeated on purpose so that the bigram counts are not all equal.
corpus = (
    ["오늘 날씨 어때"] * 3
    + ["오늘 축구 경기"]
    + ["오늘 경기 결과"] * 2
    + ["내일 날씨"]
    + ["내일 축구 경기"] * 2
    + ["축구 일정"]
)

## Tokenization

In [36]:
# Gather every distinct whitespace-separated token that occurs in the corpus.
vocabs = set()
for sample in corpus:
    vocabs.update(sample.split())

In [37]:
# Freeze the vocabulary set into a list so tokens can be addressed by position.
# NOTE: set iteration order is arbitrary, so the resulting token order (and any
# indices derived from it) is not guaranteed to be the same on every run.
vocabs = [*vocabs]

In [38]:
vocabs

['내일', '오늘', '결과', '날씨', '어때', '경기', '축구', '일정']

In [39]:
# index -> token is the vocabulary list itself; token -> index is its inverse.
idx_to_token = vocabs
token_to_idx = dict(zip(idx_to_token, range(len(idx_to_token))))

In [40]:
token_to_idx

{'내일': 0, '오늘': 1, '결과': 2, '날씨': 3, '어때': 4, '경기': 5, '축구': 6, '일정': 7}

---
## Bi-gram Language Modeling

In [41]:
# Nested mapping filled in by the counting loop: prev_token -> {next_token: count}.
prev_token_to_next_token_cnt_table = dict()

In [42]:
# Count how often each (prev_token, next_token) bigram occurs in the corpus.
#
# Both accumulators are reset in this cell. Previously only `total_cnt` was
# reset here, so re-running the cell kept adding to the existing table while
# the total started over, which made the joint probabilities sum to more than 1.
prev_token_to_next_token_cnt_table = {}
total_cnt = 0

for sample in corpus:
    tokens = sample.split()

    # Pair every token with its successor; a one-token sample yields no pairs.
    for prev_token, next_token in zip(tokens, tokens[1:]):
        # Fetch (or create on first sight) the per-prev_token counter dict.
        next_token_cnts = prev_token_to_next_token_cnt_table.setdefault(prev_token, {})
        next_token_cnts[next_token] = next_token_cnts.get(next_token, 0) + 1
        total_cnt += 1

In [43]:
prev_token_to_next_token_cnt_table

{'오늘': {'날씨': 3, '축구': 1, '경기': 2},
 '날씨': {'어때': 3},
 '축구': {'경기': 3, '일정': 1},
 '경기': {'결과': 2},
 '내일': {'날씨': 1, '축구': 2}}

In [44]:
import pandas as pd

In [45]:
# Turn the nested count dict into a table: after the transpose, rows are the
# previous token and columns are the next token. Bigrams that were never
# observed come out as NaN and are replaced by a count of 0.
prev_token_to_next_token_table_df = (
    pd.DataFrame(prev_token_to_next_token_cnt_table)
    .transpose()
    .fillna(0)
)

In [46]:
prev_token_to_next_token_table_df

Unnamed: 0,날씨,축구,경기,어때,일정,결과
오늘,3.0,1.0,2.0,0.0,0.0,0.0
날씨,0.0,0.0,0.0,3.0,0.0,0.0
축구,0.0,0.0,3.0,0.0,1.0,0.0
경기,0.0,0.0,0.0,0.0,0.0,2.0
내일,1.0,2.0,0.0,0.0,0.0,0.0


In [47]:
prev_token_to_next_token_table_df.loc['오늘', '경기']

2.0

---
## P(prev_token, next_token) Joint Probability Distribution

In [48]:
# Joint probability of each bigram: its count divided by the total number of
# bigrams counted in the corpus.
prev_token_to_next_token_join_prob_table_df = prev_token_to_next_token_table_df.div(total_cnt)

In [49]:
# Sanity check: add up the per-column sums of the joint table; a valid joint
# distribution should total 1.
sum(prev_token_to_next_token_join_prob_table_df.sum())

1.0

In [50]:
prev_token_to_next_token_join_prob_table_df

Unnamed: 0,날씨,축구,경기,어때,일정,결과
오늘,0.166667,0.055556,0.111111,0.0,0.0,0.0
날씨,0.0,0.0,0.0,0.166667,0.0,0.0
축구,0.0,0.0,0.166667,0.0,0.055556,0.0
경기,0.0,0.0,0.0,0.0,0.0,0.111111
내일,0.055556,0.111111,0.0,0.0,0.0,0.0


---
## P(next_token|prev_token) Conditional Probability Distribution

In [51]:
# Marginal P(prev_token): sum the joint probabilities across each row, i.e.
# over all next tokens.
marginal_prob = prev_token_to_next_token_join_prob_table_df.sum(axis="columns")

In [52]:
marginal_prob 

오늘    0.333333
날씨    0.166667
축구    0.222222
경기    0.111111
내일    0.166667
dtype: float64

In [53]:
# Work on an independent (deep) copy so the joint table itself is left
# untouched when the conditional table is derived from it.
prev_token_to_next_token_cond_prob_table_df = (
    prev_token_to_next_token_join_prob_table_df.copy(deep=True)
)

In [54]:
# Conditional probability P(next_token | prev_token) = P(prev_token, next_token) / P(prev_token).
# Each row of the joint table is divided by that row's marginal; `axis=0`
# aligns `marginal_prob` with the row labels.
#
# The result is computed from the joint table in one vectorized step rather
# than by dividing the conditional table in place cell by cell, so re-running
# this cell cannot divide already-normalized values a second time.
prev_token_to_next_token_cond_prob_table_df = prev_token_to_next_token_join_prob_table_df.div(
    marginal_prob, axis=0
)

In [55]:
prev_token_to_next_token_cond_prob_table_df

Unnamed: 0,날씨,축구,경기,어때,일정,결과
오늘,0.5,0.166667,0.333333,0.0,0.0,0.0
날씨,0.0,0.0,0.0,1.0,0.0,0.0
축구,0.0,0.0,0.75,0.0,0.25,0.0
경기,0.0,0.0,0.0,0.0,0.0,1.0
내일,0.333333,0.666667,0.0,0.0,0.0,0.0


In [56]:
prev_token_to_next_token_cond_prob_table_df.loc["오늘", "날씨"]

0.5

In [57]:
prev_token_to_next_token_cond_prob_table_df.loc["경기", "결과"]

1.0