# Week 12

In [99]:
from alg_to_t import estimate_tef_and_tfe
import kenlm
import csv
from math import isclose, exp, log10
from collections import defaultdict
from typing import List
import copy

In [100]:
from collections import namedtuple

In [101]:
# Load the KenLM language model that later cells use to score sentences.
print('loading the language model ...')
lm = kenlm.Model('./bnc.bin')
print('You can use variable "lm" to score a sentence.')
print('\n' + '=' * 100 + '\n')


# Read the sentence-aligned corpus: one "english ||| chinese" pair per line,
# tokens separated by single spaces.
parallel_tokens_path = './cambridge_fast_align_corpus.txt'
parallel_tokens = []
print('preparing the data ...')
with open(parallel_tokens_path, 'r', encoding='utf8') as f:
    parallel_corpus = f.read().split('\n')
# A file that ends with a newline yields one empty trailing record; drop it so
# the tuple unpacking below does not fail on it.
if parallel_corpus and parallel_corpus[-1] == '':
    parallel_corpus.pop()
for sent_pair in parallel_corpus:
    en_sent, ch_sent = sent_pair.split(' ||| ')
    en_tokens = en_sent.split(' ')
    ch_tokens = ch_sent.split(' ')
    parallel_tokens.append((en_tokens, ch_tokens))

print('Here are examples of the data.')
for x in parallel_tokens[:3]:
    print(x)
print('\n' + '=' * 100 + '\n')


# Word alignments, one line per sentence pair, in the same order as the corpus.
with open('./cambridge_sym_en-ch.align', 'r', encoding='utf8') as f:
    en_ch_alignments = f.read().split('\n')
# Only drop the last record when it really is the empty string left by a
# trailing newline; an unconditional [:-1] loses a real alignment otherwise.
if en_ch_alignments and en_ch_alignments[-1] == '':
    en_ch_alignments.pop()


# The tables could also be estimated from the corpus and alignments:
# print('estimating t(e|f) and t(f|e) ...')
# t_ef, t_fe = estimate_tef_and_tfe(parallel_tokens, en_ch_alignments)
# Here they are read from a precomputed word table instead.
#   t_ef[e][f]   = t(e|f)      t_fe[f][e]   = t(f|e)
#   p_g_fe[f][e] = t(e|f)      p_g_ef[e][f] = t(f|e)   (same values, indexed by the condition first)
t_ef = defaultdict(lambda: defaultdict(lambda: 0.0))
t_fe = defaultdict(lambda: defaultdict(lambda: 0.0))
p_g_ef = defaultdict(lambda: defaultdict(lambda: 0.0))
p_g_fe = defaultdict(lambda: defaultdict(lambda: 0.0))
# The table contains Chinese text, so the encoding is stated explicitly instead
# of relying on the platform default.
with open('word_table.csv', newline='', encoding='utf8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        w_e = row['ephrase']
        w_c = row['cphrase']
        p_e_given_c = float(row['pec'])
        p_c_given_e = float(row['pce'])
        t_ef[w_e][w_c] = p_e_given_c
        t_fe[w_c][w_e] = p_c_given_e
        # Stored as floats (not the raw CSV strings) so that they match the
        # 0.0 default of the defaultdict and can be used in arithmetic.
        p_g_fe[w_c][w_e] = p_e_given_c
        p_g_ef[w_e][w_c] = p_c_given_e
print('You can use variables "t_ef" denoting t(e|f) and "t_fe" denoting t(f|e).')

loading the language model ...
You can use variable "lm" to score a sentence.


preparing the data ...
Here are examples of the data.
(['i', "'ve", 'bought', 'a', 'car', '.'], ['我', '買', '了', '輛', '汽車', '。'])
(['she', "'s", 'got', 'a', 'boyfriend', '.'], ['她', '交', '了', '個', '男朋友', '。'])
(['there', 'was', 'a', 'sudden', 'loud', 'noise', '.'], ['突然', '發出', '一', '聲', '巨響', '。'])


You can use variables "t_ef" denoting t(e|f) and "t_fe" denoting t(f|e).


In [102]:
def get_conditional_prob(token_ch, translation_dict):
    """
    Gather the entries of a nested table that mention one inner key.

    Args:
        token_ch: inner key to look for (a Chinese token in this notebook).
        translation_dict: mapping {token_en: {token_ch: prob}}.

    Returns:
        dict {token_en: prob} for every outer key whose inner mapping contains
        token_ch, in the iteration order of translation_dict.
    """
    # Membership is tested with `in`, so nested defaultdicts are not grown.
    return {
        en_word: prob_by_ch[token_ch]
        for en_word, prob_by_ch in translation_dict.items()
        if token_ch in prob_by_ch
    }

def check_top_k_prob(token_ch, k, translation_dict):
    """
    Print the k most probable outer keys for token_ch.

    A warning line is printed first when the collected probabilities do not
    sum to (approximately) 1.

    Args:
        token_ch: inner key to inspect (a Chinese token in this notebook).
        k: number of entries to print.
        translation_dict: mapping {token_en: {token_ch: prob}}.
    """
    prob_dict = get_conditional_prob(token_ch, translation_dict)
    total = sum(prob_dict.values())
    # sorted() is stable, so ties keep the table's original order.
    ranked = sorted(prob_dict.items(), key=lambda pair: pair[1], reverse=True)

    if not isclose(total, 1.):
        print(f"Warning. sum of p( e | {token_ch}) = {total}, not close to 1.")

    for token_en, prob in ranked[:k]:
        print(f"p({token_en} | {token_ch}) = {prob}")

In [103]:
# Print the 10 highest-probability English words for the Chinese token '遺憾'.
check_top_k_prob('遺憾', 10, t_ef)

p(regret | 遺憾) = 0.3256876796844739
p(sorry | 遺憾) = 0.16972485822707165
p(regrettable | 遺憾) = 0.15596337329270898
p(regrets | 遺憾) = 0.10091743355525822
p(regrettably | 遺憾) = 0.05045871677762911
p(regretful | 遺憾) = 0.05045871677762911
p(unfortunately | 遺憾) = 0.041284454803088066
p(disappointed | 遺憾) = 0.013761484934362687
p(pity | 遺憾) = 0.013761484934362687
p(regretted | 遺憾) = 0.013761484934362687


# level 1: paraphrasing with word-based translation

## Paraphrasing by controlling word sense
我們在作業說明中有介紹到，一種 control word sense 的 paraphrasing 方法。而我們的資料是 parallel corpus，所以可以使用這個方法。
接下來的 level 1 就會一步一步地讓你們把這個方法實作出來。  
首先我們需要根據給定的 $e_1$, $e_2$ 以及它們之間的 anchor $f$，使用 translation probability distribution $t(e|f)$ and $t(f|e)$ 算出 $e_1$ 與 $e_2$ 的 paraphrase probability。  
然後我們就能算出 $e_1$ 與其他所有不是 $e_1$ 的 english word 的 paraphrase probability，並取最高的前 5 名。  
接著我們需要用 language model 來衡量句子把 $e_1$ 替換成候選人之後的 score，score 越高代表越合適。但這邊的 score 算出來是負的，需要先取 exponential。然後將 score 乘上 paraphrase probability，並將候選人按照乘積由高到低重新排序。  
最後我們就能用前面實作的各個 function 將 paraphrasing 實作出來。

## Compute paraphrase probability

In [105]:
# example
# Toy sentence pair reused by the worked examples in the following cells.
example_en_tokens = ["I", "suggest", "this", "method"]
example_ch_tokens = ["我", "建議", "這個", "方法"]
# e1: the English word to be replaced, given by its index in example_en_tokens.
example_e1_idx = 1
example_e1 = example_en_tokens[example_e1_idx]
# f: the Chinese anchor word, given by its index in example_ch_tokens.
example_f_idx = 1
example_f = example_ch_tokens[example_f_idx]
# e2: a replacement word used to demonstrate compute_paraphrase_prob.
example_e2 = 'proposed'

print(example_en_tokens)
print(example_ch_tokens)
print(example_e1_idx, example_e1)
print(example_f_idx, example_f)
print(example_e2)

['I', 'suggest', 'this', 'method']
['我', '建議', '這個', '方法']
1 suggest
1 建議
proposed


In [106]:
# NOTE(review): test_paraphrasing_half is defined in a later cell of this
# notebook; on a fresh kernel that cell has to be run before this one.
test_idx = 1  # not used in this cell
e1_idx = 1
en_tokens, ch_tokens = example_en_tokens, example_ch_tokens
print(en_tokens, ch_tokens, sep="\n", end="\n\n")

test_paraphrasing_half(example_en_tokens, example_ch_tokens, e1_idx, example_f_idx, t_ef, t_fe)

['I', 'suggest', 'this', 'method']
['我', '建議', '這個', '方法']

before paraphrasing: I ```suggest``` this method
after paraphrasing: I ```proposed``` this method


In [107]:
def compute_paraphrase_prob(e1: str,
                            e2: str,
                            f: str,
                            t_ef: defaultdict,
                            t_fe: defaultdict):
    """
    Compute the paraphrase probability of e1 => f => e2 through one anchor.

    The probability is t(f | e1) * t(e2 | f): translate e1 into the anchor f,
    then translate f back into e2.

    Args:
        e1: the word to be replaced
        e2: the word used as the replacement
        f: the anchor shared by e1 and e2, e1 => f => e2
        t_ef: t(e|f), indexed as t_ef[e][f]
        t_fe: t(f|e), indexed as t_fe[f][e]

    Returns:
        prob: paraphrase probability of e1 => f => e2 (0.0 when either
        translation is missing from the tables)
    """
    # .get() is used instead of indexing so that looking up an unseen pair
    # does not insert new keys into the nested defaultdicts.
    p_f_given_e1 = t_fe.get(f, {}).get(e1, 0.)
    p_e2_given_f = t_ef.get(e2, {}).get(f, 0.)
    prob = p_f_given_e1 * p_e2_given_f

    return prob

In [108]:
""" result
.22497
"""
# Paraphrase probability for the toy example: example_e1 => example_f => example_e2.
compute_paraphrase_prob(example_e1, example_e2, example_f, t_ef, t_fe)

0.22497118206479705

## Collect candidates for paraphrase

In [109]:
def get_first_5_paraphrases(e1: str,
                            f: str,
                            t_ef: defaultdict,
                            t_fe: defaultdict):
    """
    Rank every other English word as a paraphrase of e1 through the anchor f.

    Args:
        e1: the word to be replaced
        f: anchor
        t_ef: t(e|f), indexed as t_ef[e][f]
        t_fe: t(f|e), indexed as t_fe[f][e]

    Returns:
        candidates: the (up to) 5 English words other than e1 with the highest
        paraphrase probability t(f|e1) * t(e2|f), together with that
        probability, best first. Words with zero probability are left out, so
        fewer than 5 pairs (or an empty list) may be returned.

        example:
        [(c1, paraphrase_prob1),
         ... ,
         (c5, paraphrase_prob5)]
    """
    # t(f | e1) is the same for every candidate, so look it up once.
    p_f_given_e1 = t_fe.get(f, {}).get(e1, 0.)

    scored = []
    for e2, prob_by_f in t_ef.items():
        # `in` keeps nested defaultdicts from growing during the scan.
        if e2 == e1 or f not in prob_by_f:
            continue
        prob = p_f_given_e1 * prob_by_f[f]
        if prob > 0.:
            scored.append((e2, prob))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    candidates = scored[:5]

    return candidates

In [110]:
""" result
[('proposed', 0.22497118206479705),
 ('proposals', 0.1329054475991522),
 ('proposal', 0.11513612557416811),
 ('recommendations', 0.07940394719871861),
 ('propose', 0.045689473608494055)]
"""
# Top paraphrase candidates for the toy example word through its anchor.
get_first_5_paraphrases(example_e1, example_f, t_ef ,t_fe)

[('proposed', 0.22497118206479705),
 ('proposals', 0.1329054475991522),
 ('proposal', 0.11513612557416811),
 ('recommendations', 0.07940394719871861),
 ('propose', 0.045689473608494055)]

## Re-rank candidates for paraphrase

In [111]:
def rerank_paraphrases(en_tokens: List[str],
                       e1_idx: int,
                       candidates: List,
                       lm_model=None):
    """
    Re-rank paraphrase candidates by how well they fit the sentence.

    Each candidate replaces the word at e1_idx, the resulting sentence is
    scored by the language model, and the candidate's final score is
    exp(sentence score) * paraphrase probability.

    Args:
        en_tokens: tokens (words) of the English sentence; not modified
        e1_idx: index in en_tokens of the word to be replaced
        candidates: output of get_first_5_paraphrases, i.e. a list of
            (word, paraphrase_prob) pairs
        lm_model: object with a score(sentence) method returning a log score;
            defaults to the notebook-level language model `lm`

    Returns:
        candidates_rerank: [c1, c2, ..., c5], the candidate words sorted by
        the product of their paraphrase probability and the exponential of
        the sentence score, highest first.
    """
    # `lm` is only looked up when no model is passed in.
    scorer = lm if lm_model is None else lm_model
    prefix = en_tokens[:e1_idx]
    suffix = en_tokens[(e1_idx+1):]

    scored = []
    for cand, para_prob in candidates:
        sentence = ' '.join(prefix + [cand] + suffix)
        # The model score is a negative log value, so exponentiate it before
        # combining it with the paraphrase probability.
        scored.append((exp(scorer.score(sentence)) * para_prob, cand))

    # Stable sort: candidates with equal scores keep their incoming order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    candidates_rerank = [cand for _, cand in scored]

    return candidates_rerank

In [113]:
""" result
['proposed', 'propose', 'proposals', 'proposal', 'recommendations']
"""

# Collect the candidates for the toy example word, then re-rank them in context.
example_candidates = get_first_5_paraphrases(example_e1, example_f, t_ef ,t_fe)
rerank_paraphrases(example_en_tokens, example_e1_idx, example_candidates)

['proposed', 'propose', 'proposals', 'proposal', 'recommendations']

## Paraphrasing given a ch_token

In [114]:
def paraphrasing(en_tokens: List[str],
                 ch_tokens: List[str],
                 e1_idx: int,
                 f_idx: int,
                 t_ef: defaultdict,
                 t_fe: defaultdict):
    """
    Pick the best replacement for one word, anchored on its aligned Chinese word.

    Args:
        en_tokens: tokens (words) of the English sentence
        ch_tokens: tokens (segmented words) of the corresponding Chinese sentence
        e1_idx: index in en_tokens of the word to be replaced
        f_idx: index in ch_tokens of the Chinese word aligned with e1
        t_ef: t(e|f)
        t_fe: t(f|e)

    Returns:
        e2: the top candidate after re-ranking. When no candidate is
        available the original word e1 is returned, i.e. the sentence is
        left unchanged.
    """
    e1 = en_tokens[e1_idx]
    f = ch_tokens[f_idx]

    # Step 1: candidates by paraphrase probability through the anchor f.
    candidates = get_first_5_paraphrases(e1, f, t_ef, t_fe)
    # Step 2: re-rank them by how well they fit the sentence.
    ranked = rerank_paraphrases(en_tokens, e1_idx, candidates)

    e2 = ranked[0] if ranked else e1

    return e2

In [116]:
""" result
'proposed'
"""

# Best replacement for the toy example word, using its Chinese anchor.
paraphrasing(example_en_tokens, example_ch_tokens, example_e1_idx, example_f_idx, t_ef, t_fe)

'proposed'

In [117]:
def test_paraphrasing_half(en_tokens, ch_tokens, e1_idx, f_idx, t_ef, t_fe):
    """
    Paraphrase the word at e1_idx using an explicitly given anchor index and
    print the sentence before and after, with the word highlighted.
    """
    replacement = paraphrasing(en_tokens, ch_tokens, e1_idx, f_idx, t_ef, t_fe)
    head = en_tokens[:e1_idx]
    tail = en_tokens[(e1_idx+1):]
    rewritten = head + [replacement] + tail

    print('before paraphrasing:', highlight_word(en_tokens, e1_idx))
    print('after paraphrasing:', highlight_word(rewritten, e1_idx))

In [118]:
def test_paraphrasing(en_tokens, ch_tokens, e1_idx, alg_of_sent, t_ef, t_fe):
    idx_map = defaultdict(lambda: [])
    for en_ch_alg in alg_of_sent.split(' '):
        en_idx, ch_idx = list(map(lambda x: int(x), en_ch_alg.split('-')))
        idx_map[en_idx].append(ch_idx)
    f_idx = idx_map[e1_idx][0]
    e2 = paraphrasing(en_tokens, ch_tokens, e1_idx, f_idx, t_ef, t_fe)
    print('before paraphrasing:', highlight_word(en_tokens, e1_idx))
    print('after paraphrasing:', highlight_word(en_tokens[:e1_idx] + [e2] + en_tokens[(e1_idx+1):],
                                                e1_idx))
    return


def highlight_word(en_tokens, highlight_idx):
    """
    Join the tokens into a sentence with the token at highlight_idx wrapped
    in triple backticks.
    """
    before = en_tokens[:highlight_idx]
    after = en_tokens[(highlight_idx+1):]
    marked = '```{}```'.format(en_tokens[highlight_idx])
    return ' '.join(before + [marked] + after)

In [119]:
# Try the word-level paraphraser on one sentence pair from the corpus.
test_idx = 2001  # index of the sentence pair in parallel_tokens / en_ch_alignments
e1_idx = 9       # index of the English word to replace
# Work on a deep copy so the corpus entry itself cannot be modified.
en_tokens, ch_tokens = copy.deepcopy(parallel_tokens[test_idx])
print(en_tokens, ch_tokens, sep="\n", end="\n\n")

test_paraphrasing(en_tokens, ch_tokens, e1_idx, en_ch_alignments[test_idx], t_ef, t_fe)

['shall', 'i', 'put', 'on', 'a', 'dvd', 'to', 'amuse', 'the', 'kids', '?']
['要不要', '給', '孩子', '放段', '影片', '，', '逗', '他們', '開心', '？']

before paraphrasing: shall i put on a dvd to amuse the ```kids``` ?
after paraphrasing: shall i put on a dvd to amuse the ```children``` ?


# General paraphrasing
有時候我們想做 paraphrasing 的英文句子不一定會有像 parallel corpus 一樣的對應 foreign sentence，所以更 general 的做法應該是考慮所有的 foreign word 與 English word 的 paraphrase probability。  
關於這個作法的介紹已經寫在作業說明當中，請自行參考它並實作下面給出的 function。

In [120]:
def get_e1_to_e2_prob_dict(e1, t_ef, t_fe):
    """
    Build the paraphrase distribution of e1 by summing over all pivots.

    p[e2] = sum over f of t(f | e1) * t(e2 | f)

    The keys only have to be hashable, so the function works for the
    word-keyed tables as well as for tables keyed by tuples of words.

    Args:
        e1: the word (or phrase tuple) to paraphrase
        t_ef: t(e|f), indexed as t_ef[e][f]
        t_fe: t(f|e), indexed as t_fe[f][e]

    Returns:
        p: defaultdict with p[e2] = t(e2 | e1). e1 itself is kept as a
        candidate, so p[e1] is not zero. Only strictly positive entries are
        stored. When e1 has no pivot at all, p is {e1: 1.0}.
    """
    # update dict p such that p[e2] = t(e2 | e1)
    # NOTE : p[e1] = t(e1 | e1) should NOT be zero
    p = defaultdict(lambda: 0.0)

    # Pivots are the foreign entries listed under e1; the English entries
    # reachable from a pivot f are the keys of t_fe[f].
    # NOTE(review): this assumes both tables were built from the same set of
    # (e, f) pairs, as they are when loaded from one word table - confirm if
    # the tables come from somewhere else.
    # .get() is used throughout so nested defaultdicts are never grown.
    for f in t_ef.get(e1, {}):
        prob_by_e = t_fe.get(f, {})
        p_f_given_e1 = prob_by_e.get(e1, 0.0)
        if p_f_given_e1 <= 0.0:
            continue
        for e2 in prob_by_e:
            p_e2_given_f = t_ef.get(e2, {}).get(f, 0.0)
            if p_e2_given_f > 0.0:
                p[e2] += p_f_given_e1 * p_e2_given_f

    if not p:
        # No pivot found: fall back to the identity paraphrase.
        p[e1] = 1.0

    return p

In [124]:
def general_paraphrasing(en_tokens, e1_idx, t_ef, t_fe, top_k=5):
    """
    Pick the best replacement for one word without a parallel sentence.

    Candidates come from the paraphrase distribution summed over all pivots
    (get_e1_to_e2_prob_dict). The top_k most probable ones are then re-ranked
    by exp(language-model score of the rewritten sentence) * paraphrase
    probability, using the notebook-level language model `lm`.

    Args:
        en_tokens: tokens (words) of the English sentence; not modified
        e1_idx: index in en_tokens of the word to be replaced
        t_ef: t(e|f)
        t_fe: t(f|e)
        top_k: number of candidates passed on to the language model

    Returns:
        e2: the best-scoring replacement; the original word when no other
        candidate has a positive paraphrase probability.
    """
    e1 = en_tokens[e1_idx]
    prob_dict = get_e1_to_e2_prob_dict(e1, t_ef, t_fe)

    # Keep the most probable replacements other than e1 itself.
    shortlist = sorted(
        ((word, prob) for word, prob in prob_dict.items() if word != e1 and prob > 0.0),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_k]

    prefix = en_tokens[:e1_idx]
    suffix = en_tokens[(e1_idx+1):]

    e2 = e1
    best_score = None
    for cand, para_prob in shortlist:
        sentence = ' '.join(prefix + [cand] + suffix)
        # The model score is a negative log value, so exponentiate it first.
        score = exp(lm.score(sentence)) * para_prob
        if best_score is None or score > best_score:
            e2, best_score = cand, score

    return e2

In [125]:
def test_general_paraphrasing(en_tokens, e1_idx, t_ef, t_fe):
    """
    Run general_paraphrasing on the word at e1_idx and print the sentence
    before and after, with the word highlighted.
    """
    replacement = general_paraphrasing(en_tokens, e1_idx, t_ef, t_fe)
    head = en_tokens[:e1_idx]
    tail = en_tokens[(e1_idx+1):]
    rewritten = head + [replacement] + tail

    print('before paraphrasing:', highlight_word(en_tokens, e1_idx))
    print('after paraphrasing:', highlight_word(rewritten, e1_idx))

In [126]:
# Paraphrase the toy example word without using the Chinese sentence.
print(example_en_tokens, example_e1_idx)
general_paraphrasing(example_en_tokens, example_e1_idx, t_ef, t_fe)

['I', 'suggest', 'this', 'method'] 1


'proposed'

In [127]:
"""

"""
# Show the chosen replacement on its own, then the before/after sentences.
print(example_en_tokens)
print(general_paraphrasing(example_en_tokens, example_e1_idx, t_ef, t_fe), end='\n\n')

# A shallow copy of the token list is passed to the test helper.
test_general_paraphrasing(example_en_tokens[:], example_e1_idx, t_ef, t_fe)

['I', 'suggest', 'this', 'method']
proposed

before paraphrasing: I ```suggest``` this method
after paraphrasing: I ```proposed``` this method


## Decoding with noisy channel model
利用以上程式，對整句話進行Paraphrasing

In [128]:
class ParaphrasingModel():
    """
    Dict-like translation model for the decoder.

    Indexing with a phrase (a tuple of words) returns a mapping from
    replacement phrases to paraphrase probabilities, computed by the
    probability function supplied at construction time.
    """

    def __init__(self, t_ef, t_fe, prob_function):
        """
        Args:
            t_ef: table whose keys are the known phrases
            t_fe: second table, handed to prob_function together with t_ef
            prob_function: callable (phrase, t_ef, t_fe) -> {phrase: prob}
        """
        self.t_ef, self.t_fe = t_ef, t_fe
        self.p_func = prob_function
        # Snapshot of the known phrases, taken once at construction time.
        self.phrase_set = set(t_ef.keys())

    def __getitem__(self, phrase_in):
        """Return the paraphrase distribution of phrase_in, a tuple such as (word,)."""
        # phrase_in should be a tuple of words e.g. (word,)
        assert type(phrase_in) is tuple
        is_unknown_word = len(phrase_in) == 1 and phrase_in not in self.phrase_set
        if not is_unknown_word:
            return self.p_func(phrase_in, self.t_ef, self.t_fe)

        # Unknown single word (OOV): it can only be "paraphrased" as itself.
        identity = defaultdict(lambda: 0.0)
        identity[phrase_in] = 1.
        return identity
            

In [129]:
# t_ef and t_fe are keyed by plain words, while the decoder looks phrases up
# as tuples of words. Re-key both tables with 1-tuples: word -> (word,).
tt_ef = defaultdict(lambda:defaultdict(lambda:0.0))
for en_word, prob_by_ch in t_ef.items():
    for ch_word, prob in prob_by_ch.items():
        tt_ef[(en_word,)][(ch_word,)] = prob

tt_fe = defaultdict(lambda:defaultdict(lambda:0.0))
for ch_word, prob_by_en in t_fe.items():
    for en_word, prob in prob_by_en.items():
        tt_fe[(ch_word,)][(en_word,)] = prob

In [130]:
# Translation model for the decoder: tuple-keyed tables plus the paraphrase
# probability function that is evaluated on lookup.
tm = ParaphrasingModel(tt_ef, tt_fe, get_e1_to_e2_prob_dict)

In [139]:

def noisy_channel_decode(sent_in, beam_size, tm, lm, verbose = False, bos=True):
    """
    Paraphrase a whole sentence with a monotone stack-based beam search.

    Every hypothesis in stacks[i] rewrites the first i input words. A
    hypothesis is extended by rewriting the next span sent_in[i:j] with one
    of its paraphrases; its score is the sum of the log10 paraphrase
    probabilities and the language-model scores of the produced words.

    Args:
        sent_in: the input sentence as a tuple of words (spans of it are used
            as lookup keys in tm)
        beam_size: number of hypotheses expanded per stack and number of
            sentences returned; each span contributes at most 2 * beam_size
            of its most probable paraphrases
        tm: translation model with a `phrase_set` attribute and
            tm[span] -> {phrase tuple: probability}
        lm: language model providing BeginSentenceWrite, NullContextWrite,
            BaseScore and score (used like a kenlm.Model)
        verbose: if True, also print the language-model score of every
            result and write its LM / TM / total log scores to stderr
        bos: if True the language model starts from the <s> context,
            otherwise from the null context

    Returns:
        result: up to beam_size output sentences, best first. Each sentence
        is the produced words joined by spaces, followed by a trailing space.
        The list is empty when no hypothesis covers the whole input.
    """
    import sys  # local import: only used by the verbose diagnostics

    # The following code implements a monotone decoding
    # algorithm (one that doesn't permute the target phrases).
    # Hence all hypotheses in stacks[i] represent translations of
    # the first i words of the input sentence. You should generalize
    # this so that they can represent translations of *any* i words.
    # tm_logprob accumulates the translation-model part of logprob so that
    # the verbose report can split the total into its LM and TM shares.
    hypothesis = namedtuple("hypothesis", "logprob, lm_state, predecessor, phrase, tm_logprob")
    state = kenlm.State()
    if bos:
        lm.BeginSentenceWrite(state) #Use <s> as context.  If you don't want <s>, use lm.NullContextWrite(state).
    else:
        lm.NullContextWrite(state)
    initial_hypothesis = hypothesis(0.0, state, None, None, 0.0)
    stacks = [[] for _ in sent_in] + [[]]
    stacks[0].append(initial_hypothesis)

    def extract_english(h):
        # Walk the predecessor chain back to the initial hypothesis.
        return "" if h.predecessor is None else "%s%s " % (extract_english(h.predecessor), " ".join(h.phrase))

    for ii, stack in enumerate(stacks[:-1]):
        print("\nnow", ii)
        for h in sorted(stack,key=lambda h: -h.logprob)[:beam_size]: # prune
            for j in range(ii+1,len(sent_in)+1):
                print(h.phrase, ii, j, end='\t')
                span = sent_in[ii:j]
                # Single unknown words are let through as well: tm maps them
                # to themselves, so an OOV word does not make the whole
                # sentence undecodable.
                if span in tm.phrase_set or len(span) == 1:
                    for phrase_new, trans_prob in sorted(tm[span].items(), key=lambda x:-x[1])[:2*beam_size]:
                        if trans_prob <= 0.0:
                            # log10 is undefined here; such an entry can never win.
                            continue
                        tm_logprob = h.tm_logprob + log10(trans_prob)
                        logprob = h.logprob + log10(trans_prob)
                        lm_state = h.lm_state
                        for word in phrase_new:
                            lm_state_new = kenlm.State()
                            word_logprob = lm.BaseScore(lm_state, word, lm_state_new)
                            lm_state = lm_state_new
                            logprob += word_logprob
                        # Close the sentence once the whole input is covered.
                        logprob += lm.BaseScore(lm_state, "</s>", kenlm.State()) if j == len(sent_in) else 0.0
                        new_hypothesis = hypothesis(logprob, lm_state, h, phrase_new, tm_logprob)
                        # No recombination: every extension is kept and the
                        # stack is pruned when it is expanded.
                        stacks[j].append(new_hypothesis)

    print("\n\nRESULT:")
    result = []
    for winner in sorted(stacks[-1],key=lambda h: -h.logprob)[:beam_size]:
        sent_out = extract_english(winner)
        print(sent_out)
        result.append(sent_out)

        if verbose:
            # Score of the finished sentence as the language model sees it.
            print(lm.score(sent_out, bos = bos, eos = True))
            sys.stderr.write("LM = %f, TM = %f, Total = %f\n" %
              (winner.logprob - winner.tm_logprob, winner.tm_logprob, winner.logprob))
        print()
    return result

In [140]:
""" RESULT:

"""

# Decode the toy sentence with a beam of 5 and without the <s> context.
beam_size = 5
# NOTE(review): sent_in is built here but not passed on; the call below uses a
# hard-coded lower-cased tuple instead - confirm which input is intended.
sent_in = tuple(example_en_tokens)
print(" ".join(example_en_tokens))
sent_list = noisy_channel_decode(("i", "suggest", "this", "method"), beam_size, tm, lm, bos = False)
# sent_list = noisy_channel_decode((), beam_size, tm, lm)

I suggest this method

now 0
None 0 1	None 0 2	None 0 3	None 0 4	
now 1
('i',) 1 2	('i',) 1 3	('i',) 1 4	('my',) 1 2	('my',) 1 3	('my',) 1 4	('in',) 1 2	('in',) 1 3	('in',) 1 4	('me',) 1 2	('me',) 1 3	('me',) 1 4	('we',) 1 2	('we',) 1 3	('we',) 1 4	
now 2
('proposals',) 2 3	('proposals',) 2 4	('proposal',) 2 3	('proposal',) 2 4	('proposed',) 2 3	('proposed',) 2 4	('suggestion',) 2 3	('suggestion',) 2 4	('suggestions',) 2 3	('suggestions',) 2 4	
now 3
('the',) 3 4	('the',) 3 4	('that',) 3 4	('the',) 3 4	('that',) 3 4	

RESULT:
my suggestion that way 

my proposal that way 

my proposed the way 

my proposal the way 

my proposals the way 



# level 2 : paraphrasing with phrase-based translation
TODO:
1. 使用中英句子對```parallel_tokens```和 word alignment ```en_ch_alignments```  
   以consistent block的概念計算phrase table，  
   也就是phrase英翻中和中翻英的機率:  
   t(phrase_ch | phrase_en), t(phrase_en | phrase_ch)  
   
2. 計算以phrase的Paraphrasing的機率 t(phrase_en_2 | phrase_en_1)  
3. 利用函式noisy_channel_decode查看phrase-based的Paraphrasing的效果  

In [34]:
# First sentence pair and its word alignment ("en_idx-ch_idx" links).
print(parallel_tokens[0])
print(en_ch_alignments[0])

(['i', "'ve", 'bought', 'a', 'car', '.'], ['我', '買', '了', '輛', '汽車', '。'])
0-0 1-0 2-1 2-2 3-3 4-4 5-5


In [35]:
import consistent_block

In [36]:
def get_pair(align_str):
    """
    Parse an alignment string such as "0-0 1-0 2-1" into a list of integer
    tuples, one per whitespace-separated link.
    """
    pairs = []
    for link in align_str.split():
        pairs.append(tuple(map(int, link.split("-"))))
    return pairs

In [37]:
# Parsed alignment links of the second sentence pair.
get_pair(en_ch_alignments[1])

[(0, 0), (1, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]

In [38]:
# Compare the number of sentence pairs with the number of alignment lines.
len(parallel_tokens), len(en_ch_alignments)

(65418, 65418)

In [None]:
#### CODE HERE ####
