# Conversion from Simplified to Traditional Chinese

Imports and file loading. For the LSTM, we use an embedding size of 1500, a single layer, and a dropout of 0.65.

In [1]:
import argparse
import time
import math
import os
import torch
import torch.nn as nn

import data
import model

In [2]:
import dill
from chunk_tokenizer import *

In [3]:
# Load the pre-built corpus object (its `dictionary` provides the
# word2idx / idx2word mappings used throughout this notebook).
# NOTE: dill, like pickle, can execute arbitrary code while loading; only
# load a corpus file that comes from a trusted source.
with open('corpusfile', 'rb') as corpus_file:
    corpus = dill.load(corpus_file)

In [4]:
# The model (here) and the decoder input tensor (later) are placed on this
# CUDA device.
device = torch.device("cuda")
# Vocabulary size, taken from the loaded corpus dictionary.
ntokens = len(corpus.dictionary)
# NOTE: this rebinds the name `model` from the imported module to an RNNModel
# instance, so the `model` module is no longer reachable under that name.
# Presumably the positional arguments are (rnn_type, ntoken, embedding size,
# hidden size, number of layers, dropout, tie_weights) - TODO confirm against
# model.py.
model = model.RNNModel('LSTM', ntokens, 1500, 1500, 1, 0.65, True).to(device)
# Request ERROR-level logging on the root logger (basicConfig does nothing
# if the root logger already has handlers configured).
logging.basicConfig(level=logging.ERROR)

  "num_layers={}".format(dropout, num_layers))


In [5]:
# Replace the freshly constructed network with the trained checkpoint.
# The file only needs to stay open while it is being read.
with open('lstm_1.pt', 'rb') as f:
    model = torch.load(f)

# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
model.rnn.flatten_parameters()

# This notebook only runs inference: switch to evaluation mode so that
# dropout is disabled regardless of the mode the checkpoint was saved in.
model.eval()



The function below applies dictionary-based BPE on the fly when the LSTM encounters an out-of-vocabulary (OOV) word.

In [7]:
def divide_further(word):
    """Split an out-of-vocabulary word into tokens known to the dictionary.

    Substring lengths are tried from ``len(word)`` down to 1. For each
    length, every substring of the text that is still unmatched at the start
    of that pass is looked up in ``corpus.dictionary.word2idx``; a hit is
    recorded as a token and all of its occurrences are removed from the
    remaining text. Characters left over at the end are emitted as
    single-character tokens.

    Args:
        word: The word to split.

    Returns:
        Whatever ``sequencer(token_list, text)`` returns, where ``text`` is
        the first value produced by ``prepare(word)``. Presumably this is the
        token list re-ordered to follow ``text`` - TODO confirm against
        ``sequencer``.
    """
    # `prepare` returns two values; only the text is needed here.
    text, _ = prepare(word)
    token_list = []
    remaining_text = text
    for k in range(len(word), 0, -1):
        candidates = [remaining_text[i:i + k]
                      for i in range(len(remaining_text) - k + 1)]
        for candidate in candidates:
            if candidate in corpus.dictionary.word2idx:
                token_list.append(candidate)
                # Literal removal: the token must not be interpreted as a
                # regular expression (tokens such as ".", "?" or "(" would
                # otherwise match the wrong text or raise re.error).
                remaining_text = remaining_text.replace(candidate, '')
    # Whatever could not be matched is kept character by character.
    token_list.extend(remaining_text)
    return sequencer(token_list, text)

The function below prepares the inference sentence for the LSTM. Its general purpose is to convert the words into their encoded forms. If a word is not in the encodings, it calls `divide_further` for dictionary-based BPE.

In [8]:
def words_to_ids(sent):
    """Encode a sentence as lists of candidate dictionary ids for the LSTM.

    ``create_mappings(sent)`` yields one entry per word, each entry being a
    sequence of candidate strings (presumably the possible conversions of
    that word - TODO confirm against ``create_mappings``). For every word:

    * candidates found in ``corpus.dictionary.word2idx`` contribute their id;
    * an unknown candidate longer than one character is split with
      ``divide_further``; the word is then represented by the transposed
      sub-token id lists of its split candidates (one output position per
      sub-token, truncated to the shortest split), and ids of candidates
      that were found directly are not used for that word;
    * if nothing could be encoded, the word becomes a single ``'F'``
      placeholder id and its first candidate is recorded as unknown.

    Args:
        sent: The sentence to encode.

    Returns:
        A ``(sentence_idx, unknown)`` tuple. ``sentence_idx`` is a list of
        non-empty id lists, one per output position. ``unknown`` lists the
        strings replaced by the ``'F'`` placeholder, in order of appearance.
    """
    word2idx = corpus.dictionary.word2idx
    unknown = []
    sentence_idx = []
    for word in create_mappings(sent):
        index_list = []
        subdivided_index_list = []
        needs_split = False
        for candidate in word:
            if candidate in word2idx:
                index_list.append(word2idx[candidate])
                continue
            if len(candidate) <= 1:
                # An unknown single character cannot be split further. It is
                # handled once by the fallback below; registering it here as
                # well would put it into `unknown` twice for a single 'F'
                # placeholder and misalign the later UNK restoration.
                continue
            needs_split = True
            subdivision = []
            for piece in divide_further(candidate):
                if piece in word2idx:
                    subdivision.append(word2idx[piece])
                else:
                    subdivision.append(word2idx['F'])
                    unknown.append(piece)
            if subdivision:
                subdivided_index_list.append(subdivision)
        if not needs_split:
            if not index_list:
                # No candidate is known: emit the placeholder and remember
                # the original text so it can be restored after decoding.
                index_list.append(word2idx['F'])
                unknown.append(word[0])
            sentence_idx.append(index_list)
        else:
            # Turn per-candidate sub-token lists into per-position
            # alternatives.
            sentence_idx.extend(map(list, zip(*subdivided_index_list)))

    sanity_check_sent = "".join([corpus.dictionary.idx2word[i[0]] for i in sentence_idx])
    logging.debug("Number of chars in LSTM are: " + str(len(sanity_check_sent)))
    logging.debug("Number of unknowns are: " + str(len(unknown)))
    return sentence_idx, unknown

# Greedy Decoding

The function below performs greedy decoding for disambiguation.

![caption](lstm.png)

In [9]:
def greedy_decoding(sentence_idx):
    """Pick one dictionary id per position using the language model.

    The model is run one token at a time. At a position with several
    candidate ids, the candidate with the highest ``exp(output)`` weight
    under the current model state is chosen (the first one wins on ties);
    a position with a single id is taken as-is. The chosen id is then fed
    to the model to obtain the weights for the next position.

    Args:
        sentence_idx: List of non-empty lists of candidate ids, one list per
            output position.

    Returns:
        List with the chosen id for every position, in order.

    Raises:
        IndexError: If a position has no candidate ids.
    """
    final_sent = []
    # Inference only: without no_grad the autograd graph would keep growing
    # through `hidden` for the whole sentence.
    with torch.no_grad():
        # Prime the model with token id 0.
        token = torch.zeros((1, 1), dtype=torch.long, device=device)
        hidden = model.init_hidden(1)
        _, hidden = model(token, hidden)
        word_weights = None
        for position, word_idx in enumerate(sentence_idx):
            if not word_idx:
                raise IndexError(
                    "no candidate ids at position {} of {!r}".format(
                        position, sentence_idx))
            if len(word_idx) > 1:
                if word_weights is None:
                    # Ambiguous first position: restart the model from the
                    # 'F' token to get weights to compare against.
                    token.fill_(corpus.dictionary.word2idx['F'])
                    hidden = model.init_hidden(1)
                    output, hidden = model(token, hidden)
                    word_weights = output.squeeze().exp().cpu()
                chosen = max(word_idx,
                             key=lambda idx: word_weights[idx].item())
            else:
                chosen = word_idx[0]
            final_sent.append(chosen)
            # Advance the model with the chosen token.
            token.fill_(chosen)
            output, hidden = model(token, hidden)
            word_weights = output.squeeze().exp().cpu()
    return final_sent

Below is the UNK alignment function.

In [10]:
def decode_unks(trad_sent, unks, sent):
    """Put the original unknown strings back in place of ``"F"`` placeholders.

    Every ``"F"`` character in ``trad_sent`` is replaced, left to right, by
    the next entry of ``unks``. Once ``unks`` is exhausted, any further
    ``"F"`` is kept unchanged (for example a literal "F" from the input)
    instead of raising ``IndexError``.

    Args:
        trad_sent: Decoded sentence that may contain ``"F"`` placeholders.
        unks: Replacement strings, in order of appearance.
        sent: The original input sentence; accepted for interface
            compatibility and not used.

    Returns:
        The sentence with placeholders replaced.
    """
    pending = iter(unks)
    return "".join(
        next(pending, ch) if ch == "F" else ch
        for ch in trad_sent
    )

The function below converts Simplified Chinese to Traditional Chinese.

In [11]:
def convert_trad(sent):
    """Convert a sentence end to end.

    The sentence is encoded with ``words_to_ids``, disambiguated with
    ``greedy_decoding``, mapped back to text through
    ``corpus.dictionary.idx2word``, and finally passed through
    ``decode_unks`` to restore the strings that had been replaced by the
    unknown placeholder.

    Args:
        sent: The sentence to convert.

    Returns:
        The converted sentence as a single string.
    """
    candidate_ids, unks = words_to_ids(sent)
    if unks:
        logging.debug("UNKs are " + " ".join(unks))
    chosen_ids = greedy_decoding(candidate_ids)
    decoded = "".join(corpus.dictionary.idx2word[i] for i in chosen_ids)
    return decode_unks(decoded, unks, sent)

The examples for output are:

In [29]:
sent = "香港大学为亚洲之国际大学，借国际化、创新性及跨范畴发挥其影响力；凭卓越研究、优秀教学、知识与技术之交流转移，吸引及培育全球英才；并透过参与环球事务、其地区影响力及与中国内地之紧密连系，为促进社会进步作出贡献。"
convert_trad(sent)

DEBUG:root:Number of chars before and after tokenization: 104 104
DEBUG:root:Number of tokens before and after mappings: 64 64
DEBUG:root:Number of chars in LSTM are: 104
DEBUG:root:Number of unknowns are: 0


'香港大學為亞洲之國際大學，藉國際化、創新性及跨範疇發揮其影響力；憑卓越研究、優秀教學、知識與技術之交流轉移，吸引及培育全球英才；並透過參與環球事務、其地區影響力及與中國內地之緊密連繫，為促進社會進步作出貢獻。'

In [30]:
sent = "自然语言处理是人工智能和语言学领域的分支学科。此领域探讨如何处理及运用自然语言；自然语言处理包括多方面和步骤，基本有认知、理解、生成等部分"
convert_trad(sent)

DEBUG:root:Number of chars before and after tokenization: 69 69
DEBUG:root:Number of tokens before and after mappings: 34 34
DEBUG:root:Number of chars in LSTM are: 69
DEBUG:root:Number of unknowns are: 0


'自然語言處理是人工智能和語言學領域的分支學科。此領域探討如何處理及運用自然語言；自然語言處理包括多方面和步驟，基本有認知、理解、生成等部分'