<style>
.text_cell_render {
font-family: Times New Roman, serif;
}
</style>
**Part 1: Translate English BERT-Large-Whole-Word-Masking Vocabulary into Chinese via ECDICT**

In [None]:
import pandas as pd

# Load the ECDICT table (https://github.com/skywind3000/ECDICT) and build a
# lookup keyed by its `word` column with the `translation` column as value.
# Despite the name, zh_en_dict therefore maps `word` -> `translation`.
ecdict_path = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\ecdict.txt"
df = pd.read_csv(ecdict_path, sep=",")
zh_en_dict = {word: translation for word, translation in zip(df["word"], df["translation"])}

def read_vocabulary_and_translate(vocab_path,new_path,bi_dict):
    '''
    Translate a BERT vocabulary file token by token and write the result.

    Every line of the vocabulary is stripped and handled as follows:
    purely numeric tokens are kept unchanged; otherwise the token itself is
    looked up in bi_dict, then its lower-cased form; tokens without a usable
    entry are kept unchanged. Exactly one output line is written per input
    line, so line numbers in new_path match those in vocab_path.

    vocab_path: BERT-wwm vocabulary path (UTF-8, one token per line)
    new_path: Translated vocabulary path; overwritten on every call
    bi_dict: Bilingual dictionary mapping token -> translation, such as
        zh_en_dict above
    '''
    new_word_list = []
    with open(vocab_path,encoding="UTF-8") as vocab:
        for line in vocab:
            word = line.strip()
            new_word = word
            if not word.isdigit():
                # Look the token up in the dictionary that was passed in
                # (not a module-level global), exact form first.
                for candidate in (word, word.lower()):
                    translation = bi_dict.get(candidate)
                    # Ignore missing entries and non-string values such as
                    # the float NaN pandas produces for empty cells; those
                    # cannot be concatenated with "\n" when writing.
                    if isinstance(translation, str):
                        new_word = translation
                        break
            new_word_list.append(new_word)
    # 'w' rather than 'a': appending on a re-run would duplicate every line
    # and break the line-by-line alignment with the source vocabulary.
    with open(new_path,'w',encoding="UTF-8") as obj:
        for item in new_word_list:
            obj.write(item + "\n")

# Generate trans_dict.txt: the translation of the bert-large-cased-whole-word-masking
# vocabulary, one output line per vocabulary line.
read_vocabulary_and_translate("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\bert-large-cased-whole-word-masking\\vocab.txt",
                            "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\trans_dict.txt",zh_en_dict)

<style>
.text_cell_render {
font-family: Times New Roman, serif;
}
</style>
**Part 2: Parse the Chinese-English dictionary (CC-CEDICT) into a usable Python data structure and clean it**

In [None]:
# Generate Chinese-English dictionary
# Dictionary is from https://www.mdbg.net/chinese/dictionary?page=cedict
import jionlp as jio
import re

# Read the raw CC-CEDICT file into memory, one list element per line
# (each element keeps its trailing newline).
cedict_path = 'E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\cedict_ts.u8'
with open(cedict_path, encoding="UTF-8") as cedict_file:
    cedict = list(cedict_file)

def parse_line(line):
    """
    Parse one line of the form "TRAD SIMP [pinyin] /gloss 1/gloss 2/".

    :param line: {str} one raw dictionary line, with or without trailing newline
    :return: {dict} with the keys 'traditional', 'simplified', 'pinyin' and
             'english' (the first gloss only); the int 0 for blank lines,
             lines starting with "#", and lines that do not have the
             expected shape
    """
    # Strip first so that blank lines ("\n") and the trailing "/\n" of a
    # regular entry are recognised.
    line = line.strip()
    # Skipped lines are only reported through the return value; this function
    # never modifies the list the caller is iterating over.
    if line == '' or line.startswith("#"):
        return 0
    fields = line.rstrip('/').split('/')
    if len(fields) <= 1:
        return 0
    # fields[0] is "TRAD SIMP [pinyin] ", fields[1] is the first gloss.
    english = fields[1]
    char_and_pinyin = fields[0].split('[')
    characters = char_and_pinyin[0].split()
    if len(char_and_pinyin) < 2 or len(characters) < 2:
        # No "[pinyin]" part or fewer than two headwords: not a usable entry.
        return 0
    pinyin = char_and_pinyin[1].rstrip().rstrip("]")
    parsed = {}
    parsed['traditional'] = characters[0]
    parsed['simplified'] = characters[1]
    parsed['pinyin'] = pinyin
    parsed['english'] = english
    return parsed

def is_contain_chinese(check_str):
    """
    Tell whether a string contains Chinese characters.
    :param check_str: {str} the string to inspect
    :return: {bool} True if at least one character lies in U+4E00..U+9FFF, otherwise False
    """
    return any(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)

def clean_parsed_dict(parsed_dict):
    """
    Clean the glosses of a {headword: gloss} dictionary in place.

    Pass 1 runs every gloss through jio.remove_parentheses (presumably strips
    bracketed text - see the jionlp documentation) and then deletes the
    characters ';', '|' and '.'.
    Pass 2 drops entries whose gloss contains Chinese characters or the
    substring "lit", and truncates glosses containing a comma to the stripped
    text before the first comma.

    NOTE(review): "lit" is matched as a plain substring, so glosses such as
    "split" or "little" are dropped as well - confirm this is intended.

    :param parsed_dict: {dict} headword -> gloss; modified in place
    :return: {dict} the same dictionary object
    """
    punctuation_pattern = re.compile('[;|.]')
    for headword in list(parsed_dict):
        without_parentheses = jio.remove_parentheses(parsed_dict[headword])
        parsed_dict[headword] = punctuation_pattern.sub('', without_parentheses)
    for headword in list(parsed_dict):
        gloss = parsed_dict[headword]
        if is_contain_chinese(gloss) or "lit" in gloss:
            del parsed_dict[headword]
            continue
        if "," in gloss:
            parsed_dict[headword] = gloss.split(",")[0].strip()
    return parsed_dict

# Generate cedict_jio.txt: one "<simplified> <english>" pair per line.
parsed_dict = {}
# Iterate over a copy so that no entry is skipped if `cedict` is modified
# while its lines are being parsed.
for line in list(cedict):
    part_parsed = parse_line(line)
    if not part_parsed:
        # parse_line returns 0 for lines it skips (e.g. "#" comment lines);
        # subscripting that value would raise a TypeError.
        continue
    parsed_dict[part_parsed['simplified']] = part_parsed["english"]
parsed_dict = clean_parsed_dict(parsed_dict)
# `with` closes the output file even if a write fails.
with open('E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\cedict_jio.txt', 'w',encoding="UTF-8") as file:
    for k,v in parsed_dict.items():
        # Entries whose gloss was cleaned down to nothing are not written.
        if v != "":
            file.write(str(k)+' '+str(v)+'\n')

<style>
.text_cell_render {
font-family: Times New Roman, serif;
}
</style>
**Part 3: Translate English BERT-base-uncased vocabulary into Chinese via Baidu Translate and clean it**

In [None]:
# -*- coding: utf-8 -*-

# This code shows an example of text translation from English to Simplified-Chinese.
# This code runs on Python 2.7.x and Python 3.x.
# You may install `requests` to run this code: pip install requests
# Please refer to `https://api.fanyi.baidu.com/doc/21` for complete api document

import requests
import random
import json
from hashlib import md5

# Set your own appid/appkey through the environment variables
# BAIDU_TRANSLATE_APPID and BAIDU_TRANSLATE_APPKEY; credentials must not be
# stored in the notebook. NOTE(review): the key pair that used to be
# hard-coded here should be treated as leaked and rotated.
import os

appid = os.environ.get('BAIDU_TRANSLATE_APPID', '')
appkey = os.environ.get('BAIDU_TRANSLATE_APPKEY', '')
if not (appid and appkey):
    # Do not abort here so the cell can still be imported/inspected, but make
    # the misconfiguration visible before any request is sent.
    print('WARNING: BAIDU_TRANSLATE_APPID / BAIDU_TRANSLATE_APPKEY are not set; '
          'translation requests are expected to be rejected.')

# For list of language codes, please refer to `https://api.fanyi.baidu.com/doc/21`
from_lang = 'en'
to_lang =  'zh'

# Full request URL = endpoint + path.
endpoint = 'http://api.fanyi.baidu.com'
path = '/api/trans/vip/translate'
url = endpoint + path

# query = 'Hello World! This is 1st paragraph.\nThis is 2nd paragraph.'

# Generate salt and sign
def make_md5(s, encoding='utf-8'):
    """Return the hexadecimal MD5 digest of the string `s` encoded with `encoding`."""
    digest = md5()
    digest.update(s.encode(encoding))
    return digest.hexdigest()

# Translate the bert-base-uncased vocabulary token by token and write one
# output line per vocabulary line.
new_word_list = []
with open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\bert-base-uncased\\vocab.txt","r",encoding="UTF-8") as file:
    contents = file.readlines()

# The request headers never change, so build them once.
headers = {'Content-Type': 'application/x-www-form-urlencoded'}
for item in contents:
    item = item.strip()
    # Tokens starting with "##" or "[" are copied through untranslated.
    if item.startswith("##") or item.startswith("["):
        new_word_list.append(item)
        continue
    salt = random.randint(32768, 65536)
    sign = make_md5(appid + item + str(salt) + appkey)
    payload = {'appid': appid, 'q': item, 'from': from_lang, 'to': to_lang, 'salt': salt, 'sign': sign}
    try:
        # A timeout keeps one stalled request from blocking the whole run.
        r = requests.post(url, params=payload, headers=headers, timeout=30)
        trans_word = r.json()["trans_result"][0]["dst"]
    except (requests.RequestException, ValueError, KeyError, IndexError):
        # Best effort: on a network error, a non-JSON body, or a response
        # without a translation, keep the original token so that the output
        # stays aligned with the vocabulary.
        trans_word = item
    new_word_list.append(trans_word)

# Write once, after the loop. Writing inside the loop rewrote the whole file
# for every translated token and, because the untranslated branches `continue`
# past it, left out any "##"/"[" tokens that follow the last translated token.
with open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\BERT-base-uncased-trans.txt","w",encoding="UTF-8") as file:
    for new_word in new_word_list:
        file.write(new_word + "\n")

import texthero as hero
import jionlp as jio
import re

def _tidy_translation(entry):
    """
    Clean one stripped line of the Baidu-translated vocabulary.

    Single-character entries and entries starting with "[" or "#" are returned
    unchanged. For everything else the text is passed through
    jio.remove_parentheses (presumably strips bracketed text - see the jionlp
    documentation), then every character that is neither a word character nor
    whitespace is removed, and finally all spaces are removed.
    """
    if len(entry) == 1 or entry.startswith(("[", "#")):
        return entry
    without_parentheses = jio.remove_parentheses(entry)
    without_punctuation = re.sub(r'[^\w\s]', '', without_parentheses)
    return without_punctuation.replace(" ","")


translated_path = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\BERT-base-uncased-trans.txt"
with open(translated_path,"r",encoding="UTF-8") as f:
    contents = f.readlines()

# One cleaned entry per input line, in the original order.
processed = [_tidy_translation(raw.strip()) for raw in contents]

with open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\BERT-base-uncased-trans-processed.txt","w",encoding="UTF-8") as file:
    file.writelines(process + "\n" for process in processed)

<style>
.text_cell_render {
font-family: Times New Roman, serif;
}
</style>
**Part 4: Clean English-Chinese dictionary generated from Part 1**

In [None]:
# Load trans_dict.txt (the file written in Part 1), one list element per line.
trans_dict_path = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\trans_dict.txt"
with open(trans_dict_path,"r",encoding="UTF-8") as f:
    trans_dict_contents = list(f)

def containenglish(test_string):
    """Return True if `test_string` contains at least one ASCII letter (a-z or A-Z)."""
    ascii_letter = re.compile('[a-zA-Z]')
    return ascii_letter.search(test_string) is not None

# Stage 1: strip each line and, if it contains a backslash, keep only the
# stripped text in front of the first backslash.
without_line = [
    entry.split("\\")[0].strip() if "\\" in entry else entry
    for entry in (raw.strip() for raw in trans_dict_contents)
]

# Stage 2: for entries longer than one character that contain a ".", keep the
# text between the first and the second "." (or up to the end of the entry).
no_role = [
    entry.split(".")[1] if len(entry) != 1 and "." in entry else entry
    for entry in without_line
]

# Stage 3: entries containing Chinese characters go through
# jio.remove_parentheses; all other entries are left untouched.
no_brackets = [
    jio.remove_parentheses(entry) if is_contain_chinese(entry) else entry
    for entry in no_role
]

# Stage 4: turn ASCII and full-width semicolons/commas into spaces.
simple_punctuation = '[;,；，]'
no_punctuation = [re.sub(simple_punctuation, ' ', entry) for entry in no_brackets]

# Stage 5: strip and keep only the text in front of the first space.
no_space = [
    trimmed.split(" ")[0] if " " in trimmed else trimmed
    for trimmed in (entry.strip() for entry in no_punctuation)
]

# Stage 6: entries that mix ASCII letters with "的" are cut at the first "的".
no_tense_things = [
    entry.split("的")[0] if containenglish(entry) and "的" in entry else entry
    for entry in no_space
]

# Write the cleaned entries, one per line.
with open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\BERT-large-wwm-dict-processed.txt","w",encoding="UTF-8") as file:
    file.writelines(ntt + "\n" for ntt in no_tense_things)

<style>
.text_cell_render {
font-family: Times New Roman, serif;
}
</style>
**Part 5: Check the difference between CEDICT-jio and WoBERT vocabulary**

In [None]:
with open('E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\cedict_jio.txt', 'r',encoding="UTF-8") as cedictjio:
    contents = cedictjio.readlines()

with open('E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\chinese_wobert_L-12_H-768_A-12\\vocab.txt', 'r',encoding="UTF-8") as wobert:
    vocabs = wobert.readlines()

# The first space-separated field of every cedict_jio line is the Chinese entry.
cedict_entry = [line.strip().split(" ")[0] for line in contents]

wobert_vocab = [token.strip() for token in vocabs]

# WoBERT tokens that are missing from CEDICT-jio, restricted to tokens that
# contain Chinese characters. The result is built by filtering into a new
# list: deleting from a list while enumerating it skips the element that
# follows each deletion, so non-Chinese tokens would be left in the result.
# Sorting only makes the order reproducible between runs.
retD = sorted(
    token
    for token in set(wobert_vocab).difference(cedict_entry)
    if is_contain_chinese(token)
)

print(len(retD))
# print(retD)

<style>
.text_cell_render {
font-family: Times New Roman, serif;
}
</style>
**Part 6: Translate English BERT-base-uncased Vocabulary into Chinese via ECDICT**

In [None]:
import pandas as pd

# Load the ECDICT table (https://github.com/skywind3000/ECDICT) and build a
# lookup keyed by its `word` column with the `translation` column as value.
# Despite the name, zh_en_dict therefore maps `word` -> `translation`.
ecdict_path = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\ecdict.txt"
df = pd.read_csv(ecdict_path, sep=",")
zh_en_dict = {word: translation for word, translation in zip(df["word"], df["translation"])}

def read_vocabulary_and_translate(vocab_path,new_path,bi_dict):
    '''
    Translate a BERT vocabulary file token by token and write the result.

    Every line of the vocabulary is stripped and handled as follows:
    purely numeric tokens are kept unchanged; otherwise the token itself is
    looked up in bi_dict, then its lower-cased form; tokens without a usable
    entry are kept unchanged. Exactly one output line is written per input
    line, so line numbers in new_path match those in vocab_path.

    vocab_path: BERT vocabulary path (UTF-8, one token per line)
    new_path: Translated vocabulary path; overwritten on every call
    bi_dict: Bilingual dictionary mapping token -> translation, such as
        zh_en_dict above
    '''
    new_word_list = []
    with open(vocab_path,encoding="UTF-8") as vocab:
        for line in vocab:
            word = line.strip()
            new_word = word
            if not word.isdigit():
                # Look the token up in the dictionary that was passed in
                # (not a module-level global), exact form first.
                for candidate in (word, word.lower()):
                    translation = bi_dict.get(candidate)
                    # Ignore missing entries and non-string values such as
                    # the float NaN pandas produces for empty cells; those
                    # cannot be concatenated with "\n" when writing.
                    if isinstance(translation, str):
                        new_word = translation
                        break
            new_word_list.append(new_word)
    # 'w' rather than 'a': appending on a re-run would duplicate every line
    # and break the line-by-line alignment with the source vocabulary.
    with open(new_path,'w',encoding="UTF-8") as obj:
        for item in new_word_list:
            obj.write(item + "\n")

# Generate bert_base_uncased_trans_dict.txt: the translation of the
# bert-base-uncased vocabulary, one output line per vocabulary line.
read_vocabulary_and_translate("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\bert-base-uncased\\vocab.txt",
                            "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\bert_base_uncased_trans_dict.txt",zh_en_dict)

<style>
.text_cell_render {
font-family: Times New Roman, serif;
}
</style>
**Part 6 (continued): Clean the English-Chinese dictionary generated by the preceding Part 6 translation step**

In [None]:
import jionlp as jio
import re
# Load bert_base_uncased_trans_dict.txt, one list element per line.
trans_dict_path = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\bert_base_uncased_trans_dict.txt"
with open(trans_dict_path,"r",encoding="UTF-8") as f:
    trans_dict_contents = list(f)

def containenglish(test_string):
    """Return True if `test_string` contains at least one ASCII letter (a-z or A-Z)."""
    ascii_letter = re.compile('[a-zA-Z]')
    return ascii_letter.search(test_string) is not None

def is_contain_chinese(check_str):
    """
    Tell whether a string contains Chinese characters.
    :param check_str: {str} the string to inspect
    :return: {bool} True if at least one character lies in U+4E00..U+9FFF, otherwise False
    """
    return any(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)

# Stage 1: strip each line and replace every literal backslash-n sequence
# (two characters) with a space.
without_line = [raw.strip().replace(r"\n"," ") for raw in trans_dict_contents]

# Stage 2: entries that contain a special-token marker are kept as they are;
# any other entry containing "[", "]" or a character in U+4E00..U+9FA5 is cut
# at its first "[".
special_markers = ("[PAD]", "[MASK]", "[SEP]", "[CLS]", "[UNK]", "unused")
bracket_or_chinese = re.compile("[\\[\u4e00-\u9fa5\\]]")
no_propes = []
for entry in without_line:
    is_special = any(marker in entry for marker in special_markers)
    if not is_special and bracket_or_chinese.search(entry):
        no_propes.append(entry.split("[")[0])
    else:
        no_propes.append(entry)

# Stage 3: entries containing Chinese characters go through
# jio.remove_parentheses; all other entries are left untouched.
no_brackets = [
    jio.remove_parentheses(entry) if is_contain_chinese(entry) else entry
    for entry in no_propes
]

# Stage 4: replace part-of-speech tags with a space, then strip. The tags are
# applied in this exact order because some tags contain others
# (e.g. "n." inside "pron.", "v." inside "adv.").
role_tags = (
    "pron.", "n.", "a.", "adv.", "vbl.", "v.", "vt.", "vi.",
    "abbr.", "interj.", "conj.", "art.", "prep.", "num.", "aux.", "pl.",
)
no_role = []
for entry in no_brackets:
    for tag in role_tags:
        entry = entry.replace(tag," ")
    no_role.append(entry.strip())

# Stage 5: turn ASCII and full-width semicolons/commas into spaces.
simple_punctuation = '[;,；，]'
no_punctuation = [re.sub(simple_punctuation, ' ', entry) for entry in no_role]


# Stage 6: entries that still contain a space become a list of their
# whitespace-separated parts; all other entries stay plain strings.
no_space = []
for entry in no_punctuation:
    trimmed = entry.strip()
    no_space.append(trimmed.split() if " " in trimmed else trimmed)

# Final clean-up of every entry of no_space, which holds either a plain string
# or a list of candidate strings.
no_tense_things = []
for no_s in no_space:
    if isinstance(no_s, str):
        if containenglish(no_s) and "的" in no_s:
            # Mixed ASCII/"的" string: keep only the part before the first "的".
            no_tense_things.append(no_s.split("的")[0])
        elif is_contain_chinese(no_s) and "..." in no_s:
            no_tense_things.append(no_s.replace("...",""))
        else:
            no_tense_things.append(no_s)
    else:
        # Drop candidates that mix ASCII letters with "的" and remove "..."
        # from the remaining ones. A new list is built instead of deleting
        # from no_s while enumerating it, which skipped the candidate that
        # followed every deleted one.
        kept = [
            single.replace("...","")
            for single in no_s
            if not (containenglish(single) and "的" in single)
        ]
        no_tense_things.append(kept)

# print(no_tense_things)
# Map the position of each entry to its cleaned value (string or list of strings).
bert_embedding_help_dict = dict(enumerate(no_tense_things))
# print(bert_embedding_help_dict)

<style>
.text_cell_render {
font-family: Times New Roman, serif;
}
</style>
**Part 7: Find single word in CEDICT-jio**

In [2]:
cedict_jio_path = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\cedict_jio.txt"
with open(cedict_jio_path,"r",encoding="UTF-8") as f:
    contents = f.readlines()

# Each line is "<chinese> <english gloss>"; collect the Chinese entries whose
# gloss is a single word, i.e. contains no space.
single_word_corresponding_chinese_list = []
for raw_line in contents:
    fields = raw_line.strip().split(" ",1)
    chinese = fields[0]
    english = fields[1]
    if " " not in english:
        single_word_corresponding_chinese_list.append(chinese)

print(len(single_word_corresponding_chinese_list))

32757


In [3]:
fb_dict_path = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\facebook_dict\\zh-en.txt"
with open(fb_dict_path,"r",encoding="UTF-8") as fi:
    fb_contents = fi.readlines()

# The first space-separated field of every line is taken as the Chinese word.
fb_word = [entry.strip().split(" ")[0] for entry in fb_contents]

print(len(fb_word))

21597


In [4]:
# Concatenate both word lists (duplicates are still present at this point).
new_list = [*fb_word, *single_word_corresponding_chinese_list]

In [5]:
# Remove duplicate words; the resulting order is whatever the set yields.
unique_words = set(new_list)
new_list = list(unique_words)
print(len(new_list))

43319


In [6]:
big_vocab_path = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\big_vocab.txt"
with open(big_vocab_path,"r",encoding="UTF-8") as jieba:
    jieba_contents = jieba.readlines()

# Merge the stripped big_vocab.txt lines into the word list and drop duplicates.
jieba_list = [entry.strip() for entry in jieba_contents]
merged_words = set(new_list)
merged_words.update(jieba_list)
new_list = list(merged_words)
print(len(new_list))

202591


In [9]:
with open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\bert-base-uncased-embedding-changed\\vocab.txt","r",encoding="UTF-8") as bert_embedding:
    bert_contents = bert_embedding.readlines()

bert_vocab = [i.strip() for i in bert_contents]
# Words of new_list that the BERT vocabulary does not contain yet. Sorted so
# that the order of the appended tokens (and therefore their ids) is the same
# on every run; plain set iteration order for strings can differ between
# interpreter sessions.
not_in_bert = sorted(set(new_list).difference(bert_vocab))
print(len(not_in_bert))

# Existing vocabulary first, so the ids of the original tokens do not change.
full_vocab = bert_vocab + not_in_bert
print(len(full_vocab))
# Writing full_vocab to disk is disabled in this cell. It was previously kept
# as a bare string literal, which the notebook echoed as cell output.
# with open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\full_vocab.txt","w",encoding="UTF-8") as full:
#     for v in full_vocab:
#         full.write(v + "\n")

186445
234169


'\nwith open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\full_vocab.txt","w",encoding="UTF-8") as full:\n    for v in full_vocab:\n        full.write(v + "\n")\n'

In [10]:
from transformers import BertModel
from tokenization_wobert import WoBertTokenizer

# Load the checkpoint, resize its token embedding matrix to one row per entry
# of full_vocab, and save the result to a separate directory.
source_model_dir = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\bert-base-uncased-embedding-changed"
resized_model_dir = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\bert-base-uncased-embedding-changed-full-vocab"

model = BertModel.from_pretrained(source_model_dir)
target_vocab_size = len(full_vocab)
model.resize_token_embeddings(target_vocab_size)
print(model)
model.save_pretrained(resized_model_dir)
# tokenizer.save_pretrained("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT")

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(234169, 768)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
  

In [2]:
with open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\bert-base-uncased-embedding-changed\\vocab.txt","r",encoding="UTF-8") as bert_47000:
    bert_contents = bert_47000.readlines()

bert_list = [i.strip() for i in bert_contents]

with open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\chinese_wobert_L-12_H-768_A-12\\vocab.txt","r",encoding="UTF-8") as wobert:
    wobert_contents = wobert.readlines()

wobert_list = [i.strip() for i in wobert_contents]
# WoBERT tokens that the BERT vocabulary does not contain yet. Sorted so that
# the order of the appended tokens - and with it the line number (token id) of
# each one in full_vocab.txt - is the same on every run; plain set iteration
# order for strings can differ between interpreter sessions.
not_in_bert_47000 = sorted(set(wobert_list).difference(bert_list))
print(len(not_in_bert_47000))

# Existing vocabulary first, so the ids of the original tokens do not change.
vocab = bert_list + not_in_bert_47000
print(len(vocab))

with open("E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\data\\full_vocab.txt","w",encoding="UTF-8") as full:
    for v in vocab:
        full.write(v + "\n")


22577
70302


In [3]:
from transformers import BertModel
from tokenization_wobert import WoBertTokenizer

# Load the checkpoint, resize its token embedding matrix to one row per entry
# of vocab, and save the result to a separate directory.
source_model_dir = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\bert-base-uncased-embedding-changed"
resized_model_dir = "E:\\Steve_Zeng_Related\\YLab\\Translation_BERT_project\\code_project\\BERT\\bert-base-uncased-embedding-changed-full-vocab"

model = BertModel.from_pretrained(source_model_dir)
target_vocab_size = len(vocab)
model.resize_token_embeddings(target_vocab_size)
print(model)
model.save_pretrained(resized_model_dir)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\QINGCH~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.103 seconds.
Prefix dict has been built successfully.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(70302, 768)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
   