# Build synthetic data

In [59]:
import json
import pandas as pd

with open("./webtext2019zh/web_text_zh_train_sample.json", "r") as f:
    content = f.readlines()

content = map(json.loads, content)
content = pd.DataFrame(content)

In [60]:
%%time
import re

def cut_sentences(text, min_len=3):
    """
    Cut sentences by their length and punctuation, remove all spaces.
    """
    text = text.replace(" ", "")
    corpus = re.split("[\,\.\?，。？\n]", text)
    corpus = list(filter(lambda x: len(x) >= min_len, corpus))
    return corpus

text = "\n".join(content.content.values)
sentences = cut_sentences(text)
df = pd.DataFrame({
    "seq": sentences
})

CPU times: user 2.04 ms, sys: 1e+03 ns, total: 2.04 ms
Wall time: 2.01 ms


# Mask filling
Inserting masks at random positions in the Wikipedia sentences, and fill them with the language model.

In [76]:
%%time
import random
import math

s = "看到这一幕我简直兴奋得要上天了 更新于上一个答案几个小时后 剧透醒目 论这个世界上还有什么比妖怪夫妇联手更恐怖的事"

def mask_replacing(s):
    """
    The first strategy samples random words in the sentence and it replaces them with masks(one for each token).
    """
    seq = list(s)
    seq_len = len(s)
    # Sample from 1 to 50% chars of the sequence
    k = random.randint(1, math.floor(seq_len * 0.5))
    token_idx = random.choices(range(seq_len), k = k)
    for i in token_idx:
        seq[i] = "[MASK]"
    return "".join(seq)

def mask_replacing2(s):
    """
    The second strategy cre-ates contiguous sequences: 
    it samples a start po-sition s, a length l (uniformly distributed), 
    and it masks all the tokens spanned by words betweenpositions s and s + l.
    """
    seq_len = len(s)
    start = random.randint(1, seq_len-1)
    # At least 10% of words
    min_length = min(math.floor(seq_len * 0.1), seq_len - start)
    min_length = max(min_length, 1)
    # At most 50% of words
    max_length = min(math.floor(seq_len * 0.5), seq_len - start)
    max_length = max(min_length, max_length)
    length = random.choice(range(min_length, max_length+1))
    
    s = s[:start] + "[MASK]" * length + s[(start+length):]
    return pd.Series([s, length / seq_len], index=["masked", "masked_rate"])

masked_seqs = pd.Series(sentences).apply(mask_replacing2)
df["masked"] = masked_seqs["masked"]
df["masked_rate"] = masked_seqs["masked_rate"]
df.head()

CPU times: user 429 ms, sys: 5.01 ms, total: 434 ms
Wall time: 432 ms


Unnamed: 0,seq,masked,masked_rate,filled
0,我对你仍有爱意,我对你仍有爱[MASK],0.142857,我对你仍有感心
1,我对自己无能为力,我[MASK][MASK][MASK][MASK]能为力,0.5,我对自己无力力力
2,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,0.198953,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...
3,我看了上帝之眼感觉还不错,我看了[MASK][MASK]之眼感觉还不错,0.166667,我看了上帝之眼感觉的感。
4,不喜欢那些讲技术的摄影书,不喜欢那些讲技术的摄[MASK][MASK],0.166667,不、、、、、、、、、、、


In [2]:
# Fill in the masked fields
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('/home/admin/workspace/model/transformers/bert-base-multilingual-cased')
model = AutoModelForMaskedLM.from_pretrained('/home/admin/workspace/model/transformers/bert-base-multilingual-cased')

In [None]:
%%time
import torch

def mask_replacing(text):
#     text = "我对自[MASK][MASK]能为力"
    encoded_input = tokenizer(text, return_tensors='pt')
    [predictions] = model(**encoded_input)

    predicted_index = torch.argmax(predictions[0], dim=1).tolist()
    predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
    return "".join(predicted_token[1:-1])

filled_seqs = df.masked.apply(mask_replacing).rename("filled")
df["filled"] = filled_seqs
df.head()

In [98]:
# tensorflow model
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("/home/admin/workspace/model/transformers/bert-base-multilingual-cased")
model = TFAutoModelWithLMHead.from_pretrained("/home/admin/workspace/model/transformers/bert-base-multilingual-cased")

All model checkpoint weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the model checkpoint at /home/admin/workspace/model/transformers/bert-base-multilingual-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


.我对自己功能为力力


In [99]:
text = "我对自[MASK][MASK]能为力"
encoded_input = tokenizer(text, return_tensors='tf')
[predictions] = model(encoded_input)

predicted_index = tf.argmax(predictions[0], axis=1)
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
print("".join(predicted_token[1:-1]))

我对自己功能为力


# Backtranslation
Translate chinese to english, and translate back.

In [117]:
import configparser
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.acs_exception.exceptions import ClientException
from aliyunsdkcore.acs_exception.exceptions import ServerException
from aliyunsdkalimt.request.v20181012.TranslateGeneralRequest import TranslateGeneralRequest

config = configparser.ConfigParser()
config.read("/home/admin/workspace/.secret")

client = AcsClient(config["account xjx"]["access_key"], 
                   config["account xjx"]["access_secret"], 
                   'cn-hangzhou')

In [140]:
l = list(range(1000))
l[::100]

[0, 100, 200, 300, 400, 500, 600, 700, 800, 900]

In [166]:
a = ['a']
b = ['b']
c = ['c']
np.concatenate([a, b, c])

array(['a', 'b', 'c'], dtype='<U1')

In [171]:
%%time
import numpy as np
import json
from joblib import Parallel, delayed
import multiprocessing

class BackTranslation:
    def __init__(self):
        self.bulk_size = 4800
    
    def back_translation(self, corpus):
        translated = self._bulk_translate(corpus, from_lang = "zh", to_lang = "en")
        back_translated = self._bulk_translate(translated, from_lang = "en", to_lang = "zh")
        return back_translated
    
    def _bulk_translate(self, corpus, from_lang = "zh", to_lang = "en"):
        translated = []
        text = ""

        def _do_translate(text, translated):
            translated_text = self._translate(text.strip(), from_lang = from_lang, to_lang = to_lang)
            translated +=  translated_text.split("\n")
            
        for seq in corpus:
            if len(text + seq) >= self.bulk_size:
                _do_translate(text, translated)
                text = seq + "\n"
            else:
                text += seq + "\n"
                
        _do_translate(text, translated)
        
        return translated
    
    def _translate(self, text, from_lang = "zh", to_lang = "en"):
        """
        The api of alimt has limit the maximum length of text to 5000 characters, maximum QPS to 50,
        so we should send the request in several bulks, with less than 250000 characters in each bulk.
        """
        request = TranslateGeneralRequest()
        request.set_accept_format('json')

        request.set_FormatType("text")
        request.set_SourceLanguage(from_lang)
        request.set_TargetLanguage(to_lang)

        request.set_SourceText(text)

        response = client.do_action_with_exception(request)
        response_json = json.loads(response)
    
        try:
            translated = response_json["Data"]["Translated"]
            return translated
        except:
            print(response_json)
            raise Exception("Response error")
    
def parallelize(df, func):
    partitions = multiprocessing.cpu_count()
    df_splited = np.array_split(df, partitions)
    df_splited = Parallel(
        n_jobs=partitions
    )(delayed(func)(df) for df in df_splited)
    return np.concatenate(df_splited)

back_translated = parallelize(df.seq.values, BackTranslation().back_translation)

CPU times: user 168 ms, sys: 2 ms, total: 170 ms
Wall time: 8.27 s


In [175]:
df["back_translated"] = back_translated

In [176]:
df.tail()

Unnamed: 0,seq,masked,masked_rate,filled,back_translated
1011,这样才能酿出酒精度12的葡萄酒,这样才能酿出[MASK][MASK]度12的葡萄酒,0.133333,这样才能酿出酒印度12的葡萄酒,为了酿造酒精 12 酒
1012,新疆的鲜食葡萄很出名,新疆的鲜食葡萄很出[MASK],0.1,新疆的鲜食葡萄很出色,新疆葡萄很有名
1013,酿酒葡萄品质也是很好的,酿酒葡萄[MASK][MASK][MASK]是很好的,0.272727,酿酒葡萄酒酒酒是很好的,酿酒葡萄品质也很好
1014,中国每年生产的葡萄酒,中[MASK][MASK]年生产的葡萄酒,0.2,中国早年生产的葡萄酒,生产的葡萄酒，每年在中国
1015,有很大的比重都是用新疆的酿酒葡萄汁酿造的,有很大的比重都是用新疆的酿酒葡萄[MASK][MASK]造的,0.1,有很大的比重都是用新疆的酿酒葡萄酒釀造的,有很大一部分用于新疆葡萄汁酿造


In [178]:
with open("./homebrewed/test_pred_result_regular_trained.txt") as f:
    corpus = f.readlines()
corpus = map(lambda x: corpus.split("[EOS]"), corpus)
corpus = list(corpus)

In [200]:
back_translated = parallelize(corpus, lambda corpus: BackTranslation().back_translation(corpus))
back_translated

CPU times: user 208 ms, sys: 494 ms, total: 702 ms
Wall time: 8.74 s


array(['如期完成周雄辉集团KPI考核内容的改进建议。', '成功举办员工大讲堂 ~ 智能高速专题在线培训如期举行。',
       '本周未完成和 @ 智博、 @ Liting沟通活动运营工具需求细化将推迟到下周。', ...,
       '本周，与区投资促进局签署了投资促进机构协议，预计下周将讨论结果。', '员工培训的IVR调整计划按时顺利完成。',
       '主要关注用于员工手册的编写 (参考上海公司)。'], dtype='<U70')

In [198]:
with open("./homebrewed/back_translated.txt", "w") as f:
    for (s1, s2) in zip(corpus, back_translated):
        f.write("{}[SEP]{}\n".format(s1, s2))

# Word dropping