# Build synthetic data

In [231]:
import json
import pandas as pd

with open("./webtext2019zh/web_text_zh_train_sample.json", "r") as f:
    content = f.readlines()

content = map(json.loads, content)
content = pd.DataFrame(content)

In [60]:
%%time
import re

def cut_sentences(text, min_len=3):
    """
    Cut sentences by their length and punctuation, remove all spaces.
    """
    text = text.replace(" ", "")
    corpus = re.split("[\,\.\?，。？\n]", text)
    corpus = list(filter(lambda x: len(x) >= min_len, corpus))
    return corpus

text = "\n".join(content.content.values)
sentences = cut_sentences(text)
df = pd.DataFrame({
    "seq": sentences
})

CPU times: user 2.04 ms, sys: 1e+03 ns, total: 2.04 ms
Wall time: 2.01 ms


# Mask filling
Inserting masks at random positions in the Wikipedia sentences, and fill them with the language model.

In [253]:
%%time
import random
import math

s = "看到这一幕我简直兴奋得要上天了 更新于上一个答案几个小时后 剧透醒目 论这个世界上还有什么比妖怪夫妇联手更恐怖的事"

def mask_replacing(s):
    """
    The first strategy samples random words in the sentence and it replaces them with masks(one for each token).
    """
    seq = list(s)
    seq_len = len(s)
    # Sample from 1 to 90% chars of the sequence
    k = random.randint(1, math.floor(seq_len * 0.9))
    token_idx = random.choices(range(seq_len), k = k)
    for i in token_idx:
        seq[i] = "[MASK]"
    masked_rate = len(token_idx) / seq_len
    masked = "".join(seq)
    return pd.Series([masked, masked_rate], index=["masked", "masked_rate"])

def mask_replacing2(s):
    """
    The second strategy cre-ates contiguous sequences: 
    it samples a start po-sition s, a length l (uniformly distributed), 
    and it masks all the tokens spanned by words betweenpositions s and s + l.
    """
    seq_len = len(s)
    start = random.randint(1, seq_len-1)
    # At least 10% of words
    min_length = min(math.floor(seq_len * 0.1), seq_len - start)
    min_length = max(min_length, 1)
    # At most 90% of words
    max_length = min(math.floor(seq_len * 0.9), seq_len - start)
    max_length = max(min_length, max_length)
    length = random.choice(range(min_length, max_length+1))
    
    s = s[:start] + "[MASK]" * length + s[(start+length):]
    return pd.Series([s, length / seq_len], index=["masked", "masked_rate"])

masked_seqs = pd.Series(sentences).apply(mask_replacing)
df["masked"] = masked_seqs["masked"]
df["masked_rate"] = masked_seqs["masked_rate"]
df.head()

CPU times: user 437 ms, sys: 4.99 ms, total: 442 ms
Wall time: 441 ms


Unnamed: 0,seq,masked,masked_rate,filled,back_translated,dropped,dropped_rate
0,我对你仍有爱意,[MASK]对[MASK]仍有爱意,0.285714,我对你仍有爱。,我仍然爱着你,我对你仍爱意,0.142857
1,我对自己无能为力,[MASK][MASK]自[MASK]无能为[MASK],0.5,我的为心功能为力,我对自己无能为力。,对自己无能为力,0.125
2,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,讲一个同[MASK]的事儿吧那年他小学二年级数学课老师讲课的时[MASK][MASK]学跟同...,0.062827,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,让我们谈谈一个同学。当他二年级的数学老师讲课时，这位同学和他的同桌一起玩。同样不寻常的是，数...,讲一个学的事儿吧那年他小学二年级数学课老师讲课的时同学同一块儿玩儿小孩儿贪玩也很寻常不寻常的...,0.136126
3,我看了上帝之眼感觉还不错,[MASK][MASK]了上帝[MASK][MASK]感觉还不错,0.416667,我看了光光之眼感觉还不错,我看到了上帝的眼睛，感觉很好。,我看了帝之眼感觉还不错,0.083333
4,不喜欢那些讲技术的摄影书,[MASK]喜欢那[MASK]讲技[MASK]的摄[MASK][MASK],0.416667,不喜欢那些讲技术的摄影师,我不喜欢那些谈论技术的摄影书籍。,不喜欢那些讲技术影书,0.166667


In [98]:
# tensorflow model
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("/home/admin/workspace/model/transformers/bert-base-multilingual-cased")
model = TFAutoModelWithLMHead.from_pretrained("/home/admin/workspace/model/transformers/bert-base-multilingual-cased")

All model checkpoint weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the model checkpoint at /home/admin/workspace/model/transformers/bert-base-multilingual-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


.我对自己功能为力力


In [283]:
%%time
def mask_filling(text):
#     text = "我对自[MASK][MASK]能为力"
    encoded_input = tokenizer(text, return_tensors='tf')
    [predictions] = model(encoded_input)

    predicted_index = tf.argmax(predictions[0], axis=1)
    predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
    return "".join(predicted_token[1:-1])

filled_seqs = df.masked[:3].apply(mask_filling).rename("filled")
df["filled"] = filled_seqs
df.head()

CPU times: user 2.95 s, sys: 902 ms, total: 3.85 s
Wall time: 1.81 s


0                                              绝对人仍有爱意
1                                             无能自身无能为人
2    讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...
Name: filled, dtype: object

In [282]:
text = "我对自[MASK][MASK]能为力"
encoded_input = tokenizer(text, return_tensors='tf')
[predictions] = model(encoded_input)

predicted_index = tf.argmax(predictions[0], axis=1)
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
print("".join(predicted_token[1:-1]))

我对自己功能为力


# Backtranslation
Translate chinese to english, and translate back.

In [117]:
import configparser
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.acs_exception.exceptions import ClientException
from aliyunsdkcore.acs_exception.exceptions import ServerException
from aliyunsdkalimt.request.v20181012.TranslateGeneralRequest import TranslateGeneralRequest

config = configparser.ConfigParser()
config.read("/home/admin/workspace/.secret")

client = AcsClient(config["account xjx"]["access_key"], 
                   config["account xjx"]["access_secret"], 
                   'cn-hangzhou')

In [140]:
l = list(range(1000))
l[::100]

[0, 100, 200, 300, 400, 500, 600, 700, 800, 900]

In [166]:
a = ['a']
b = ['b']
c = ['c']
np.concatenate([a, b, c])

array(['a', 'b', 'c'], dtype='<U1')

In [171]:
%%time
import numpy as np
import json
from joblib import Parallel, delayed
import multiprocessing

class BackTranslation:
    def __init__(self):
        self.bulk_size = 4800
    
    def back_translation(self, corpus):
        translated = self._bulk_translate(corpus, from_lang = "zh", to_lang = "en")
        back_translated = self._bulk_translate(translated, from_lang = "en", to_lang = "zh")
        return back_translated
    
    def _bulk_translate(self, corpus, from_lang = "zh", to_lang = "en"):
        translated = []
        text = ""

        def _do_translate(text, translated):
            translated_text = self._translate(text.strip(), from_lang = from_lang, to_lang = to_lang)
            translated +=  translated_text.split("\n")
            
        for seq in corpus:
            if len(text + seq) >= self.bulk_size:
                _do_translate(text, translated)
                text = seq + "\n"
            else:
                text += seq + "\n"
                
        _do_translate(text, translated)
        
        return translated
    
    def _translate(self, text, from_lang = "zh", to_lang = "en"):
        """
        The api of alimt has limit the maximum length of text to 5000 characters, maximum QPS to 50,
        so we should send the request in several bulks, with less than 250000 characters in each bulk.
        """
        request = TranslateGeneralRequest()
        request.set_accept_format('json')

        request.set_FormatType("text")
        request.set_SourceLanguage(from_lang)
        request.set_TargetLanguage(to_lang)

        request.set_SourceText(text)

        response = client.do_action_with_exception(request)
        response_json = json.loads(response)
    
        try:
            translated = response_json["Data"]["Translated"]
            return translated
        except:
            print(response_json)
            raise Exception("Response error")
    
def parallelize(df, func):
    partitions = multiprocessing.cpu_count()
    df_splited = np.array_split(df, partitions)
    df_splited = Parallel(
        n_jobs=partitions
    )(delayed(func)(df) for df in df_splited)
    return np.concatenate(df_splited)

back_translated = parallelize(df.seq.values, BackTranslation().back_translation)

CPU times: user 168 ms, sys: 2 ms, total: 170 ms
Wall time: 8.27 s


In [175]:
df["back_translated"] = back_translated

In [176]:
df.tail()

Unnamed: 0,seq,masked,masked_rate,filled,back_translated
1011,这样才能酿出酒精度12的葡萄酒,这样才能酿出[MASK][MASK]度12的葡萄酒,0.133333,这样才能酿出酒印度12的葡萄酒,为了酿造酒精 12 酒
1012,新疆的鲜食葡萄很出名,新疆的鲜食葡萄很出[MASK],0.1,新疆的鲜食葡萄很出色,新疆葡萄很有名
1013,酿酒葡萄品质也是很好的,酿酒葡萄[MASK][MASK][MASK]是很好的,0.272727,酿酒葡萄酒酒酒是很好的,酿酒葡萄品质也很好
1014,中国每年生产的葡萄酒,中[MASK][MASK]年生产的葡萄酒,0.2,中国早年生产的葡萄酒,生产的葡萄酒，每年在中国
1015,有很大的比重都是用新疆的酿酒葡萄汁酿造的,有很大的比重都是用新疆的酿酒葡萄[MASK][MASK]造的,0.1,有很大的比重都是用新疆的酿酒葡萄酒釀造的,有很大一部分用于新疆葡萄汁酿造


# Word dropping

In [228]:
def word_dropping(text):
    """
    Randomly drop some words in the sequence
    """
    seq = list(text)
    text_len = len(text)
    k = random.choice([1] + list(range(1, int(text_len/3))))
    for i in random.choices(range(text_len), k = k):
        seq[i] = ""
    dropped_rate = k/text_len
    dropped = "".join(seq)
    return pd.Series([dropped, dropped_rate], index=["dropped", "dropped_rate"])

dropped = df.seq.apply(word_dropping)
df["dropped"] = dropped.dropped
df["dropped_rate"] = dropped.dropped_rate
df.head()

Unnamed: 0,seq,masked,masked_rate,filled,back_translated,dropped,dropped_rate
0,我对你仍有爱意,我对你仍有爱[MASK],0.142857,我对你仍有爱。,我仍然爱着你,我对你仍爱意,0.142857
1,我对自己无能为力,我[MASK][MASK][MASK][MASK]能为力,0.5,我的为心功能为力,我对自己无能为力。,对自己无能为力,0.125
2,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,0.198953,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,让我们谈谈一个同学。当他二年级的数学老师讲课时，这位同学和他的同桌一起玩。同样不寻常的是，数...,讲一个学的事儿吧那年他小学二年级数学课老师讲课的时同学同一块儿玩儿小孩儿贪玩也很寻常不寻常的...,0.136126
3,我看了上帝之眼感觉还不错,我看了[MASK][MASK]之眼感觉还不错,0.166667,我看了光光之眼感觉还不错,我看到了上帝的眼睛，感觉很好。,我看了帝之眼感觉还不错,0.083333
4,不喜欢那些讲技术的摄影书,不喜欢那些讲技术的摄[MASK][MASK],0.166667,不喜欢那些讲技术的摄影师,我不喜欢那些谈论技术的摄影书籍。,不喜欢那些讲技术影书,0.166667


# Blend these all

In [338]:
import json
import pandas as pd

with open("./webtext2019zh/web_text_zh_train_sample.json", "r") as f:
    data = f.readlines()

data = map(json.loads, content)
data = pd.DataFrame(content)

text = "\n".join(data.content.values)
references = cut_sentences(text)
references[:5]

['我对你仍有爱意',
 '我对自己无能为力',
 '讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常不寻常的是数学老师勃然大怒抓起我同学的衣领拎着他就直接走出教室教室是在四楼这老师竟然直接就把他悬在了走道护杆外的半空同学直接吓尿了（是真的尿）这老师还在骂骂咧咧的威胁以后还敢不敢上课瞎捣蛋结局同学还是安然被“收了回来”但从此他不敢在数学课上放肆从此他的数学成绩没有突破过及格线一个好的老师传',
 '我看了上帝之眼感觉还不错',
 '不喜欢那些讲技术的摄影书']

In [339]:
%%time
def make_candidates(references):
    """
    30% with mask filling rule1: scored by masked_rate
    30% with mask filling rule2: scored by masked_rate
    30% with back translation and word dropping: scored by dropped rate
    10% with back translation: score 0.98
    
    Returns
    -------
    candidates: Generated candidates with the same length of references
    scores: Arbitrary scores
    """
    # Do not modify input params
    refs = references.copy()
    random.shuffle(refs)
    references = refs.copy()
    
    ref_len = len(references)

    mf1_len = mf2_len = int(ref_len*0.3)
    candidates = []
    scores = []

    # Mask filling
    mf1 = map(mask_replacing, references[:mf1_len])
    del references[:mf1_len]
    mf2 = map(mask_replacing2, references[:mf2_len])
    del references[:mf2_len]
    mf = pd.DataFrame(list(mf1) + list(mf2))
    mf_filled = mf.masked.apply(mask_filling)

    candidates = mf_filled.tolist()
    scores += (1 - mf.masked_rate).values.tolist()
    
    # Back translation
    bt = parallelize(references, lambda refs: BackTranslation().back_translation(refs)).tolist()
    # Apply 30% with word dropping
    wd_len = int(ref_len * 0.3)
    bt_dropped = map(word_dropping, bt[:wd_len])
    bt_dropped = pd.DataFrame(bt_dropped)
    candidates += bt_dropped.dropped.tolist()
    scores += (1 - bt_dropped.dropped_rate).tolist()
    
    del bt[:wd_len]
    candidates += bt
    scores += [1.0] * len(bt)

    return refs, candidates, scores
   
[refs, candidates, scores] = make_candidates(references)

CPU times: user 8min 55s, sys: 2min 33s, total: 11min 28s
Wall time: 5min 59s


In [344]:
dataset = pd.DataFrame({
    "reference": refs,
    "candidate": candidates,
    "score": scores
})

In [348]:
dataset[dataset.score>0.9]

Unnamed: 0,reference,candidate,score
12,反正我的亲戚朋友们虽然不至于像我爱我家里面那样,反正我的亲戚朋友们虽然不至爱像我爱我家里面那样,0.956522
25,目前苹果在中国销量堪忧,目前苹果在中国销量堪布,0.909091
28,先说一说毁灭之锤奥格瑞姆你个锤子！这是黑暗之门后各位兽人酋长发自内心的咆哮这个是还在当角斗士的萨尔,先说一说毁灭之锤奥格瑞姆你个锤子！这是黑暗之门后各位兽人酋长发自内心的咆哮这个是还在当角斗士的萨尔,0.979592
30,毕竟这也是我见过最奇葩的人,毕竟这也是我见过最奇[UNK]的。,0.923077
31,就推荐两家平价外贸单：1,就推荐两家平价经贸单：1,0.916667
34,”这就是一个纯辅助技能,[UNK]这就是一个纯辅助技巧,0.909091
70,让你突然有一天全部放弃,让你果然有一天全部放弃,0.909091
85,不经意地挽了我爸的胳膊,不经意地挽了我国的[UNK][UNK],0.909091
100,舞台剧噂的男美丽星期天,舞台剧噂的男美丽星期天,0.909091
103,据老师透露课程方案全面改革,据老师透露工程方案全面改革,0.923077


In [343]:
# @TODO randomly mix other 0 scored references