# Build synthetic data

In [59]:
import json
import pandas as pd

# Load the JSON-lines sample: one JSON object per line.
# The encoding is given explicitly so the Chinese text is decoded the same way
# on every platform, and blank lines are skipped so they cannot crash json.loads.
with open("./webtext2019zh/web_text_zh_train_sample.json", "r", encoding="utf-8") as f:
    content = [json.loads(line) for line in f if line.strip()]

# One row per JSON record.
content = pd.DataFrame(content)

In [60]:
%%time
import re

def cut_sentences(text, min_len=3):
    """
    Split text into sentence-like fragments on punctuation and newlines.

    ASCII space characters are removed first. The text is then split on
    ASCII and full-width commas, periods and question marks, and on newlines.

    Parameters
    ----------
    text : str
        Raw text to split.
    min_len : int, default 3
        Fragments with fewer than ``min_len`` characters are dropped.

    Returns
    -------
    list of str
        The remaining fragments, in their original order.
    """
    text = text.replace(" ", "")
    # Raw string: inside a character class `,`, `.` and `?` need no escaping,
    # and a non-raw "\," / "\." / "\?" is an invalid string escape that newer
    # Python versions warn about.
    fragments = re.split(r"[,.?，。？\n]", text)
    return [fragment for fragment in fragments if len(fragment) >= min_len]

# Concatenate every document's `content` field, one document per line, so the
# newline also acts as a sentence boundary for cut_sentences.
text = "\n".join(content["content"].values)

# Split the whole corpus into short fragments and keep them as the `seq` column.
sentences = cut_sentences(text)
df = pd.DataFrame({"seq": sentences})

CPU times: user 2.04 ms, sys: 1e+03 ns, total: 2.04 ms
Wall time: 2.01 ms


# Mask filling
Insert masks at random positions in the Wikipedia sentences, then fill them in with the language model.

In [76]:
%%time
import random
import math

# Sample input string. No visible cell passes it to the masking functions;
# presumably kept for trying them out by hand (TODO confirm or remove).
s = "看到这一幕我简直兴奋得要上天了 更新于上一个答案几个小时后 剧透醒目 论这个世界上还有什么比妖怪夫妇联手更恐怖的事"

def mask_replacing(s):
    """
    Replace randomly chosen characters of ``s`` with "[MASK]".

    The number of masked characters is drawn uniformly between 1 and 50% of
    the string length (at least one character for non-empty input). Positions
    are drawn without replacement, so exactly that many distinct characters
    are masked, one "[MASK]" per character.

    NOTE: a later cell in this notebook defines another function with the
    same name, which shadows this one once that cell has run.

    Parameters
    ----------
    s : str
        Sentence to mask.

    Returns
    -------
    str
        The sentence with the sampled characters replaced by "[MASK]".
        An empty input is returned unchanged.
    """
    seq = list(s)
    seq_len = len(seq)
    if seq_len == 0:
        return ""
    # floor(seq_len * 0.5) is 0 for a single character; clamp to 1 so that
    # random.randint never receives an empty range.
    max_masks = max(1, math.floor(seq_len * 0.5))
    k = random.randint(1, max_masks)
    # random.sample draws distinct positions; random.choices could repeat a
    # position and therefore mask fewer than k characters.
    for i in random.sample(range(seq_len), k):
        seq[i] = "[MASK]"
    return "".join(seq)

def mask_replacing2(s):
    """
    Mask one contiguous span of characters in ``s``.

    A start position and a span length are sampled, and every character in
    ``s[start:start + length]`` is replaced by one "[MASK]" each. The length
    is at least 10% and at most 50% of the string (always at least one
    character), clipped so the span never runs past the end of the string.

    For strings of two or more characters the start is sampled from
    ``1 .. len(s) - 1``, so the first character is never masked. A
    single-character string is masked entirely, and an empty string is
    returned unchanged with a rate of 0.0.

    Parameters
    ----------
    s : str
        Sentence to mask.

    Returns
    -------
    pandas.Series
        Index ``["masked", "masked_rate"]``: the masked sentence and the
        fraction of characters that were masked.
    """
    seq_len = len(s)
    if seq_len == 0:
        # Nothing to mask; also avoids dividing by zero below.
        return pd.Series([s, 0.0], index=["masked", "masked_rate"])

    # random.randint(1, 0) would raise for a single character, so start at 0
    # in that case.
    start = random.randint(1, seq_len - 1) if seq_len > 1 else 0
    remaining = seq_len - start

    # At least 10% of the characters, but never fewer than one.
    min_length = max(min(math.floor(seq_len * 0.1), remaining), 1)
    # At most 50% of the characters, but never below the minimum.
    max_length = max(min_length, min(math.floor(seq_len * 0.5), remaining))
    length = random.randint(min_length, max_length)

    masked = s[:start] + "[MASK]" * length + s[(start + length):]
    return pd.Series([masked, length / seq_len], index=["masked", "masked_rate"])

# Seed the RNG so that re-running the notebook from a fresh kernel produces
# the same masked sentences (mask_replacing2 draws from the `random` module).
random.seed(42)

# One row per sentence, with the columns `masked` and `masked_rate`.
masked_seqs = pd.Series(sentences).apply(mask_replacing2)
df["masked"] = masked_seqs["masked"]
df["masked_rate"] = masked_seqs["masked_rate"]
df.head()

CPU times: user 429 ms, sys: 5.01 ms, total: 434 ms
Wall time: 432 ms


Unnamed: 0,seq,masked,masked_rate,filled
0,我对你仍有爱意,我对你仍有爱[MASK],0.142857,我对你仍有感心
1,我对自己无能为力,我[MASK][MASK][MASK][MASK]能为力,0.5,我对自己无力力力
2,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,0.198953,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...
3,我看了上帝之眼感觉还不错,我看了[MASK][MASK]之眼感觉还不错,0.166667,我看了上帝之眼感觉的感。
4,不喜欢那些讲技术的摄影书,不喜欢那些讲技术的摄[MASK][MASK],0.166667,不、、、、、、、、、、、


In [2]:
# Fill in the masked fields
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

# Local checkpoint directory shared by the tokenizer and the masked-LM model.
BERT_CHECKPOINT = '/home/admin/workspace/model/transformers/bert-base-multilingual-cased'

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(BERT_CHECKPOINT)

In [None]:
%%time
import torch

def mask_replacing(text):
    """
    Run the masked language model on ``text`` and return its predicted tokens.

    The text (which may contain "[MASK]" placeholders, e.g.
    "我对自[MASK][MASK]能为力") is tokenized with the notebook-level
    ``tokenizer`` and passed through the notebook-level ``model``. For every
    position the highest-scoring vocabulary entry is taken, and the resulting
    tokens are joined without separators, dropping the first and last
    position.

    NOTE(review): every position is replaced by the model's top prediction,
    not only the "[MASK]" positions, so unmasked characters can change too.
    Confirm that this is intended.

    NOTE: this redefines the name ``mask_replacing`` that an earlier cell
    already uses for the random character-masking function.

    Parameters
    ----------
    text : str
        Sentence to run through the model.

    Returns
    -------
    str
        Concatenation of the predicted tokens.
    """
    encoded_input = tokenizer(text, return_tensors='pt')
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # Index 0 selects the prediction scores whether the model returns a
        # plain tuple or a dict-like output object; unpacking with
        # `[predictions] = ...` only works for a 1-tuple.
        predictions = model(**encoded_input)[0]

    # Single-sentence batch: take row 0, then the arg-max over the vocabulary.
    predicted_index = torch.argmax(predictions[0], dim=1).tolist()
    predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
    # Drop the first and last position (presumably [CLS] / [SEP]).
    return "".join(predicted_token[1:-1])

# Run the language model over every masked sentence and store the result
# next to its input as the `filled` column.
filled_seqs = df["masked"].apply(mask_replacing)
filled_seqs = filled_seqs.rename("filled")
df["filled"] = filled_seqs
df.head()

In [98]:
# tensorflow model
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelWithLMHead

# Local checkpoint directory shared by the tokenizer and the TensorFlow model.
# Note: this rebinds the notebook-level `tokenizer` and `model` names.
BERT_CHECKPOINT = "/home/admin/workspace/model/transformers/bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFAutoModelWithLMHead.from_pretrained(BERT_CHECKPOINT)

All model checkpoint weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the model checkpoint at /home/admin/workspace/model/transformers/bert-base-multilingual-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


.我对自己功能为力力


In [99]:
# Smoke test of the TensorFlow model on one hand-masked sentence.
text = "我对自[MASK][MASK]能为力"
encoded_input = tokenizer(text, return_tensors='tf')
# Index 0 selects the prediction scores whether the model returns a plain
# tuple or a dict-like output object; unpacking with `[predictions] = ...`
# only works for a 1-tuple.
predictions = model(encoded_input)[0]

# Single-sentence batch: take row 0, then the arg-max over the vocabulary.
predicted_index = tf.argmax(predictions[0], axis=1)
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
# Drop the first and last position (presumably [CLS] / [SEP]).
print("".join(predicted_token[1:-1]))

我对自己功能为力


# Backtranslation
Translate the Chinese sentences to English, then translate them back to Chinese.

In [115]:
df.head()

Unnamed: 0,seq,masked,masked_rate,filled
0,我对你仍有爱意,我对你仍有爱[MASK],0.142857,我对你仍有爱。
1,我对自己无能为力,我[MASK][MASK][MASK][MASK]能为力,0.5,我的为心功能为力
2,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,0.198953,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...
3,我看了上帝之眼感觉还不错,我看了[MASK][MASK]之眼感觉还不错,0.166667,我看了光光之眼感觉还不错
4,不喜欢那些讲技术的摄影书,不喜欢那些讲技术的摄[MASK][MASK],0.166667,不喜欢那些讲技术的摄影师


# Word dropping