# Build synthetic data

In [3]:
import json
import pandas as pd
import numpy as np

# Read dataset
content = pd.read_json("./webtext2019zh/web_text_zh_train_sample.json", lines=True)
content.head()

Unnamed: 0,answer_id,answerer_tags,content,desc,qid,star,title,topic
0,106030335,傲娇但不病,我对你仍有爱意，我对自己无能为力,情感,47448139,3,有哪些富有哲理又押韵的短句？（最好是情感方面）,情感
1,74154645,买不起刀的游侠,讲一个同学的事儿吧 那年他小学二年级 数学课 老师讲课的时候 同学跟同桌一块儿玩儿 小孩儿贪...,可以是正面或负面,37916573,4,有哪些老师影响了你的一生？,生活
2,72665930,除了我的专业什么都会一点,我看了上帝之眼感觉还不错。不喜欢那些讲技术的摄影书， 喜欢美用诗意和哲理来表现。,,37403484,4,对摄影师来说，一本好的摄影书是什么样子？,摄影
3,91554224,享乐一时/投身考研大军/老子要有钱,汉子已经被我收啦 现在是男票了哈哈哈哈哈 放两张聊天记录你们感受下,,40985905,12,一个撩妹高手遇到一个撩汉高手是一种怎样的体验？,恋爱技巧
4,89481895,爱打游戏并且也学游戏的少女,看到这一幕我简直兴奋得要上天了 更新于上一个答案几个小时后 剧透醒目 论这个世界上还有什么比...,纸牌屋第四季于2016年3月4日回归<img src,40991383,4,如何评价《纸牌屋》第四季？,美剧


In [11]:
%%time
import re

def cut_sentences(text, min_len=3, max_len=100):
    """
    Cut sentences by their length and punctuation, remove all spaces.
    """
    text = text.replace(" ", "")
    corpus = re.split("[\,\.\?，。？\n]", text)
    corpus = filter(lambda x: len(x) >= min_len, corpus)
    corpus = map(lambda x: x[:max_len], corpus)
    return list(corpus)

text = "\n".join(content.content.values)
sentences = cut_sentences(text)
df = pd.DataFrame({
    "seq": sentences
})

CPU times: user 1.65 ms, sys: 0 ns, total: 1.65 ms
Wall time: 1.63 ms


In [32]:
# Prepare data uploaded to ODPS
# command: odpscmd -e "tunnel upload ./raw_text.csv bleurt_dataset"
df_raw_text = df.copy()
df_raw_text.reset_index(inplace=True)
df_raw_text.to_csv("./data_generated/raw_text.csv", index=None, header=None)


# Mask filling
Inserting masks at random positions in the Wikipedia sentences, and fill them with the language model.

In [3]:
%%time
import random
import math

s = "看到这一幕我简直兴奋得要上天了 更新于上一个答案几个小时后 剧透醒目 论这个世界上还有什么比妖怪夫妇联手更恐怖的事"

def mask_replacing(s):
    """
    The first strategy samples random words in the sentence and it replaces them with masks(one for each token).
    """
    seq = list(s)
    seq_len = len(s)
    # Sample from 1 to 90% chars of the sequence
    k = random.randint(1, math.floor(seq_len * 0.9))
    token_idx = random.choices(range(seq_len), k = k)
    for i in token_idx:
        seq[i] = "[MASK]"
    masked_rate = len(token_idx) / seq_len
    masked = "".join(seq)
    return pd.Series([masked, masked_rate], index=["masked", "masked_rate"])

def mask_replacing2(s):
    """
    The second strategy cre-ates contiguous sequences: 
    it samples a start po-sition s, a length l (uniformly distributed), 
    and it masks all the tokens spanned by words betweenpositions s and s + l.
    """
    seq_len = len(s)
    start = random.randint(1, seq_len-1)
    # At least 10% of words
    min_length = min(math.floor(seq_len * 0.1), seq_len - start)
    min_length = max(min_length, 1)
    # At most 90% of words
    max_length = min(math.floor(seq_len * 0.9), seq_len - start)
    max_length = max(min_length, max_length)
    length = random.choice(range(min_length, max_length+1))
    
    s = s[:start] + "[MASK]" * length + s[(start+length):]
    return pd.Series([s, length / seq_len], index=["masked", "masked_rate"])

masked_seqs = pd.Series(sentences).apply(mask_replacing)
df["masked"] = masked_seqs["masked"]
df["masked_rate"] = masked_seqs["masked_rate"]
df.head()

CPU times: user 394 ms, sys: 13.5 ms, total: 407 ms
Wall time: 406 ms


Unnamed: 0,seq,masked,masked_rate
0,我对你仍有爱意,[MASK]对你仍有爱意,0.142857
1,我对自己无能为力,我对自己无[MASK]为力,0.125
2,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,讲一个[MASK]学[MASK]事[MASK][MASK]那年他小[MASK]二年级数[MA...,0.204188
3,我看了上帝之眼感觉还不错,我[MASK][MASK][MASK]帝之眼感[MASK][MASK]不错,0.666667
4,不喜欢那些讲技术的摄影书,[MASK][MASK][MASK][MASK]些[MASK]技[MASK][MASK]摄影书,0.75


In [4]:
# tensorflow model
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("/home/admin/workspace/model/transformers/bert-base-multilingual-cased")
model = TFAutoModelWithLMHead.from_pretrained("/home/admin/workspace/model/transformers/bert-base-multilingual-cased")

All model checkpoint weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the model checkpoint at /home/admin/workspace/model/transformers/bert-base-multilingual-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [58]:
%%time
def mask_filling(input_texts, batch_size=16):
#     input_texts = ["我对自[MASK][MASK]能为力"]
    if len(input_texts) < batch_size:
        input_texts = [input_texts]
    else:
        # Break input texts into chunks of batch_size
        # to avoid OOM
        n = batch_size
        input_texts = [input_texts[i * n:(i + 1) * n] \
                       for i in range((len(input_texts) + n - 1) // n )]
    
    filled_seqs = []
    for chunk_texts in input_texts:
        encoded_input = tokenizer(chunk_texts, padding=True, return_tensors='tf')
        [predictions] = model(encoded_input)
        predicted_index = tf.argmax(predictions, axis=2)
        predicted_tokens = [tokenizer.convert_ids_to_tokens(index) for index in predicted_index]
        filled_seqs += ["".join(predict_token[1:np.sum(mask)-1]) \
                       for [predict_token, mask] in zip(predicted_tokens, encoded_input["attention_mask"])]
    return filled_seqs

filled_seqs = mask_filling(df.masked.values.tolist()[0:100], batch_size=16)
# df["filled"] = filled_seqs
# df.head()

(16, 193)
(16, 26)
(16, 43)
(16, 40)
(16, 42)
(16, 25)
(4, 18)
CPU times: user 2.8 s, sys: 507 ms, total: 3.31 s
Wall time: 2.78 s


In [6]:
text = "我对自[MASK][MASK]能为力"
encoded_input = tokenizer(text, return_tensors='tf')
[predictions] = model(encoded_input)

predicted_index = tf.argmax(predictions[0], axis=1)
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
print("".join(predicted_token[1:-1]))

我对自己功能为力


# Backtranslation
Translate chinese to english, and translate back.

In [7]:
import configparser
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.acs_exception.exceptions import ClientException
from aliyunsdkcore.acs_exception.exceptions import ServerException
from aliyunsdkalimt.request.v20181012.TranslateGeneralRequest import TranslateGeneralRequest

config = configparser.ConfigParser()
config.read("/home/admin/workspace/.secret")

client = AcsClient(config["account xjx"]["access_key"], 
                   config["account xjx"]["access_secret"], 
                   'cn-hangzhou')

In [9]:
%%time
import numpy as np
import json
from joblib import Parallel, delayed
import multiprocessing

class BackTranslation:
    def __init__(self):
        self.bulk_size = 4800
    
    def back_translation(self, corpus):
        translated = self._bulk_translate(corpus, from_lang = "zh", to_lang = "en")
        back_translated = self._bulk_translate(translated, from_lang = "en", to_lang = "zh")
        return back_translated
    
    def _bulk_translate(self, corpus, from_lang = "zh", to_lang = "en"):
        translated = []
        text = ""

        def _do_translate(text, translated):
            translated_text = self._translate(text.strip(), from_lang = from_lang, to_lang = to_lang)
            translated +=  translated_text.split("\n")
            
        for seq in corpus:
            if len(text + seq) >= self.bulk_size:
                _do_translate(text, translated)
                text = seq + "\n"
            else:
                text += seq + "\n"
                
        _do_translate(text, translated)
        
        return translated
    
    def _translate(self, text, from_lang = "zh", to_lang = "en"):
        """
        The api of alimt has limit the maximum length of text to 5000 characters, maximum QPS to 50,
        so we should send the request in several bulks, with less than 250000 characters in each bulk.
        """
        request = TranslateGeneralRequest()
        request.set_accept_format('json')

        request.set_FormatType("text")
        request.set_SourceLanguage(from_lang)
        request.set_TargetLanguage(to_lang)

        request.set_SourceText(text)

        response = client.do_action_with_exception(request)
        response_json = json.loads(response)
    
        try:
            translated = response_json["Data"]["Translated"]
            return translated
        except:
            print(response_json)
            raise Exception("Response error")
    
def parallelize(df, func):
    partitions = multiprocessing.cpu_count()
    df_splited = np.array_split(df, partitions)
    df_splited = Parallel(
        n_jobs=partitions
    )(delayed(func)(df) for df in df_splited)
    return np.concatenate(df_splited)

back_translated = parallelize(df.seq.values, BackTranslation().back_translation)

CPU times: user 169 ms, sys: 986 ms, total: 1.15 s
Wall time: 7.51 s


In [10]:
df["back_translated"] = back_translated

In [11]:
df.tail()

Unnamed: 0,seq,masked,masked_rate,filled,back_translated
1011,这样才能酿出酒精度12的葡萄酒,这[MASK][MASK][MASK]酿[MASK][MASK][MASK][MASK][M...,0.8,,这样就可以酿造酒精含量为 12 的葡萄酒。
1012,新疆的鲜食葡萄很出名,新疆的鲜食[MASK]萄很出名,0.1,,新疆以其新鲜的葡萄而闻名。
1013,酿酒葡萄品质也是很好的,[MASK]酒[MASK][MASK]品[MASK][MASK]是很好[MASK],0.818182,,酿酒葡萄质量也很好
1014,中国每年生产的葡萄酒,[MASK][MASK]每年[MASK]产的[MASK]萄酒,0.6,,中国每年生产的葡萄酒
1015,有很大的比重都是用新疆的酿酒葡萄汁酿造的,有很大的比重都是用新疆的[MASK]酒葡萄汁酿造的,0.05,,很大一部分是用新疆的酿酒葡萄汁酿造的。


# Word dropping

In [12]:
def word_dropping(text):
    """
    Randomly drop some words in the sequence
    """
    seq = list(text)
    text_len = len(text)
    k = random.choice([1] + list(range(1, int(text_len/3))))
    for i in random.choices(range(text_len), k = k):
        seq[i] = ""
    dropped_rate = k/text_len
    dropped = "".join(seq)
    return pd.Series([dropped, dropped_rate], index=["dropped", "dropped_rate"])

dropped = df.seq.apply(word_dropping)
df["dropped"] = dropped.dropped
df["dropped_rate"] = dropped.dropped_rate
df.head()

Unnamed: 0,seq,masked,masked_rate,filled,back_translated,dropped,dropped_rate
0,我对你仍有爱意,我对[MASK]仍有[MASK]意,0.285714,我对我仍有同意,我仍然爱着你,我对仍有爱意,0.142857
1,我对自己无能为力,我对[MASK][MASK]无能为力,0.25,我对我，无能为力,我对自己无能为力。,我对自无能为力,0.125
2,讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常...,[MASK]一个同学[MASK][MASK]儿[MASK]那[MASK][MASK]小[MA...,0.439791,那一个同学一个儿子那一一小孩二小级数学课老师讲话的时候一一个同桌一个儿子的小孩很贪玩也很寻常...,让我们谈谈一个同学。当他二年级的数学老师讲课时，这位同学和他的同桌一起玩。同样不寻常的是，数...,讲一个同学的事儿吧那年他小学二年数课老师讲课的时候学跟同桌一块玩儿小孩儿贪玩也寻常不寻常是数...,0.094241
3,我看了上帝之眼感觉还不错,我[MASK]了[MASK][MASK][MASK]眼感[MASK]还[MASK]错,0.583333,,我看到了上帝的眼睛，感觉很好。,我了帝之眼感觉不错,0.25
4,不喜欢那些讲技术的摄影书,[MASK]喜欢那些[MASK][MASK][MASK]的[MASK][MASK]书,0.75,,我不喜欢那些谈论技术的摄影书籍。,不喜欢些讲技术的摄影书,0.083333


# Blend these all

In [33]:
import json
import pandas as pd

with open("./webtext2019zh/web_text_zh_train_sample.json", "r") as f:
    content = f.readlines()

data = map(json.loads, content)
data = pd.DataFrame(data)

text = "\n".join(data.content.values)
references = cut_sentences(text)
references[:5]

['我对你仍有爱意',
 '我对自己无能为力',
 '讲一个同学的事儿吧那年他小学二年级数学课老师讲课的时候同学跟同桌一块儿玩儿小孩儿贪玩也很寻常不寻常的是数学老师勃然大怒抓起我同学的衣领拎着他就直接走出教室教室是在四楼这老师竟然直接就把他悬在了走道护杆外的半空同学直接吓尿了（是真的尿）这老师还在骂骂咧咧的威胁以后还敢不敢上课瞎捣蛋结局同学还是安然被“收了回来”但从此他不敢在数学课上放肆从此他的数学成绩没有突破过及格线一个好的老师传',
 '我看了上帝之眼感觉还不错',
 '不喜欢那些讲技术的摄影书']

In [28]:
import logging
logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

In [39]:
%%time
def make_candidates(references):
    """
    30% with mask filling rule1: scored by masked_rate
    30% with mask filling rule2: scored by masked_rate
    30% with back translation and word dropping: scored by dropped rate
    10% with back translation: score 0.98
    
    Returns
    -------
    candidates: Generated candidates with the same length of references
    scores: Arbitrary scores
    """
    # Do not modify input params
    references = references.copy()
    random.shuffle(references)
    refs = []
    
    ref_len = len(references)

    # Apply mask filling
    logger.info("Apply mask filling ...")
    mf1_len = mf2_len = int(ref_len*0.3)
    candidates = []
    scores = []

    # Mask filling
    mf1 = map(mask_replacing, references[:mf1_len])
    refs += references[:mf1_len]
    del references[:mf1_len]
    mf2 = map(mask_replacing2, references[:mf2_len])
    refs += references[:mf2_len]
    del references[:mf2_len]
    mf = pd.DataFrame(list(mf1) + list(mf2))
    mf_filled = mf.masked.apply(mask_filling)

    candidates = mf_filled.tolist()
    scores += (1 - mf.masked_rate).values.tolist()
    
    # Back translation (bt)
    logger.info("Apply back translation ...")
    bt = parallelize(references, lambda refs: BackTranslation().back_translation(refs))
    # Drop samples where refs and back translationed excactly same
    df_bt = pd.DataFrame({
        "refs": references,
        "bt": bt
    })
    df_bt = df_bt[df_bt.refs != df_bt.bt]

    # Replace references and bt for later use
    logger.info("Dropped {} samples".format(len(bt) - df_bt.shape[0]))
    refs += df_bt.refs.tolist()
    references = df_bt.refs.tolist()
    bt = df_bt.bt.tolist()
    
    # Apply 30% with word dropping
    wd_len = int(df_bt.shape[0] * 0.75)
    bt_dropped = map(word_dropping, bt[:wd_len])
    bt_dropped = pd.DataFrame(bt_dropped)
    candidates += bt_dropped.dropped.tolist()
    scores += (1 - bt_dropped.dropped_rate).tolist()
    
    del bt[:wd_len]
    candidates += bt
    scores += [0.98] * len(bt)

    return refs, candidates, scores
   
[refs, candidates, scores] = make_candidates(references)

INFO:root:Apply mask filling ...
2020-08-14 12:18:01,172 INFO     Apply mask filling ...
INFO:root:Apply back translation ...
2020-08-14 12:18:01,173 INFO     Apply back translation ...
INFO:root:Dropped 85 samples
2020-08-14 12:18:05,964 INFO     Dropped 85 samples


CPU times: user 559 ms, sys: 11.6 ms, total: 570 ms
Wall time: 5.16 s


In [40]:
len(refs), len(candidates), len(scores)

(931, 931, 931)

In [15]:
dataset = pd.DataFrame({
    "reference": refs,
    "candidate": candidates,
    "score": scores
})

In [17]:
dataset[dataset.score>0.95]

Unnamed: 0,reference,candidate,score
3,财企2007194号给你们参考：一研发活动直接消耗的材料燃料和动力费用,财企业007##19##4号给你们参考：一研发活动直接消耗的材料燃料和动力费用,0.971429
49,反正黄赌毒各种我价值观上不好的行为我都猜了一遍,反正黄赌上各种我价值观上不好的行为我都猜了一遍,0.956522
615,看到这一幕我简直兴奋得要上天了更新于上一个答案几个小时后剧透醒目论这个世界上还有什么比妖怪夫...,看到这一幕，我太兴奋了，我要去天堂了。我更新了最后一个答案几个小时后，剧引人注目。世界上没有...,0.953125
644,你觉得我们要不要使用千度的代码机器或者370的代码机器,你认为我们应该用几千台码机还是 70 台码机,0.956522
656,叙利亚西北部伊德利卜省出现的使用化学武器造成大规模平民伤亡,叙利亚西北部Idleb省使用化学武器造成大规模民伤亡,0.962963
677,我也没找到下载的链接）：1支持JVM（支持Java应用）2不开源3号称十万个应用4能在移动设备,"我也有找到下载链接):1 支持JVM 支持Java应用程序) 2 不开源 3 声称 100,...",0.970588
685,二企业在职研发人员的工资奖金津贴补贴社会保险费住房公积金等人工费用以及外聘研发人员的劳务费,第二，企业在职R & D人员的工资和金补贴社会保险等劳动成本，住房公积金和外部研发人员的劳动...,0.961538
696,女主盯着我看了半天我以为我衣服穿错了,女主角盯着我看了很久。我以为我穿衣服了。,0.952381
714,先说一说毁灭之锤奥格瑞姆你个锤子！这是黑暗之门后各位兽人酋长发自内心的咆哮这个是还在当角斗士的萨尔,先说说毁灭之锤吧，奥格丽德，你这个锤子!这是黑暗之门后兽人首领内心的咆哮。这是萨尔，他仍然是...,0.980769
721,让一些不牛逼的大学也有一些学术性理论性很强的专业,让一些不牛逼的大学有一些学术和理论专业。,0.952381


In [18]:
dataset.to_csv("./data_generated/dataset.csv", index=None)

In [343]:
# @TODO randomly mix other 0 scored references

In [19]:
data = pd.read_csv("./data_generated/dataset.csv")

In [43]:
dataset.to_csv("./data_generated/dataset.csv", index=None, header=None)

In [44]:
dataset.to_csv("./data_generated/dataset.csv", index=None, header=None, mode='a')

In [45]:
data = pd.read_csv("./data_generated/dataset.csv")

In [48]:
dataset.shape

(1016, 3)

In [51]:
dataset.apply(lambda)

reference


In [53]:
row = dataset.iloc[0]

In [63]:
from os import path

def save_data(dataset):
    """
    Save data to csv and jsonl
    jsonl example: {"candidate":"吴承恩是著名文学家","reference":"吴承恩是著名文学家","score":1}
    """
    csv_file = "./data_generated/dataset.csv"
    jsonl_file = "./data_generated/dataset.jsonl"
    
    mode = "w"
    if path.exists(csv_file):
        mode = "a"
        
    dataset.to_csv(csv_file, index=None, header=None, mode=mode)
    
    def write_row(f, row):
        f.write(row.to_json(force_ascii=False) + "\n")
    
    with open(jsonl_file, mode) as f:
        dataset.apply(lambda row: write_row(f, row), axis=1)
        
save_data(dataset)

In [69]:
checkpoint = 0
batch_size = 30
content = []
for i in range(100):
    if i > checkpoint and i <= (checkpoint + batch_size):
        content.append(i)
content

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30]

In [117]:
import time
def readfile(filename, checkpoint = 0, batch_size = 300):
    content = []
    i = 0
    with open(filename, "r") as f:
        while True:
            i += 1
            if i > checkpoint and i <= (checkpoint + batch_size):
                line = f.readline()
                if line != "":
                    content.append(line)
                else:
                    checkpoint = None
                    break
            elif i <= checkpoint:
                next(f)
            else:
                checkpoint = i - 1
                break
    return content, checkpoint

checkpoint = 0
for i in range(10):
    [content, checkpoint] = readfile("./webtext2019zh/web_text_zh_train_sample.json", 
                                     checkpoint = checkpoint, 
                                     batch_size = 21)
    time.sleep(0.5)
    print(len(content), checkpoint)
    
    if checkpoint is None:
        break

21 21
21 42
21 63
21 84
16 None


In [108]:
content[-1]

'{"qid": 58303597, "title": "空姐的行李箱里有什么？", "desc": "", "topic": "空乘", "star": 19, "content": "想知道吗？ 我打开给你看啊。", "answer_id": 165171803, "answerer_tags": "斯人若彩虹，遇上方知有。"}\n'

In [104]:
len(content)

25

In [119]:
1e5

100000.0

# Post process

Generate negetive samples scored 0 and positive samples scored 1,
which each part has the 10% size of origin dataset,
split into train/dev set

In [2]:
import pandas as pd

In [3]:
dataset = pd.read_csv("./data_generated/dataset.csv", header=None, names=["reference", "candidate", "score"])

In [11]:
dataset.shape

(9077648, 3)

In [12]:
n_neg_samples = int(dataset.shape[0]/5)
n_pos_samples = int(dataset.shape[0]/5)

neg_samples = dataset.sample(n_neg_samples).reset_index(drop=True)
neg_samples["candidate"] = dataset["reference"].sample(n_neg_samples).values
neg_samples["score"] = 0

pos_samples = dataset.sample(n_neg_samples).reset_index(drop=True)
pos_samples["candidate"] = pos_samples["reference"].values
pos_samples["score"] = 1

In [13]:
df = pd.concat([
    dataset, neg_samples, pos_samples
], ignore_index=True)

In [14]:
df = df.dropna()

In [15]:
from sklearn.model_selection import train_test_split
train_set, dev_set = train_test_split(df, test_size=0.01)

In [16]:
train_set.score.describe()

count    1.258010e+07
mean     6.477867e-01
std      3.465629e-01
min      0.000000e+00
25%      4.000000e-01
50%      7.857143e-01
75%      9.285714e-01
max      1.000000e+00
Name: score, dtype: float64

In [17]:
train_set.shape

(12580095, 3)

In [19]:
%%time
# Write to jsonl file

train_set.to_json("./data_generated/rating_train.jsonl", orient='records', force_ascii=False, lines=True)
dev_set.to_json("./data_generated/rating_dev.jsonl", orient='records', force_ascii=False, lines=True)

CPU times: user 32.4 s, sys: 9.09 s, total: 41.5 s
Wall time: 44 s
