In [46]:
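# Extract text data from SQuAD 2.0 and inject random noise to generate a training set.
# Compare the error-correction performance of different models.
# English data only.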

import json
import tensorflow as tf
import math
import random
import nltk
from nltk.tokenize import word_tokenize
import string
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed
import pandas as pd

# Locations of the intermediate text files produced by each pipeline stage.
english_text_file = "./input_data/abstract_english_text_file"
clean_english_text_file = "./input_data/clean_english_text_file"
before_add_error_file = "./input_data/before_add_error"
after_add_error_file = "./input_data/after_add_error"
change_index_file = "./input_data/change_index_file"

# Probability of each kind of injected typo; the values are expected to sum to 1.
err_prob = dict(
    replace_one_char=0.4,
    add_one_char=0.2,
    delete_one_char=0.2,
    change_neighbor_order=0.2,
)

# max_error_rate = 0.2
# Noise alphabet: a-z, A-Z, space and a trailing period.
char_list = list(string.ascii_lowercase + string.ascii_uppercase + " .")
# Scales the per-line error budget relative to the line length.
max_error_line_length_rate = 0.05

# Parse a SQuAD 2.0 data file.
# Only the text is needed later; the nested SQuAD layout is walked by the caller.
def read_squad_examples(input_file, is_training):
    """Load a SQuAD-format JSON file and return its top-level ``"data"`` entry.

    Args:
        input_file: Path handed to ``tf.io.gfile.GFile`` and opened for reading.
        is_training: Accepted but not used by this function.

    Returns:
        The value stored under the ``"data"`` key of the parsed JSON document.
    """
    with tf.io.gfile.GFile(input_file, "r") as squad_fp:
        parsed = json.load(squad_fp)
    return parsed["data"]
    
def isEnglish(s):
    """Return True when ``s`` consists solely of ASCII characters, else False."""
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        # At least one byte lies outside the 7-bit ASCII range.
        return False
    return True

# Extract the English text; Chinese and any other non-ASCII entries are dropped.
def abstarct_sentence(json_data):
    """Write every ASCII-only question and first answer to ``english_text_file``.

    Walks ``json_data`` as article -> ``"paragraphs"`` -> ``"qas"``. For each
    QA entry the question is written on its own line when ``isEnglish`` accepts
    it; if the ``"answers"`` list is non-empty, the ``"text"`` of its first
    element is written the same way.

    Args:
        json_data: Iterable of article dicts in the layout described above.

    Side effects:
        Truncates and rewrites ``english_text_file`` and prints
        ``"abstract finish"`` when done.
    """
    # The context manager guarantees the file is closed even if a malformed
    # entry raises (e.g. a missing key) part-way through.
    with open(english_text_file, "w") as fp:
        for article in json_data:
            for paragraph in article["paragraphs"]:
                for qa in paragraph["qas"]:
                    question = qa["question"]
                    if isEnglish(question):
                        fp.write(question + "\n")
                    answer_struct = qa["answers"]
                    # Entries with an empty answer list contribute only the question.
                    if len(answer_struct) > 0:
                        answer = answer_struct[0]["text"]
                        if isEnglish(answer):
                            fp.write(answer + "\n")
    print("abstract finish")

def clean_file(file=english_text_file):
    """Copy ``file`` to ``clean_english_text_file`` without unusable lines.

    A line is dropped when, after removing surrounding newline characters, it
    is empty or equals the literal text ``null``.

    Args:
        file: Path of the text file to read; defaults to ``english_text_file``.

    Side effects:
        Truncates and rewrites ``clean_english_text_file``.
    """
    # Both handles are managed together so neither leaks if the other open()
    # fails or an error occurs while copying.
    with open(file, "r") as src, open(clean_english_text_file, "w") as dst:
        for line in src:
            line = line.strip("\n")
            if line != "" and line != "null":
                dst.write(line + "\n")

def map_prob_to_range_with_keys(key_prob):
    """Turn a ``{key: probability}`` mapping into cumulative ``[low, high]`` ranges.

    Keys are processed in the mapping's iteration order; each key receives the
    interval that starts where the previous one ended and whose width is its
    probability.

    Args:
        key_prob: Mapping from key to probability. The probabilities must sum
            to 1 (checked with ``math.isclose``).

    Returns:
        dict mapping each key to a two-element list ``[low, high]``.

    Raises:
        SystemExit: with code -1, after printing a message, when the
            probabilities do not sum to 1.
    """
    prob_sum = sum(map(float, key_prob.values()))
    if not math.isclose(prob_sum, 1):
        # Imported locally: the notebook's import cell does not bring in `sys`,
        # so the original exit call failed with NameError instead of exiting.
        import sys
        print("prob sum is not 1")
        sys.exit(-1)

    key_range = dict()
    threshold = 0
    for key, prob in key_prob.items():
        key_range[key] = [threshold, threshold + prob]
        threshold += prob
    return key_range

def choose_item_based_on_prob(key_range):
    """Pick a key whose ``[low, high)`` range contains a uniform draw from [0, 1].

    Args:
        key_range: Mapping from key to an indexable pair ``(low, high)``.

    Returns:
        The first key (in iteration order) with ``low <= draw < high``. When no
        range matches, the last key iterated is returned; for an empty mapping
        the result is ``None``.
    """
    draw = random.uniform(0, 1)
    fallback = None
    for name, bounds in key_range.items():
        if bounds[0] <= draw < bounds[1]:
            return name
        # Remember the most recent key so a draw that escapes every range
        # still yields a result.
        fallback = name
    return fallback

# Randomly corrupt a correct sentence into an erroneous one.
# Supported edits: insert a character, delete a character, swap two adjacent
# characters, replace a character.
def add_error_to_line(line):
    """Return ``(before, after)`` where ``after`` is ``line`` with random typos.

    The error budget is ``int(max_error_line_length_rate * len(line))``; the
    number of edits actually attempted is a uniform random fraction of that
    budget, truncated to an int. Each edit type is drawn according to
    ``err_prob``.

    Args:
        line: The clean input string.

    Returns:
        A tuple ``(before, after)``: ``before`` is ``line`` unchanged and
        ``after`` is the corrupted copy (it may equal ``before``).
    """
    max_error_num = int(max_error_line_length_rate * len(line))
    set_error_num = int(random.uniform(0, 1) * max_error_num)
    before = line
    after = line
    key_range = map_prob_to_range_with_keys(err_prob)
    cur_error_num = 0
    while cur_error_num < set_error_num:
        err_type = choose_item_based_on_prob(key_range)
        cur_error_num += 1

        # random_randint(n) needs n >= 1: a swap needs two characters, every
        # other edit needs one. Skip the edit instead of raising ValueError
        # when earlier deletions have made the string too short.
        required_len = 2 if err_type == "change_neighbor_order" else 1
        if len(after) < required_len:
            continue

        if err_type == "replace_one_char":
            random_char_position = random_randint(len(after))
            # char_list[:-1] leaves out the final entry of the alphabet.
            after = after[:random_char_position] + random_choice(char_list[:-1]) \
                + after[random_char_position + 1:]
        elif err_type == "add_one_char":
            random_char_position = random_randint(len(after))
            after = after[:random_char_position] + random_choice(char_list[:-1]) \
                + after[random_char_position:]
        elif err_type == "delete_one_char":
            random_char_position = random_randint(len(after))
            after = after[:random_char_position] + after[random_char_position + 1:]
        elif err_type == "change_neighbor_order":
            # Position is limited to len - 2 so that position + 1 is valid.
            random_char_position = random_randint(len(after) - 1)
            after = (after[:random_char_position] + after[random_char_position + 1]
                     + after[random_char_position] + after[random_char_position + 2:])
    return before, after

def add_error_to_english_file():
    """Create the clean/noisy sentence pair files from the cleaned text.

    For every line of ``clean_english_text_file`` the clean sentence is written
    to ``before_add_error_file`` and its corrupted version (from
    ``add_error_to_line``) to ``after_add_error_file``. The 1-based numbers of
    lines that actually changed are written to ``change_index_file``.

    Side effects:
        Truncates and rewrites the three output files and prints
        ``"add error finish"`` when done.
    """
    # All four handles are opened in one `with` so every file, including the
    # input file, is closed even if an error occurs while processing.
    with open(clean_english_text_file, "r") as ori_file, \
            open(before_add_error_file, "w") as before_file, \
            open(after_add_error_file, "w") as after_file, \
            open(change_index_file, "w") as change_index_fp:
        for line_num, line in enumerate(ori_file, start=1):
            line = line.strip("\n")
            before, after = add_error_to_line(line)
            before_file.write(before + "\n")
            after_file.write(after + "\n")
            if before != after:
                change_index_fp.write(str(line_num) + "\n")
    print("add error finish")

# Deep-learning models need characters converted to ints as input.
def char2int(file=before_add_error_file):
    """Build a character -> integer id table from the contents of ``file``.

    Every distinct character in the file (newline characters included, since
    the whole file is read as one string) gets an id equal to its position in
    sorted order.

    Args:
        file: Path of the text file to scan; defaults to
            ``before_add_error_file``.

    Returns:
        A tuple ``(mapping, size)`` with the ``{char: id}`` dict and its length.
    """
    # `with` closes the handle even if reading fails.
    with open(file, "r") as fp:
        all_text = fp.read()
    chars = sorted(set(all_text))
    char2int_mapping = {c: i for i, c in enumerate(chars)}
    return char2int_mapping, len(char2int_mapping)
    

In [47]:
# 1. Extract the question/answer text from SQuAD 2.0.
# read_squad_examples loads the JSON, abstarct_sentence writes the ASCII-only
# questions and answers to english_text_file, and clean_file copies them to
# clean_english_text_file without empty or "null" lines.
json_data = read_squad_examples("./input_data/train-v2.0.json", False)
abstarct_sentence(json_data)
clean_file()

abstract finish


In [48]:
# 2. Inject random noise errors into the extracted text file.
# Writes the clean/noisy sentence pairs and the list of changed line numbers.
add_error_to_english_file()

add error finish


In [49]:
# Character vocabulary built from the clean text.
char2int_table, table_size = char2int()

# Load one sentence per row by reading the files line by line.
# pd.read_table with default options is not suitable here: it consumes the
# first sentence as a header row, converts tokens such as "NA"/"nan" to NaN,
# and applies CSV quote parsing, which can merge lines and misalign X and Y.
with open(before_add_error_file, "r") as clean_fp:
    pd_Y = pd.DataFrame({"text": [line.rstrip("\n") for line in clean_fp]})

with open(after_add_error_file, "r") as noisy_fp:
    pd_X = pd.DataFrame({"text": [line.rstrip("\n") for line in noisy_fp]})

                                                     text
173905  What is mEant by the rea where ignals cannot b...
173906                      Are basic antennas expensive?
173907                             relatively inexpensive
                                                     text
173905  What is meant by the area where signals cannot...
173906                      Are basic antennas expensive?
173907                             relatively inexpensive


In [50]:
# Convert the "text" column of a pandas frame from characters to int ids.
def map_text_column_to_int(pd, char2int_table):
    """Replace each string in ``pd["text"]`` with its list of character ids.

    The frame is modified in place. Rows whose value cannot be iterated
    (for example a NaN float) are reported, left untouched, and their index
    is returned.

    Args:
        pd: DataFrame with a ``"text"`` column. Note that inside this function
            the name shadows the pandas module alias.
        char2int_table: Mapping from character to integer id.

    Returns:
        List of index labels of the rows that raised ``TypeError``.

    Raises:
        KeyError: if a character is missing from ``char2int_table``.
    """
    ignore_index = []
    encoded_column = []
    for index, sentence in pd["text"].items():
        try:
            encoded_column.append([char2int_table[c] for c in sentence])
        except TypeError as e:
            print(sentence)
            print(index)
            print(e)
            ignore_index.append(index)
            # Keep the original value so the column stays aligned with the index.
            encoded_column.append(sentence)
    # Assign the whole column at once: writing to the row objects yielded by
    # iterrows() is not guaranteed to reach the frame (they may be copies).
    pd["text"] = encoded_column
    return ignore_index

# Encode the clean (Y) and noisy (X) frames with the shared vocabulary; each
# call returns the index labels of rows it could not convert.
ignore_index1 = map_text_column_to_int(pd_Y, char2int_table)
ignore_index2 = map_text_column_to_int(pd_X, char2int_table)

# Empty lists mean every row was converted.
print(ignore_index1)
print(ignore_index2)

[]
[]


In [51]:
# Make sure both frames have the same number of rows.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly rather than wrapped in print().
pd_X.info()
pd_Y.info()
assert len(pd_X.index) == len(pd_Y.index), "pd_X and pd_Y must have the same number of rows"

                                                     text
173905  [56, 72, 65, 84, 1, 73, 83, 1, 77, 38, 65, 78,...
173906  [34, 82, 69, 1, 66, 65, 83, 73, 67, 1, 65, 78,...
173907  [82, 69, 76, 65, 84, 73, 86, 69, 76, 89, 1, 73...


In [52]:
def simple_char_model(output_len, chars=None):
    """Create and return an empty ``Sequential`` model (no layers are added yet).

    The encoder/decoder layers and the compile step below are commented out,
    so the returned model has no layers and is not compiled.

    Args:
        output_len: Not used by the active code; referenced only by the
            commented-out ``RepeatVector`` line.
        chars: Not used by the active code; referenced only by the
            commented-out layer definitions.

    Returns:
        The freshly constructed ``Sequential`` instance.

    NOTE(review): ``Sequential`` is not imported in the visible part of the
    notebook -- confirm it is in scope before this function is called.
    """
    print('Build model...')
    model = Sequential()
    
#     # "Encode" the input sequence using an RNN, producing an output of hidden_size
#     # note: in a situation where your input sequences have a variable length,
#     # use input_shape=(None, nb_feature).
#     for layer_number in range(CONFIG.input_layers):
#         model.add(recurrent.LSTM(CONFIG.hidden_size, input_shape=(None, len(chars)), kernel_initializer=CONFIG.initialization,
#                                  return_sequences=layer_number + 1 < CONFIG.input_layers))
#         model.add(Dropout(CONFIG.amount_of_dropout))
#     # For the decoder's input, we repeat the encoded input for each time step
#     model.add(RepeatVector(output_len))
#     # The decoder RNN could be multiple layers stacked or a single layer
#     for _ in range(CONFIG.output_layers):
#         model.add(recurrent.LSTM(CONFIG.hidden_size, return_sequences=True, kernel_initializer=CONFIG.initialization))
#         model.add(Dropout(CONFIG.amount_of_dropout))

#     # For each of step of the output sequence, decide which character should be chosen
#     model.add(TimeDistributed(Dense(len(chars), kernel_initializer=CONFIG.initialization)))
#     model.add(Activation('softmax'))

#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model