In [42]:
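# Extract text data from SQuAD 2.0 and inject random noise to build a training set.
# Compare how well different models correct the injected errors.
# English-language data only.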

import json
import tensorflow as tf
import math
import random
import nltk
from nltk.tokenize import word_tokenize
import string
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed
import pandas as pd
import sys
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense, LSTM, Embedding

# Intermediate files produced by the data-preparation functions in this file.
english_text_file = "./input_data/abstract_english_text_file"  # questions/answers written by abstarct_sentence
clean_english_text_file = "./input_data/clean_english_text_file"  # filtered lines written by clean_file
before_add_error_file = "./input_data/before_add_error"  # original lines, written by gen_X_y
after_add_error_file = "./input_data/after_add_error"  # noisy lines, written by gen_X_y
change_index_file = "./input_data/change_index_file"  # 1-based numbers of the lines that were altered

# Probability of each error type injected by add_error_to_line.
# The values must sum to 1; map_prob_to_range_with_keys exits the process otherwise.
err_prob = {
    "replace_one_char": 0.4,
    "add_one_char": 0.2,
    "delete_one_char": 0.2,
    "change_neighbor_order": 0.2
}

# max_error_rate = 0.2
# Characters used for replacements/insertions. add_error_to_line draws from
# char_list[:-1], so the trailing "." is never actually chosen.
char_list = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
# Upper bound on the number of injected errors, as a fraction of the line length.
max_error_line_length_rate = 0.05

# Parse the SQuAD 2.0 data file.
# Only the text is needed here; the SQuAD 2.0 layout is somewhat unusual (deeply nested).
def read_squad_examples(input_file, is_training):
    """Load a SQuAD-style JSON file and return its top-level ``"data"`` entry.

    Args:
        input_file: Path of the JSON file; it is opened through ``tf.io.gfile``.
        is_training: Accepted for interface compatibility; not used by this
            function.

    Returns:
        The value stored under the ``"data"`` key of the parsed JSON document.
    """
    with tf.io.gfile.GFile(input_file, "r") as squad_file:
        parsed = json.load(squad_file)
    return parsed["data"]
    
def isEnglish(s):
    """Return True when every character of *s* is plain ASCII.

    The string is encoded to UTF-8 and the resulting bytes are decoded as
    ASCII; any character outside the ASCII range makes the decode fail.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
        return True
    except UnicodeDecodeError:
        return False

# Extract the English text; drop Chinese and any other non-English entries.
def abstarct_sentence(json_data):
    """Write the ASCII-only questions and first answers to ``english_text_file``.

    For every question/answer entry, the question is written when
    ``isEnglish`` accepts it, and the text of the first answer (if the entry
    has any answers) is written under the same condition. Each item goes on
    its own line. The output file is truncated on every call.

    Args:
        json_data: Iterable of articles, each a mapping with a
            ``"paragraphs"`` list whose items hold a ``"qas"`` list of
            mappings with ``"question"`` and ``"answers"`` keys.

    Raises:
        KeyError: If an entry lacks one of the expected keys.
    """
    # The context manager guarantees the file is flushed and closed even when
    # a malformed entry raises part-way through.
    with open(english_text_file, "w") as fp:
        for article in json_data:
            for paragraph in article["paragraphs"]:
                for qa in paragraph["qas"]:
                    question = qa["question"]
                    if isEnglish(question):
                        fp.write(question + "\n")
                    answers = qa["answers"]
                    if answers:
                        # Only the first listed answer is kept.
                        answer = answers[0]["text"]
                        if isEnglish(answer):
                            fp.write(answer + "\n")
    print("abstract finish")

def clean_file(file=english_text_file):
    """Copy the usable lines of *file* into ``clean_english_text_file``.

    Each line is stripped of surrounding whitespace and kept only when the
    stripped text is longer than 10 characters. The output file is truncated
    on every call.

    Args:
        file: Path of the text file to filter.
    """
    # Context managers close both handles even if reading or writing fails.
    with open(file, "r") as src, open(clean_english_text_file, "w") as dst:
        for line in src:
            line = line.strip()
            # The length test also rejects empty lines and the literal "null",
            # both of which are shorter than 11 characters.
            if len(line) > 10:
                dst.write(line + "\n")

def map_prob_to_range_with_keys(key_prob):
    """Turn a ``{key: probability}`` mapping into cumulative ``[low, high]`` ranges.

    The ranges follow the mapping's iteration order: the first key starts at 0
    and each following key starts where the previous one ended.

    Args:
        key_prob: Mapping from key to probability; the values must sum to 1.

    Returns:
        A dict mapping every key to a two-element list ``[low, high]``.

    Prints a message and calls ``sys.exit(-1)`` when the probabilities do not
    sum to 1 (within ``math.isclose`` tolerance).
    """
    total = sum(float(p) for p in key_prob.values())
    if not math.isclose(total, 1):
        print("prob sum is not 1")
        sys.exit(-1)

    ranges = {}
    lower = 0
    for name, weight in key_prob.items():
        upper = lower + weight
        ranges[name] = [lower, upper]
        lower = upper
    return ranges

def choose_item_based_on_prob(key_range):
    """Pick a key at random according to its ``[low, high)`` range.

    A single value is drawn with ``random.uniform(0, 1)`` and the first key
    whose range contains it is returned.

    Args:
        key_range: Mapping from key to a ``[low, high]`` pair.

    Returns:
        The matching key. If no range contains the drawn value, the last key
        of the mapping is returned; ``None`` when the mapping is empty.
    """
    draw = random.uniform(0, 1)
    candidate = None
    for candidate, bounds in key_range.items():
        if bounds[0] <= draw < bounds[1]:
            return candidate
    # No range matched: fall back to the final key (or None when empty).
    return candidate

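# Randomly corrupt a correct sentence into an erroneous one.
# Error types: insert a character, delete a character, swap two adjacent characters, replace a character.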
def add_error_to_line(line):
    """Inject a random number of character-level errors into *line*.

    The number of edits is drawn uniformly between 0 and
    ``max_error_line_length_rate * len(line)`` (both truncated to int). Each
    edit's type is chosen according to ``err_prob``:

    * ``replace_one_char``: overwrite one character with a random one,
    * ``add_one_char``: insert a random character before an existing one,
    * ``delete_one_char``: remove one character,
    * ``change_neighbor_order``: swap one character with its right neighbour.

    Args:
        line: The correct text.

    Returns:
        A ``(before, after)`` tuple: the untouched input and the corrupted
        text. The two are equal when no edit was drawn, or when the random
        edits happen to leave the text unchanged.
    """
    max_error_num = int(max_error_line_length_rate * len(line))
    set_error_num = int(random.uniform(0, 1) * max_error_num)
    key_range = map_prob_to_range_with_keys(err_prob)
    # The last entry of char_list (".") is deliberately left out of the pool.
    candidate_chars = char_list[:-1]

    after = line
    for _ in range(set_error_num):
        err_type = choose_item_based_on_prob(key_range)
        # A swap needs two characters and the other edits need one; skip the
        # edit instead of letting randint raise on an empty range. The skipped
        # edit still counts towards set_error_num, so the loop always ends.
        min_len = 2 if err_type == "change_neighbor_order" else 1
        if len(after) < min_len:
            continue
        if err_type == "replace_one_char":
            pos = random_randint(len(after))
            after = after[:pos] + random_choice(candidate_chars) + after[pos + 1:]
        elif err_type == "add_one_char":
            pos = random_randint(len(after))
            after = after[:pos] + random_choice(candidate_chars) + after[pos:]
        elif err_type == "delete_one_char":
            pos = random_randint(len(after))
            after = after[:pos] + after[pos + 1:]
        elif err_type == "change_neighbor_order":
            # pos is the left element of the pair, so it stops one short of the end.
            pos = random_randint(len(after) - 1)
            after = after[:pos] + after[pos + 1] + after[pos] + after[pos + 2:]
    return line, after

def map_line_to_int(line, char2int_table):
    """Encode *line* as a comma-separated string of integer character codes.

    Args:
        line: Text to encode.
        char2int_table: Mapping from character to its integer code.

    Returns:
        The codes of the characters of *line*, in order, joined with ``","``.
        An empty line yields an empty string.

    Raises:
        KeyError: If a character of *line* is missing from the table.
    """
    codes = []
    for ch in line:
        codes.append(str(char2int_table[ch]))
    return ",".join(codes)

def gen_X_y(char2int_table):
    """Build the noisy/clean training pairs from ``clean_english_text_file``.

    Every line of the cleaned corpus is passed through ``add_error_to_line``.
    As a side effect the function rewrites three files:

    * ``before_add_error_file``: the original lines,
    * ``after_add_error_file``: the corrupted lines,
    * ``change_index_file``: the 1-based numbers of the lines whose corrupted
      text differs from the original.

    Args:
        char2int_table: Mapping from character to integer code, used to
            encode both versions of every line.

    Returns:
        A ``(pd_X, pd_Y)`` tuple of single-column (``"text"``) DataFrames.
        ``pd_X`` holds the encoded corrupted lines and ``pd_Y`` the encoded
        original lines, each as a comma-separated string of codes.
    """
    X_encoding = []
    Y_encoding = []
    # All four handles are closed (and the outputs flushed) when the block
    # exits, including when a line fails to encode.
    with open(before_add_error_file, "w") as before_file, \
            open(after_add_error_file, "w") as after_file, \
            open(change_index_file, "w") as change_index_fp, \
            open(clean_english_text_file, "r") as ori_file:
        for line_num, line in enumerate(ori_file, start=1):
            line = line.strip("\n")
            correct, mistaken = add_error_to_line(line)
            before_file.write(correct + "\n")
            after_file.write(mistaken + "\n")
            if correct != mistaken:
                change_index_fp.write(str(line_num) + "\n")
            X_encoding.append(map_line_to_int(mistaken, char2int_table))
            Y_encoding.append(map_line_to_int(correct, char2int_table))
    pd_X = pd.DataFrame(X_encoding, columns=["text"])
    pd_Y = pd.DataFrame(Y_encoding, columns=["text"])
    return pd_X, pd_Y

# Deep-learning models need characters converted to integers before they can be used as input.
def char2int(file=clean_english_text_file):
    """Build a character-to-integer code table from the contents of *file*.

    Every distinct character in the file (newlines included) gets a code equal
    to its position in the sorted list of distinct characters.

    Args:
        file: Path of the text file to scan.

    Returns:
        A ``(mapping, size)`` tuple: the ``{char: code}`` dict and its length.
    """
    # The context manager closes the handle even if reading fails.
    with open(file, "r") as fp:
        all_text = fp.read()
    chars = sorted(set(all_text))
    char2int_mapping = {c: i for i, c in enumerate(chars)}
    return char2int_mapping, len(char2int_mapping)
    

In [43]:
# Step 1: extract the question/answer text from the SQuAD 2.0 training file,
# then filter the extracted lines into the cleaned corpus file.
json_data = read_squad_examples("./input_data/train-v2.0.json", is_training=False)
abstarct_sentence(json_data)
clean_file()

abstract finish


In [44]:
# Step 2: build the character -> integer code table from the cleaned corpus.
char2int_table, table_size = char2int()
print(char2int_table)
print(table_size)

# Step 3: inject random noise into the extracted lines and encode both versions.
pd_X, pd_Y = gen_X_y(char2int_table)
# Sanity check: inputs and targets must have the same number of rows.
if len(pd_X) != len(pd_Y):
    print("X and Y don't have same size")
    print(pd_X.info())
    print(pd_Y.info())
    sys.exit(-1)


{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '/': 16, '0': 17, '1': 18, '2': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26, ':': 27, ';': 28, '<': 29, '=': 30, '>': 31, '?': 32, '@': 33, 'A': 34, 'B': 35, 'C': 36, 'D': 37, 'E': 38, 'F': 39, 'G': 40, 'H': 41, 'I': 42, 'J': 43, 'K': 44, 'L': 45, 'M': 46, 'N': 47, 'O': 48, 'P': 49, 'Q': 50, 'R': 51, 'S': 52, 'T': 53, 'U': 54, 'V': 55, 'W': 56, 'X': 57, 'Y': 58, 'Z': 59, '[': 60, '\\': 61, ']': 62, '_': 63, '`': 64, 'a': 65, 'b': 66, 'c': 67, 'd': 68, 'e': 69, 'f': 70, 'g': 71, 'h': 72, 'i': 73, 'j': 74, 'k': 75, 'l': 76, 'm': 77, 'n': 78, 'o': 79, 'p': 80, 'q': 81, 'r': 82, 's': 83, 't': 84, 'u': 85, 'v': 86, 'w': 87, 'x': 88, 'y': 89, 'z': 90, '{': 91, '|': 92, '}': 93, '~': 94}
95


In [46]:
# Preview the first five encoded rows of the noisy inputs and of the clean targets.
print(pd_X.head(5))
print(pd_Y.head(5))

                                                text
0  56,72,69,78,1,68,73,68,1,35,69,89,79,78,67,69,...
1    73,78,1,84,72,69,1,76,65,84,69,1,18,26,26,17,83
2  56,72,65,84,1,65,82,69,65,83,1,68,73,68,1,35,6...
3  83,73,78,71,73,78,71,1,65,78,68,1,68,65,78,67,...
4  56,72,69,78,1,68,73,68,1,35,69,89,79,78,67,69,...
                                                text
0  56,72,69,78,1,68,73,68,1,35,69,89,79,78,67,69,...
1    73,78,1,84,72,69,1,76,65,84,69,1,18,26,26,17,83
2  56,72,65,84,1,65,82,69,65,83,1,68,73,68,1,35,6...
3  83,73,78,71,73,78,71,1,65,78,68,1,68,65,78,67,...
4  56,72,69,78,1,68,73,68,1,35,69,89,79,78,67,69,...


In [56]:
# Convert the Y vectors to one-hot form so that a softmax loss can be computed.
one_hot_pd_Y = []
for row in pd_Y.iterrows():
    row_transfer = []
    # iterrows() yields (index, Series) pairs; the "text" cell holds the
    # comma-separated integer codes of one line.
    array = row[1]["text"].split(",")
    int_array = list(map(int, array))
    # One one-hot vector of width table_size per character code.
    transfer = to_categorical(int_array, num_classes=table_size)
    row_transfer.append(transfer)
    one_hot_pd_Y.append(row_transfer)
    # NOTE(review): this break ends the loop after the first row, so only one
    # sample is converted. It looks like a debugging leftover (possibly to
    # limit memory use) - confirm before using one_hot_pd_Y for training.
    break

pd_Y_transfer = pd.DataFrame(one_hot_pd_Y, columns=["text"])
print(pd_Y_transfer[0:5])


[56, 72, 69, 78, 1, 68, 73, 68, 1, 35, 69, 89, 79, 78, 67, 69, 1, 83, 84, 65, 82, 84, 1, 66, 69, 67, 79, 77, 73, 78, 71, 1, 80, 79, 80, 85, 76, 65, 82, 32]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [None]:
def simple_char_model():
    """Build and compile a minimal single-LSTM Keras model, then print its summary.

    The model is one 100-unit LSTM followed by a softmax activation, compiled
    with categorical cross-entropy loss, the Adam optimizer and an accuracy
    metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    print('Build model...')
    layers = [
        # Input sequences vary in length, so the timestep dimension is None.
        # input_shape is only required because the LSTM is the first layer.
        LSTM(100, input_shape=(None, 1)),
        # NOTE(review): the softmax is applied to the LSTM's 100 outputs, while
        # the one-hot targets built in this file have table_size columns per
        # character - confirm the intended output shape.
        Activation('softmax'),
    ]
    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

# Build the baseline character model and fit it on the encoded inputs and one-hot targets.
model1 = simple_char_model()
model1.fit(x=pd_X, y=one_hot_pd_Y)
