In [None]:
import os
import re
import json
import jieba
import pathlib
from os import path 

import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

from multiprocessing import Pool

# 0. 配置等相关

### 0.1 基本路径、训练参数等配置

In [None]:
# Path configuration
# =================== Paths ===================
# Base directory of the read-only input datasets
in_base_dir = '/kaggle/input'
train_data_path = path.join(in_base_dir, 'text-summary', 'train.csv')
test_data_path = path.join(in_base_dir, 'text-summary', 'test.csv')

stopwords_path = path.join(in_base_dir, 'english-and-chinese-stopwords', 'stopwords.txt')

# Base directory for everything this notebook writes
out_base_dir = '/kaggle/working'
model_dir = path.join(out_base_dir, 'model')
data_dir = path.join(out_base_dir, 'data')
result_dir = path.join(out_base_dir, 'results')

# makedirs(exist_ok=True) creates missing parents and is safe to re-run,
# unlike the check-then-mkdir pattern it replaces.
for path_dir in [model_dir, data_dir, result_dir]:
    os.makedirs(path_dir, exist_ok=True)


# Output paths of the preprocessed (cleaned and segmented) train/test data
train_seg_path = path.join(data_dir, 'train_seg_data.csv')
test_seg_path = path.join(data_dir, 'test_seg_data.csv')


# Train and test text merged into one file; this is the corpus used to train the word vectors
merged_seg_path = path.join(data_dir, 'merged_train_test_seg_data.csv')


# Saved word2vec model
save_w2v_model_path = path.join(model_dir, 'word2vector', 'word2vec.model')
# Saved word-vector matrix (np.save appends the '.npy' suffix)
embedding_matrix_path = path.join(model_dir, 'word2vector', 'embedding_matrix')


# Vocabulary of the word-vector model, stored in both directions as JSON
vocab_index_to_key_path = path.join(model_dir, 'word2vector', 'vocab_index_to_key.json')
vocab_key_to_index_path = path.join(model_dir, 'word2vector', 'vocab_key_to_index.json')


# Segmented train/test inputs (x) and labels (y)
train_x_seg_path = path.join(data_dir, 'train_x_seg_data.csv')
train_y_seg_path = path.join(data_dir, 'train_y_seg_data.csv')
test_x_seg_path = path.join(data_dir, 'test_x_seg_data.csv')
test_y_seg_path = path.join(data_dir, 'test_y_seg_data.csv')


# Train/test inputs and labels after start/stop/unknown/pad tokens were applied
train_x_pad_path = path.join(data_dir, 'train_x_pad_data.csv')
train_y_pad_path = path.join(data_dir, 'train_y_pad_data.csv')
test_x_pad_path = path.join(data_dir, 'test_x_pad_data.csv')
test_y_pad_path = path.join(data_dir, 'test_y_pad_data.csv')


# Train/test inputs and labels converted to vocabulary indices (saved with np.save)
train_x_path = path.join(data_dir, 'train_X')
train_y_path = path.join(data_dir, 'train_Y')
test_x_path = path.join(data_dir, 'test_X')
test_y_path = path.join(data_dir, 'test_Y')


# Directory for training result files (same as model_dir)
default_checkpoint_dir = model_dir
# test_save_dir = path.join(results_dir, 'demotest')

# =================== Parameters ===================
# Word-vector settings: vector size and number of word2vec training epochs
embedding_dim = 100
word2vec_train_epochs = 5

# Training settings.
# NOTE(review): batch_size, epochs, max_enc_len, max_dec_len, vocab_size,
# beams_size and sample_total are not referenced by the cells visible here;
# presumably they are consumed by later training notebooks - confirm.
batch_size = 8
epochs = 10

max_enc_len = 400
max_dec_len = 100

vocab_size = 30000

beams_size = batch_size

# Sample count - presumably the number of training rows; TODO confirm
sample_total = 82871

# Multiprocessing: number of worker processes for the Pool and of word2vec workers
cpu_cores = 4 * 2

### 0.2 开启模块设置

# 1. Vocab类，处理vocab等相关内容

In [None]:
def load_embedding_matrix(file_path=embedding_matrix_path, max_vocab_size=102400):
    """Load the saved word-vector matrix with zero rows reserved for the special tokens.

    :param file_path: path of the saved matrix without the ``.npy`` suffix
    :param max_vocab_size: maximum number of rows kept in the result
    :return: ``Vocab.MASKS_COUNT`` all-zero rows followed by the stored vectors,
             truncated to ``max_vocab_size`` rows
    """
    matrix_file = file_path + '.npy'
    word_vectors = np.load(matrix_file)

    # zeros_like on the leading slice keeps the dtype and width of the stored vectors
    mask_rows = np.zeros_like(word_vectors[:Vocab.MASKS_COUNT])

    stacked = np.concatenate((mask_rows, word_vectors))
    return stacked[:max_vocab_size]


def load_word2vec_model():
    """Load and return the word2vec model stored at ``save_w2v_model_path``."""
    model = Word2Vec.load(save_w2v_model_path)
    return model


class Vocab:
    """Two-way mapping between words and integer ids, with reserved special tokens.

    The special tokens in ``MASKS`` always occupy ids ``0 .. MASKS_COUNT - 1``;
    every word read from the vocabulary file is shifted by ``MASKS_COUNT``.
    """

    PAD_TOKEN = '<PAD>'
    UNKNOWN_TOKEN = '<UNK>'
    START_DECODING = '<START>'
    STOP_DECODING = '<STOP>'

    # Order matters: the position in this list is the id of the token.
    MASKS = [PAD_TOKEN, UNKNOWN_TOKEN, START_DECODING, STOP_DECODING]
    MASKS_COUNT = len(MASKS)

    PAD_TOKEN_INDEX = MASKS.index(PAD_TOKEN)
    UNKNOWN_TOKEN_INDEX = MASKS.index(UNKNOWN_TOKEN)
    START_DECODING_INDEX = MASKS.index(START_DECODING)
    STOP_DECODING_INDEX = MASKS.index(STOP_DECODING)

    def __init__(self, vocab_file=vocab_key_to_index_path, vocab_max_size=None):
        """Build the vocabulary from a ``{word: index}`` JSON file.

        :param vocab_file: path of the JSON file mapping each word to its index
        :param vocab_max_size: optional cap on the number of words read from the file
        """
        self.word2index, self.index2word = self.load_vocab(vocab_file, vocab_max_size)
        self.count = len(self.word2index)

    @staticmethod
    def load_vocab(file_path, vocab_max_size=None):
        """Read a ``{word: index}`` JSON file and return ``(word2index, index2word)``.

        :param file_path: path of the JSON vocabulary file
        :param vocab_max_size: optional cap on the number of (non-special) words kept
        :return: two dicts; special tokens hold ids ``0 .. MASKS_COUNT - 1`` and
                 every file entry is stored under ``index + MASKS_COUNT``
        """
        word2index = {mask: index for index, mask in enumerate(Vocab.MASKS)}
        index2word = {index: mask for index, mask in enumerate(Vocab.MASKS)}

        # Close the file deterministically instead of leaking the handle.
        with open(file_path, 'r', encoding='utf-8') as vocab_file:
            saved_vocab = json.load(vocab_file)

        # Skip special tokens that were saved with the vocabulary by name rather
        # than by dropping the last four entries: the positional slice discards
        # real words whenever the special tokens are not exactly the last four.
        entries = [(word, index) for word, index in saved_vocab.items() if word not in word2index]
        if vocab_max_size is not None:
            entries = entries[:vocab_max_size]

        for word, index in entries:
            word2index[word] = index + Vocab.MASKS_COUNT
            index2word[index + Vocab.MASKS_COUNT] = word

        return word2index, index2word

    def word_to_index(self, word):
        """Return the id of ``word``, or the id of the unknown token when it is not in the vocabulary."""
        return self.word2index.get(word, self.word2index[self.UNKNOWN_TOKEN])

    def index_to_word(self, word_index):
        """Return the word stored under ``word_index``.

        :raises AssertionError: if ``word_index`` is not a known id
        """
        assert word_index in self.index2word, f'word index [{word_index}] not found in vocab'

        return self.index2word[word_index]

    def size(self):
        """Return the number of entries in the vocabulary, special tokens included."""
        return self.count

# 2. 数据预处理，构造数据集

### 2.1 数据预处理及数据保存相关组件

In [None]:
# 数据预处理
def transform_data(sentence, vocab):
    """Convert a whitespace-separated sentence into a list of vocabulary ids.

    :param sentence: text whose tokens are separated by whitespace
    :param vocab: object exposing ``word2index`` and ``UNKNOWN_TOKEN_INDEX``
    :return: list of ids; tokens missing from ``vocab.word2index`` map to
             ``vocab.UNKNOWN_TOKEN_INDEX``
    """
    lookup = vocab.word2index
    indices = []
    for token in sentence.split():
        if token in lookup:
            indices.append(lookup[token])
        else:
            # Unknown token: fall back to the id of the unknown marker
            indices.append(vocab.UNKNOWN_TOKEN_INDEX)

    return indices

def pad_proc(sentence, max_len, vocab):
    """Wrap a sentence in start/stop markers, replace unknown words and pad it.

    :param sentence: whitespace-separated text
    :param max_len: maximum number of words kept from ``sentence``
    :param vocab: container of known words (anything supporting ``in``)
    :return: space-joined tokens: ``<START>``, up to ``max_len`` words (unknown
             ones replaced by ``<UNK>``), ``<STOP>``, then ``<PAD>`` once for
             every word missing to reach ``max_len``
    """
    # Split on whitespace and truncate to the allowed length
    tokens = sentence.strip().split()[:max_len]

    padded = [Vocab.START_DECODING]
    for token in tokens:
        # Words outside the vocabulary become the unknown marker
        padded.append(token if token in vocab else Vocab.UNKNOWN_TOKEN)
    padded.append(Vocab.STOP_DECODING)

    # Align the length: one pad token per word missing to reach max_len
    padded.extend([Vocab.PAD_TOKEN] * (max_len - len(tokens)))

    return ' '.join(padded)


def get_max_len(dataframe):
    """Pick a sequence length that covers most rows: mean + 2 * std of the token counts.

    Tokens are counted with ``str.split()`` so leading, trailing or repeated
    spaces do not inflate the length (counting space characters did).

    :param dataframe: Series of whitespace-separated texts, e.g. ``train_df['Question']``
    :return: ``int(mean + 2 * population std)`` of the per-row token counts,
             or 0 when there are no rows
    """
    token_counts = np.asarray(dataframe.apply(lambda text: len(text.split())))

    # Without rows the mean is undefined (nan) and int() would raise
    if token_counts.size == 0:
        return 0

    return int(np.mean(token_counts) + 2 * np.std(token_counts))


def load_stopwords(file_path):
    """Read a stop-word file (one word per line, UTF-8) into a set.

    A set is returned instead of a one-shot generator: a generator is exhausted
    by the first ``in`` test, after which every word looks like a non-stop-word.

    :param file_path: path of the stop-word file
    :return: set of stripped, non-empty stop words
    """
    with open(file_path, 'r', encoding='utf-8') as stopword_file:
        stripped_lines = (line.strip() for line in stopword_file)
        return {word for word in stripped_lines if word}


def clean_sent(sent):
    """Keep only the characters in the range U+4E00..U+9FA5 and drop everything else.

    :param sent: input string
    :return: ``sent`` with every character outside that range removed
    """
    kept_chars = re.findall(r'[\u4e00-\u9fa5]', sent)
    return ''.join(kept_chars)

# Stop words per file path, so the file is read once per process instead of once per sentence
_STOPWORDS_CACHE = {}


def _cached_stopwords(file_path):
    """Return the stop words of ``file_path`` as a frozenset, loading them on first use only."""
    if file_path not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[file_path] = frozenset(load_stopwords(file_path=file_path))
    return _STOPWORDS_CACHE[file_path]


def seg_words(sent):
    """Segment a sentence with jieba and drop noise words and stop words.

    The stop words are materialised into a set: testing membership against the
    generator returned by ``load_stopwords`` exhausted it on the first token,
    so later tokens were never filtered.

    :param sent: sentence to segment
    :return: the remaining words joined by single spaces
    """
    # Filter 1: fixed noise tokens
    remove_words = {'|', '[', ']', '语音', '图片', '你好', '您好'}
    # Filter 2: stop words from the configured file
    stop_words = _cached_stopwords(stopwords_path)

    kept_words = [
        word for word in jieba.cut(sent)
        if word and word not in remove_words and word not in stop_words
    ]

    return ' '.join(kept_words)

def sentence_proc(sentence):
    """Clean and segment every ``|``-separated part of a text.

    :param sentence: raw text whose parts are separated by ``|``
    :return: the processed parts joined by single spaces
    """
    processed_parts = []
    for part in sentence.split('|'):
        # 1. strip unwanted characters, 2. segment into words
        processed_parts.append(seg_words(clean_sent(part)))

    # Re-assemble the processed parts into one string
    return ' '.join(processed_parts)

def save_to_csv(dataframe, save_path=path.join(data_dir, 'result.csv'), index=False):
    """Write a DataFrame or Series to CSV using the ``utf_8_sig`` encoding.

    :param dataframe: ``pd.DataFrame`` or ``pd.Series`` to store
    :param save_path: target file path
    :param index: forwarded to ``to_csv`` (whether the index is written)
    :raises AssertionError: if ``dataframe`` is neither a DataFrame nor a Series
    """
    accepted_types = (pd.DataFrame, pd.Series)
    assert isinstance(dataframe, accepted_types), 'Error type .'

    dataframe.to_csv(save_path, index=index, encoding='utf_8_sig')

def sentences_proc(dataframe):
    """Apply ``sentence_proc`` to every text column that exists in the frame.

    The frame is modified in place and also returned.

    :param dataframe: frame that may contain the columns Brand, Model,
                      Question, Dialogue and Report
    :return: the same frame with the present text columns processed
    """
    text_columns = ('Brand', 'Model', 'Question', 'Dialogue', 'Report')
    present_columns = [name for name in text_columns if name in dataframe.columns]

    for name in present_columns:
        dataframe[name] = dataframe[name].apply(sentence_proc)

    return dataframe
    
def multi_process_csv(dataframe, func):
    """Process a DataFrame in ``cpu_cores`` chunks with a process pool.

    :param dataframe: frame to process
    :param func: picklable callable that takes a chunk and returns a DataFrame
    :return: concatenation of the processed chunks, in the original row order
    """
    # Split the row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes.
    row_groups = np.array_split(np.arange(len(dataframe)), cpu_cores)
    chunks = [dataframe.iloc[rows] for rows in row_groups]

    # pool.map blocks until every chunk is done and the context manager shuts the
    # pool down on exit, so no explicit close()/join() is needed afterwards.
    with Pool(processes=cpu_cores) as pool:
        return pd.concat(pool.map(func, chunks))
    

def pre_process(csv_file_path):
    """Read a CSV file, clean missing/duplicate rows and process its text columns.

    :param csv_file_path: path of the CSV file to read
    :return: frame without rows lacking a ``Report``, with remaining missing
             values replaced by ``''``, duplicates removed, and the text
             columns processed by ``sentences_proc`` via ``multi_process_csv``
    """
    # 0. Read the data
    raw_df = pd.read_csv(csv_file_path)
    print(f"data size: {len(raw_df)}")

    # 1. Missing values and duplicates
    cleaned_df = (
        raw_df
        .dropna(subset=['Report'])
        .fillna('')
        .drop_duplicates(keep='first')
    )

    # 2. Sentence processing
    return multi_process_csv(cleaned_df, func=sentences_proc)


def get_processed_data(train_data_path, test_data_path, is_to_save):
    """Pre-process the train and test CSV files.

    :param train_data_path: path of the training data
    :param test_data_path: path of the test data
    :param is_to_save: when truthy, write the processed frames to
                       ``train_seg_path`` and ``test_seg_path``
    :return: ``(train_df, test_df)``, the processed frames
    """
    train_df = pre_process(train_data_path)
    test_df = pre_process(test_data_path)
    processed = (train_df, test_df)

    if not is_to_save:
        return processed

    # Persist the processed data
    save_to_csv(train_df, save_path=train_seg_path)
    save_to_csv(test_df, save_path=test_seg_path)

    return processed


def get_merged_data(train_df, test_df, is_to_save=True):
    """Join Question, Dialogue and Report of all train and test rows into one text per row.

    :param train_df: processed training frame
    :param test_df: processed test frame
    :param is_to_save: when truthy, write the merged texts to ``merged_seg_path``
    :return: Series named ``merged`` holding the space-joined text of every row
    """
    text_columns = ['Question', 'Dialogue', 'Report']

    merged_text = (
        pd.concat([train_df, test_df])[text_columns]
        .assign(merged=lambda frame: frame.apply(' '.join, axis=1))
        ['merged']
    )

    if is_to_save:
        save_to_csv(merged_text, save_path=merged_seg_path)

    return merged_text

def get_train_test_split(train_df, test_df, w2v_model):
    """Build padded, index-encoded inputs/labels and refresh the word-vector artefacts.

    Side effects: adds the columns ``X`` and ``Y`` to both frames in place,
    writes the segmented and padded texts, retrains and saves ``w2v_model``,
    dumps its vocabulary (JSON) and vector matrix, and saves the index arrays.

    :param train_df: processed training frame with Question, Dialogue and Report
    :param test_df: processed test frame with Question, Dialogue and Report
    :param w2v_model: word2vec model trained on the merged corpus
    :return: ``(train_x, train_y, test_x, test_y)`` as numpy arrays of vocabulary ids
    """
    feature_columns = ['Question', 'Dialogue']

    # 1. Input X is question + dialogue; the label is the Report column.
    train_df['X'] = train_df[feature_columns].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[feature_columns].apply(lambda x: ' '.join(x), axis=1)

    train_df['X'].to_csv(train_x_seg_path, index=False, header=False)
    # Fix: the test inputs were written to test_y_seg_path and then overwritten
    # by the labels below, so the segmented test inputs were never stored.
    test_df['X'].to_csv(test_x_seg_path, index=False, header=False)

    train_df['Report'].to_csv(train_y_seg_path, index=False, header=False)
    test_df['Report'].to_csv(test_y_seg_path, index=False, header=False)

    # 2. Add start/stop markers, replace unknown words and pad.
    # A set makes the per-token membership test in pad_proc O(1); the model's
    # index_to_key list was scanned linearly for every token.
    known_words = set(w2v_model.wv.index_to_key)

    x_max_len = max(get_max_len(train_df['X']), get_max_len(test_df['X']))
    train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, x_max_len, known_words))
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, x_max_len, known_words))

    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, known_words))

    # NOTE(review): the test label length is derived from the TRAINING reports,
    # exactly as before; confirm whether test_df['Report'] was intended.
    test_y_max_len = get_max_len(train_df['Report'])
    test_df['Y'] = test_df['Report'].apply(lambda x: pad_proc(x, test_y_max_len, known_words))

    # Store the padded texts
    train_df['X'].to_csv(train_x_pad_path, index=False, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=False, header=False)

    test_df['X'].to_csv(test_x_pad_path, index=False, header=False)
    test_df['Y'].to_csv(test_y_pad_path, index=False, header=False)

    # 3. Retrain the word vectors on the padded texts so the marker tokens get vectors.
    print('start retrain word2vec model')
    for corpus_path in (train_x_pad_path, train_y_pad_path, test_x_pad_path):
        w2v_model.build_vocab(LineSentence(corpus_path), update=True)
        w2v_model.train(
            LineSentence(corpus_path),
            epochs=word2vec_train_epochs,
            total_examples=w2v_model.corpus_count,
        )

    # Save the retrained model; makedirs also creates missing parents and is re-run safe
    os.makedirs(path.dirname(save_w2v_model_path), exist_ok=True)
    w2v_model.save(save_w2v_model_path)
    print('finish retrain word2vec model .')

    # 4. Persist the updated vocabulary and the vector matrix.
    vocab = w2v_model.wv.index_to_key
    print(f'final w2v_model has vocabulary length: {len(vocab)}')

    vocab_key_to_index = w2v_model.wv.key_to_index
    vocab_index_to_key = {index: key for key, index in vocab_key_to_index.items()}
    # Context managers guarantee the JSON is flushed and closed before Vocab() reads it
    with open(vocab_key_to_index_path, 'w', encoding='utf-8') as key_to_index_file:
        json.dump(vocab_key_to_index, fp=key_to_index_file, ensure_ascii=False)
    with open(vocab_index_to_key_path, 'w', encoding='utf-8') as index_to_key_file:
        json.dump(vocab_index_to_key, fp=index_to_key_file, ensure_ascii=False)

    embedding_matrix = w2v_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)

    # 5. Convert words to ids: [<START> word ...] -> [2, 403, ...]
    vocab = Vocab()
    train_idx_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_idx_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))

    # Fix: the test arrays were built from train_df, i.e. they were copies of the training data
    test_idx_x = test_df['X'].apply(lambda x: transform_data(x, vocab))
    test_idx_y = test_df['Y'].apply(lambda x: transform_data(x, vocab))

    # Every row of a column has the same length after padding, so these are 2-D arrays
    train_x = np.array(train_idx_x.tolist())
    train_y = np.array(train_idx_y.tolist())

    test_x = np.array(test_idx_x.tolist())
    test_y = np.array(test_idx_y.tolist())

    # 6. Save the index arrays
    np.save(train_x_path, train_x)
    np.save(train_y_path, train_y)

    np.save(test_x_path, test_x)
    np.save(test_y_path, test_y)

    return train_x, train_y, test_x, test_y

def train_word2vec(file_path=merged_seg_path):
    """Train a skip-gram word2vec model on a one-sentence-per-line text file.

    :param file_path: path of the corpus file read through ``LineSentence``
    :return: the trained ``Word2Vec`` model
    """
    training_options = {
        'vector_size': embedding_dim,
        'sg': 1,
        'workers': cpu_cores,
        'window': 5,
        'min_count': 5,
        'epochs': word2vec_train_epochs,
    }
    corpus = LineSentence(source=file_path)

    return Word2Vec(corpus, **training_options)

### 2.2 数据预处理及数据构造调度函数

In [None]:
def build_data(train_data_path, test_data_path):
    """Run the whole data pipeline: pre-process, merge, train word2vec, build X/y.

    :param train_data_path: path of the raw training CSV
    :param test_data_path: path of the raw test CSV
    """
    # 1. Pre-process train/test data and store the processed files
    train_frame, test_frame = get_processed_data(train_data_path, test_data_path, is_to_save=True)

    # 2. Write the merged train/test corpus used for word2vec training
    get_merged_data(train_frame, test_frame, is_to_save=True)

    # 3. Train the word vectors on the merged corpus
    word_vectors = train_word2vec(file_path=merged_seg_path)
    print(word_vectors)

    # 4. Build and store the train/test X and y
    get_train_test_split(train_frame, test_frame, word_vectors)

if __name__ == '__main__':
    # `flag_build_data` is not assigned in any cell of this notebook, so reading it
    # directly raises NameError on a fresh kernel. Look it up defensively and build
    # by default; define `flag_build_data = False` in an earlier cell to skip the build.
    if globals().get('flag_build_data', True):
        build_data(train_data_path, test_data_path)
        print('数据集、词向量等构造完成')

In [None]:
# List the word2vector artefacts and the generated data files under /kaggle/working.
!ls /kaggle/working/model/word2vector/
!echo "======="
!ls /kaggle/working/data


In [None]:
# List the `data` folder of the attached input `notebook3682a557ea`
# (presumably the saved output of an earlier run of this notebook - TODO confirm).
!ls /kaggle/input/notebook3682a557ea/data