In [1]:
import pandas as pd
import numpy as np
import torch
import tqdm
import nltk
import json
import pickle
import os
import regex as re
import collections
from opencc import OpenCC

from pytorch_transformers import BertTokenizer

In [2]:
# Full-width / CJK punctuation mapped to its ASCII counterpart.  Every key and
# every value is a single character and no value is itself a key, so a single
# str.translate pass gives the same result as chaining the replacements.
_PUNC_TABLE = str.maketrans({
    "，": ",",
    "‘": "'",
    "’": "'",   # closing quote: previously left unmapped, unlike its opening twin
    "“": '"',
    "”": '"',
    "；": ";",
    "：": ":",
    "（": "(",
    "）": ")",
    "？": "?",
    "【": "[",
    "】": "]",
    "『": "{",
    "』": "}",
    "！": "!",
    "～": "~",
    "—": "-",
})


def replacePunc(x):
    """Replace full-width / CJK punctuation in ``x`` with ASCII equivalents.

    Args:
        x (str): text to normalise.

    Returns:
        str: ``x`` with every character listed in ``_PUNC_TABLE`` swapped for
        its ASCII counterpart; all other characters are left untouched.
    """
    return x.translate(_PUNC_TABLE)

# Negated whitelist: a run of anything that is NOT one of
#   - CJK unified ideographs (U+4E00-U+9FFF),
#   - whitespace,
#   - printable ASCII (U+0021-U+007E),
#   - the listed full-width punctuation marks
# gets deleted.
# The earlier pattern contained an unescaped ";-|", which the regex engine
# reads as the range U+003B..U+007C rather than the three literals.  That range
# silently kept "<=>?@[\]^_`{" but dropped "-" and "}".  Spelling the ASCII part
# out as one explicit range keeps everything that used to survive and adds the
# two characters that were lost by accident.
_FILTER_PATTERN = r"[^\u4e00-\u9fff\s\x21-\x7e：—！，。？、《》“”￥…（）]+"


def filtering(x):
    """Delete every character of ``x`` that is outside the allowed set.

    Args:
        x (str): text to filter.

    Returns:
        str: ``x`` with all runs of non-whitelisted characters removed (they
        are dropped, not replaced by a placeholder).
    """
    return re.sub(_FILTER_PATTERN, "", x)

def removeURL(x):
    """Delete every ``http://`` / ``https://`` URL found in ``x``.

    A match starts at a lowercase ``http://`` or ``https://`` and extends over
    the following ASCII letters, digits, ``.``, ``/`` and ``-``.  It stops at
    the first character outside that set, so anything after it (for example a
    ``?query=...`` suffix) stays in the text.

    Args:
        x (str): text to scan.

    Returns:
        str: ``x`` with all matches removed.
    """
    url_pattern = re.compile(r'https?:\/\/[A-Za-z0-9.\/\-]*')
    return url_pattern.sub('', x)

# Shared OpenCC converter built with the 't2s' configuration (OpenCC's
# Traditional-to-Simplified Chinese profile); clean_data() calls cc.convert().
cc = OpenCC('t2s')

def clean_data(x):
    """Normalise one raw text field before it is tokenized.

    Steps, in order:
      1. trim surrounding whitespace, map full-width punctuation to ASCII
         (``replacePunc``) and lowercase;
      2. delete URLs (``removeURL``);
      3. delete non-breaking spaces (U+00A0) and ordinary spaces;
      4. run the text through the shared OpenCC converter ``cc``;
      5. drop characters outside the whitelist (``filtering``);
      6. turn any remaining tab into a single space.

    Args:
        x (str): raw text.

    Returns:
        str: the cleaned text.
    """
    x = replacePunc(x.strip()).lower()
    # URLs must go while whitespace still marks where they end.  Deleting the
    # spaces first glues the ASCII text that follows a URL onto it, and
    # removeURL then erases that text too ("http://t.cn/ab iphone" lost the
    # word "iphone").  Lowercasing already happened so the scheme matches.
    x = removeURL(x)
    x = x.replace("\xa0", "").replace(" ", "")
    x = cc.convert(x)
    x = filtering(x)
    return x.replace("\t", " ")

In [3]:
# Module-level BERT tokenizer loaded by name ("bert-base-chinese");
# process() calls tokenizer.tokenize() on every cleaned text field.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

In [4]:
# Read the three dataset splits (paths are relative to the notebook's
# working directory) into DataFrames, in train / val / test order.
train_df, val_df, test_df = (
    pd.read_json("../../data/train.json"),
    pd.read_json("../../data/val.json"),
    pd.read_json("../../data/test.json"),
)

In [5]:
def process(df):
    """Clean and tokenize the three text fields of every record in ``df``.

    Args:
        df (pandas.DataFrame): must have the columns ``'summarization'``,
            ``'origin_title'`` and ``'third_title'``.  Only element ``[0]`` of
            each ``'summarization'`` entry is used (presumably a list of
            candidate summaries -- confirm against the data files).

    Returns:
        list[tuple]: one ``(summary, origin_title, third_title)`` tuple per
        row, in row order; each member is whatever ``tokenizer.tokenize``
        returns for the cleaned text.
    """
    # tqdm.tqdm_notebook is deprecated; tqdm.auto selects the notebook widget
    # or the console bar as appropriate.  Imported here because a bare
    # `import tqdm` does not load the `tqdm.auto` submodule.
    from tqdm.auto import tqdm as progress_bar

    def prepare(text):
        # Clean first, then tokenize -- same per-field pipeline for all three.
        return tokenizer.tokenize(clean_data(text))

    # Walk the three columns in lockstep instead of building a full row
    # Series with df.iloc[idx, :] three times for every record.
    rows = zip(df['summarization'], df['origin_title'], df['third_title'])

    data = []
    for summaries, origin_title, third_title in progress_bar(rows, total=len(df)):
        data.append((
            prepare(summaries[0]),
            prepare(origin_title),
            prepare(third_title),
        ))
    return data

In [6]:
# Run the clean-and-tokenize pass over each split, in train / val / test order.
train_data, val_data, test_data = map(process, (train_df, val_df, test_df))

HBox(children=(IntProgress(value=0, max=22908), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1500), HTML(value='')))




In [7]:
# Write each tokenized split to the current working directory.  The files are
# produced by torch.save, so read them back with torch.load.
for split_data, out_path in (
    (train_data, "train_tokenized.pkl"),
    (val_data, "val_tokenized.pkl"),
    (test_data, "test_tokenized.pkl"),
):
    torch.save(split_data, out_path)