In [1]:
import pandas as pd
import numpy as np
import random
import torch
import tqdm
import nltk
import json
import pickle
import os
import regex as re
import collections
from opencc import OpenCC

from transformers import BertTokenizer

In [2]:
# Seed NumPy's global generator so the index shuffle performed later in the
# notebook is reproducible on a fresh "Restart & Run All".
SEED = 123
np.random.seed(SEED)

In [3]:
# Full-width / typographic punctuation mapped to its ASCII counterpart.
# Every key is a single character, so one translate() pass is equivalent to
# the chain of str.replace() calls it supersedes.
_PUNC_TABLE = str.maketrans({
    "，": ",",
    "‘": "'",
    "’": "'",  # closing single quote; it used to be left unmapped
    "“": '"',
    "”": '"',
    "；": ";",
    "：": ":",
    "（": "(",
    "）": ")",
    "？": "?",
    "【": "[",
    "】": "]",
    "『": "{",
    "』": "}",
    "！": "!",
    "～": "~",
    "—": "-",
})


def replacePunc(x):
    """Replace full-width and typographic punctuation with ASCII equivalents.

    Both opening and closing curly quotes (single and double) are mapped to
    their straight ASCII forms. Characters without an entry in the table are
    returned unchanged.

    Args:
        x: input string.

    Returns:
        A new string with the mapped punctuation replaced character by
        character.
    """
    return x.translate(_PUNC_TABLE)

# Matches runs of characters that are NOT on the allow-list below.
# The earlier pattern contained `;-|`, which a character class reads as the
# range U+003B..U+007C rather than three literals. That silently dropped the
# ASCII hyphen and `}` while keeping `{`. The class is now spelled out
# explicitly: everything the old class let through is still allowed, and `-`
# and `}` are added.
_DISALLOWED_RUN = re.compile(
    r"[^"
    r"\u4e00-\u9fff"                # characters in U+4E00..U+9FFF
    r"0-9A-Za-z\s"                  # ASCII alphanumerics and whitespace
    r"+.!/_,:;|$%\^*()\"'~@#&"      # ASCII punctuation listed explicitly before
    r"<=>?\[\\\]`{"                 # ASCII kept via the old `;-|` range
    r"\-}"                          # previously stripped by mistake
    r"：—！，。？、《》“”￥…（）"       # full-width / CJK punctuation
    r"]+"
)


def filtering(x):
    """Delete every character that is not on the allow-list.

    Kept: characters in U+4E00..U+9FFF, ASCII letters and digits, whitespace,
    and the ASCII / full-width punctuation enumerated in `_DISALLOWED_RUN`.
    Everything else (emoji, symbols, other scripts, ...) is removed.

    Args:
        x: input string.

    Returns:
        `x` with all disallowed characters removed.
    """
    return _DISALLOWED_RUN.sub("", x)

def removeURL(x):
    """Strip http:// and https:// URLs from a string.

    A URL is the scheme followed by any run of ASCII letters, digits, '.',
    '/' and '-'. Matching stops at the first other character, so anything
    after it (for example a '?query' tail) is left in the text.

    Args:
        x: input string.

    Returns:
        `x` with every matched URL replaced by the empty string.
    """
    url_pattern = r'https?:\/\/[A-Za-z0-9.\/\-]*'
    without_urls = re.sub(url_pattern, '', x)
    return without_urls

# Shared OpenCC converter built from the 't2s' configuration; clean_data()
# calls cc.convert() on every text it processes.
# NOTE(review): 't2s' is presumably OpenCC's Traditional-to-Simplified Chinese
# profile -- confirm against the OpenCC documentation.
cc = OpenCC('t2s')

def clean_data(x):
    """Normalize one raw text field before tokenization.

    Steps, in order:
      1. strip leading/trailing whitespace and normalize punctuation with
         replacePunc();
      2. delete non-breaking spaces (\\xa0) and ordinary spaces;
      3. lowercase;
      4. run the text through the module-level OpenCC converter `cc`;
      5. drop disallowed characters with filtering();
      6. remove URLs with removeURL();
      7. turn each remaining tab into a single space.

    Args:
        x: raw input string.

    Returns:
        The cleaned string.
    """
    text = replacePunc(x.strip())
    text = text.replace("\xa0", "")
    text = text.replace(" ", "")
    text = text.lower()
    text = cc.convert(text)
    text = filtering(text)
    text = removeURL(text)
    # Spaces were removed above, so the only spaces in the result come from tabs.
    return text.replace("\t", " ")

## Split into train, val, and test


In [4]:
# Base directory for the dataset: the raw phed_data.jsonl is read from here and
# the split / tokenized files are written back to the same place.
data_path = "../data/PHED/"

In [5]:
# Read the raw dataset, one JSON record per line.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which mis-decodes (or fails on) Chinese text on
# systems whose default is not UTF-8.
# NOTE(review): assumes phed_data.jsonl is UTF-8, the standard JSON encoding.
with open(os.path.join(data_path, "phed_data.jsonl"), encoding="utf-8") as f:
    data = f.readlines()

In [6]:
# Shuffle the record indices with a dedicated, locally seeded generator.
# Relying on the global NumPy generator made this cell depend on hidden state:
# re-running it without re-running the seeding cell produced a different
# permutation, and therefore a different train/val/test split.
# RandomState(123) uses the same seed as the global np.random.seed(123) call
# at the top of the notebook, so the permutation matches what a fresh
# "Run All" produced before (provided nothing else drew from the global
# generator in between).
split_rng = np.random.RandomState(123)
indices = np.arange(len(data))
split_rng.shuffle(indices)

In [7]:
# Partition the shuffled records into validation, test and training files.
VAL_SIZE = 1500   # first VAL_SIZE shuffled records -> val.jsonl
TEST_SIZE = 1500  # next TEST_SIZE shuffled records -> test.jsonl

# Insertion order is preserved, so the files are written val, test, train.
split_indices = {
    "val.jsonl": indices[:VAL_SIZE],
    "test.jsonl": indices[VAL_SIZE:VAL_SIZE + TEST_SIZE],
    "train.jsonl": indices[VAL_SIZE + TEST_SIZE:],  # everything that is left
}

for split_file, split_idx in split_indices.items():
    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(os.path.join(data_path, split_file), "w", encoding="utf-8") as f:
        for idx in split_idx:
            record = data[idx]
            # The last line of the source file may lack a trailing newline;
            # once shuffled into the middle of a split it would be glued to
            # the following record, so terminate every record explicitly.
            f.write(record if record.endswith("\n") else record + "\n")


## Tokenization

In [8]:
# Pretrained tokenizer loaded under the "bert-base-chinese" checkpoint name;
# process() uses it to split each cleaned text into tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

In [9]:
def process(filename):
    """Load a JSON Lines split and clean + tokenize its three text fields.

    Each non-blank line must be a JSON object with the keys 'summary',
    'original_headline' and 'edited_headline'. Every field is passed through
    clean_data() and then tokenizer.tokenize().

    Args:
        filename: path to the .jsonl file to read.

    Returns:
        A list with one (summary, original, edited) tuple per record, where
        each element is the token sequence returned by tokenizer.tokenize().

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record is missing one of the three expected keys.
    """
    # Explicit UTF-8: the default encoding of open() is locale-dependent and
    # would mis-decode Chinese text on non-UTF-8 platforms.
    with open(filename, encoding="utf-8") as f:
        raw_data = f.readlines()

    # Order here fixes the order of the elements in each returned tuple.
    fields = ('summary', 'original_headline', 'edited_headline')

    data = []
    for line in tqdm.tqdm_notebook(raw_data):
        # A stray blank line (e.g. at end of file) is not a record; skip it
        # instead of letting json.loads() raise.
        if not line.strip():
            continue
        record = json.loads(line)
        data.append(tuple(
            tokenizer.tokenize(clean_data(record[key])) for key in fields
        ))
    return data

In [10]:
# Clean and tokenize each split file; the three calls run in the order
# train, val, test, so the progress bars appear in that order too.
train_file = os.path.join(data_path, "train.jsonl")
val_file = os.path.join(data_path, "val.jsonl")
test_file = os.path.join(data_path, "test.jsonl")

train_data = process(train_file)
val_data = process(val_file)
test_data = process(test_file)

HBox(children=(IntProgress(value=0, max=22996), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1500), HTML(value='')))




In [11]:
# Persist the tokenized splits next to the raw data with torch.save().
# Insertion order is preserved, so the files are written train, val, test.
tokenized_outputs = {
    "train_tokenized.pkl": train_data,
    "val_tokenized.pkl": val_data,
    "test_tokenized.pkl": test_data,
}
for out_name, split_tokens in tokenized_outputs.items():
    torch.save(split_tokens, os.path.join(data_path, out_name))