In [4]:
import unicodedata
import re
from w3lib.html import remove_tags
import pickle
import argparse
import os
import json
import csv
from tqdm import tqdm
from transformers import BertTokenizer, BartTokenizer

In [5]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    # normalize unicode characters
    s = unicode_to_ascii(s)
    # remove the XML-tags
    s = remove_tags(s)
    # add white space before !.?
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    # change to lower letter
    s = s.lower()
    return s

def cutted_data(cleaned, MIN_LENGTH=4, MAX_LENGTH=30):
    ss = []
    for x in cleaned:
        if 30>=len(x.split(' '))>=3 :
            ss.append(x)
    return ss

def save_clean_sentences(sentence, save_path):
    pickle.dump(sentence, open(save_path, 'wb'))
    print('Saved: %s' % save_path)

def process(text_path):
    fop = open(text_path, 'r', encoding='utf8')
    raw_data = fop.read()
    sentences = raw_data.strip().split('\n')
    raw_data_input = [normalize_string(data) for data in sentences]
    raw_data_input = cutted_data(raw_data_input)
    fop.close()

    return raw_data_input

In [6]:
parser = argparse.ArgumentParser()
parser.add_argument('--input-data-dir', default='europarl/en', type=str)
parser.add_argument('--output-train-dir', default='BERT&BART/train_data.pkl', type=str)
parser.add_argument('--output-test-dir', default='BERT&BART/test_data.pkl', type=str)

    
args = parser.parse_args(args=[])
data_dir = '../data/'

args.input_data_dir = data_dir + args.input_data_dir
args.output_train_dir = data_dir + args.output_train_dir
args.output_test_dir = data_dir + args.output_test_dir

sentences = []
print('Preprocess Raw Text')
for fn in tqdm(os.listdir(args.input_data_dir)):
    if not fn.endswith('.txt'): continue
    process_sentences = process(os.path.join(args.input_data_dir, fn))
    sentences += process_sentences

print('Number of sentences: {}'.format(len(sentences)))

# Tokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

print('Start encoding txt')
results = []
for seq in tqdm(sentences):
    bert_tokens = bert_tokenizer(seq)['input_ids']
    bart_tokens = bart_tokenizer(seq)['input_ids']
    results.append([bert_tokens, bart_tokens])

print('Writing Data')
train_data = results[: round(len(results) * 0.9)]
test_data = results[round(len(results) * 0.9):]

with open(args.output_train_dir, 'wb') as f:
    pickle.dump(train_data, f)
with open(args.output_test_dir, 'wb') as f:
    pickle.dump(test_data, f)

# spot check
for i in range(10):
    print(sentences[i])
    print(results[i])

Preprocess Raw Text


100%|██████████| 9672/9672 [01:42<00:00, 94.67it/s] 


Number of sentences: 119032
Start encoding txt


100%|██████████| 119032/119032 [01:04<00:00, 1856.17it/s]


Writing Data
resumption of the session
[[101, 24501, 24237, 3508, 1997, 1996, 5219, 102], [0, 1535, 21236, 9, 5, 1852, 2]]
 the house rose and observed a minute s silence 
[[101, 1996, 2160, 3123, 1998, 5159, 1037, 3371, 1055, 4223, 102], [0, 5, 790, 1458, 8, 6373, 10, 2289, 579, 7308, 1437, 2]]
that is precisely the time when you may if you wish raise this question i .e . on thursday prior to the start of the presentation of the report .
[[101, 2008, 2003, 10785, 1996, 2051, 2043, 2017, 2089, 2065, 2017, 4299, 5333, 2023, 3160, 1045, 1012, 1041, 1012, 2006, 9432, 3188, 2000, 1996, 2707, 1997, 1996, 8312, 1997, 1996, 3189, 1012, 102], [0, 6025, 16, 12810, 5, 86, 77, 47, 189, 114, 47, 2813, 1693, 42, 864, 939, 479, 242, 479, 15, 3553, 46806, 2052, 7, 5, 386, 9, 5, 5209, 9, 5, 266, 479, 2]]
this is all in accordance with the principles that we have always upheld .
[[101, 2023, 2003, 2035, 1999, 10388, 2007, 1996, 6481, 2008, 2057, 2031, 2467, 16813, 1012, 102], [0, 9226, 16, 70, 11, 1075

In [7]:
inputs = bert_tokenizer("[CLS][PAD][UNK][SEP]", return_tensors="pt")
print(inputs)

{'input_ids': tensor([[101, 101,   0, 100, 102, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [21]:
print(len(results))
max_len_bert = max(len(results[i][0]) for i in range(len(results)))
max_len_bart = max(len(results[i][1]) for i in range(len(results)))

print('max token len of bert: ', max_len_bert)
print('max token len of bart: ', max_len_bart)

119032
max token len of bert:  65
max token len of bart:  69
