In [4]:
import unicodedata
import re
from w3lib.html import remove_tags
import pickle
import argparse
import os
import json
import csv
from tqdm import tqdm
from transformers import BartTokenizer, BartModel

In [5]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    # normalize unicode characters
    s = unicode_to_ascii(s)
    # remove the XML-tags
    s = remove_tags(s)
    # add white space before !.?
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    # change to lower letter
    s = s.lower()
    return s

def cutted_data(cleaned, MIN_LENGTH=4, MAX_LENGTH=30):
    ss = []
    for x in cleaned:
        if 30>=len(x.split(' '))>=3 :
            ss.append(x)
    return ss

def save_clean_sentences(sentence, save_path):
    pickle.dump(sentence, open(save_path, 'wb'))
    print('Saved: %s' % save_path)

def process(text_path):
    fop = open(text_path, 'r', encoding='utf8')
    raw_data = fop.read()
    sentences = raw_data.strip().split('\n')
    raw_data_input = [normalize_string(data) for data in sentences]
    raw_data_input = cutted_data(raw_data_input)
    fop.close()

    return raw_data_input

In [6]:
parser = argparse.ArgumentParser()
parser.add_argument('--input-data-dir', default='europarl/en', type=str)
parser.add_argument('--output-train-dir', default='BART/train_data.pkl', type=str)
parser.add_argument('--output-test-dir', default='BART/test_data.pkl', type=str)

    
args = parser.parse_args(args=[])
data_dir = './data/'

args.input_data_dir = data_dir + args.input_data_dir
args.output_train_dir = data_dir + args.output_train_dir
args.output_test_dir = data_dir + args.output_test_dir

sentences = []
print('Preprocess Raw Text')
for fn in tqdm(os.listdir(args.input_data_dir)):
    if not fn.endswith('.txt'): continue
    process_sentences = process(os.path.join(args.input_data_dir, fn))
    sentences += process_sentences

print('Number of sentences: {}'.format(len(sentences)))

# use BartTokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

print('Start encoding txt')
results = []
for seq in tqdm(sentences):
    tokens = tokenizer(seq)['input_ids']
    # tokens = tokenizer(seq, return_tensors="pt")['input_ids']
    results.append(tokens)

print('Writing Data')
train_data = results[: round(len(results) * 0.9)]
test_data = results[round(len(results) * 0.9):]

with open(args.output_train_dir, 'wb') as f:
    pickle.dump(train_data, f)
with open(args.output_test_dir, 'wb') as f:
    pickle.dump(test_data, f)

# spot check
for i in range(10):
    print(sentences[i])
    print(results[i])

Preprocess Raw Text


100%|██████████| 9672/9672 [02:11<00:00, 73.42it/s] 


Number of sentences: 119032
Start encoding txt


100%|██████████| 119032/119032 [00:20<00:00, 5914.17it/s]

Writing Data
resumption of the session
[0, 1535, 21236, 9, 5, 1852, 2]
 the house rose and observed a minute s silence 
[0, 5, 790, 1458, 8, 6373, 10, 2289, 579, 7308, 1437, 2]
that is precisely the time when you may if you wish raise this question i .e . on thursday prior to the start of the presentation of the report .
[0, 6025, 16, 12810, 5, 86, 77, 47, 189, 114, 47, 2813, 1693, 42, 864, 939, 479, 242, 479, 15, 3553, 46806, 2052, 7, 5, 386, 9, 5, 5209, 9, 5, 266, 479, 2]
this is all in accordance with the principles that we have always upheld .
[0, 9226, 16, 70, 11, 10753, 19, 5, 7797, 14, 52, 33, 460, 14817, 479, 2]
thank you mr segni i shall do so gladly . indeed it is quite in keeping with the positions this house has always adopted .
[0, 31653, 47, 475, 338, 842, 16993, 118, 939, 5658, 109, 98, 36811, 479, 5329, 24, 16, 1341, 11, 2396, 19, 5, 2452, 42, 790, 34, 460, 5091, 479, 2]
yes mrs schroedter i shall be pleased to look into the facts of this case when i have received your 




In [7]:
print(len(results))
max_len = 0
for i in range(len(results)):
    tmp = len(results[i])
    if tmp > max_len:
        max_len = tmp
print(max_len)

119032
69
