In [None]:
import unicodedata
import re
from w3lib.html import remove_tags
import pickle
import argparse
import os
import json
from tqdm import tqdm
from transformers import BartTokenizer

parser = argparse.ArgumentParser()
parser.add_argument('--input-data-dir', default='europarl/en', type=str)
parser.add_argument('--output-train-dir', default='europarl/train_data.pkl', type=str)
parser.add_argument('--output-test-dir', default='europarl/test_data.pkl', type=str)
parser.add_argument('--output-vocab', default='europarl/vocab.json', type=str)

SPECIAL_TOKENS = {
  '<PAD>': 0,
  '<START>': 1,
  '<END>': 2,
  '<UNK>': 3,
}

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    # normalize unicode characters
    s = unicode_to_ascii(s)
    # remove the XML-tags
    s = remove_tags(s)
    # add white space before !.?
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    # change to lower letter
    s = s.lower()
    return s

def cutted_data(cleaned, MIN_LENGTH=4, MAX_LENGTH=30):
    cutted_lines = list()
    for line in cleaned:
        length = len(line.split())
        if length > MIN_LENGTH and length < MAX_LENGTH:
            line = [word for word in line.split()]
            cutted_lines.append(' '.join(line))
    return cutted_lines

def save_clean_sentences(sentence, save_path):
    pickle.dump(sentence, open(save_path, 'wb'))
    print('Saved: %s' % save_path)

def process(text_path):
    fop = open(text_path, 'r', encoding='utf8')
    raw_data = fop.read()
    sentences = raw_data.strip().split('\n')
    raw_data_input = [normalize_string(data) for data in sentences]
    raw_data_input = cutted_data(raw_data_input)
    fop.close()

    return raw_data_input


if __name__ == '__main__':
    
    args = parser.parse_args(args=[])
    data_dir = './data/'

    args.input_data_dir = data_dir + args.input_data_dir
    args.output_train_dir = data_dir + args.output_train_dir
    args.output_test_dir = data_dir + args.output_test_dir
    args.output_vocab = data_dir + args.output_vocab

    print(args.input_data_dir)
    sentences = []
    print('Preprocess Raw Text')
    for fn in tqdm(os.listdir(args.input_data_dir)):
        if not fn.endswith('.txt'): continue
        process_sentences = process(os.path.join(args.input_data_dir, fn))
        sentences += process_sentences

    # remove the same sentences
    a = {}
    for set in sentences:
        if set not in a:
            a[set] = 0
        a[set] += 1
    sentences = list(a.keys())
    print('Number of sentences: {}'.format(len(sentences)))
    
    # use BartTokenizer
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

    print('Start encoding txt')
    results = []
    for seq in tqdm(sentences):
        tokens = tokenizer(seq)
        results.append(tokens)

    print(results[1],results[2])

    print('Writing Data')
    train_data = results[: round(len(results) * 0.9)]
    test_data = results[round(len(results) * 0.9):]

    with open(args.output_train_dir, 'wb') as f:
        pickle.dump(train_data, f)
    with open(args.output_test_dir, 'wb') as f:
        pickle.dump(test_data, f)


In [None]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

print('Start encoding txt')
results = []
for seq in tqdm(sentences):
    tokens = tokenizer(seq)['input_ids']
    results.append(tokens)

print(results)

In [2]:
print(results[1],results[2])

{'input_ids': [0, 9226, 16, 70, 11, 10753, 19, 5, 7797, 14, 52, 33, 460, 14817, 479, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} {'input_ids': [0, 31653, 47, 475, 338, 842, 16993, 118, 939, 5658, 109, 98, 36811, 479, 5329, 24, 16, 1341, 11, 2396, 19, 5, 2452, 42, 790, 34, 460, 5091, 479, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [6]:
print('Writing Data')
train_data = results[: round(len(results) * 0.9)]
test_data = results[round(len(results) * 0.9):]

with open(args.output_train_dir, 'wb') as f:
    pickle.dump(train_data, f)
with open(args.output_test_dir, 'wb') as f:
    pickle.dump(test_data, f)

Writing Data


In [55]:
from transformers import BartTokenizer, BartModel
import torch

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartModel.from_pretrained("facebook/bart-base")


In [54]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")['input_ids']
outputs = model(inputs)
print(inputs.shape)
print(model)
last_hidden_states = outputs.last_hidden_state

torch.Size([1, 8])
BartModel(
  (shared): Embedding(50265, 768, padding_idx=1)
  (encoder): BartEncoder(
    (embed_tokens): Embedding(50265, 768, padding_idx=1)
    (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
    (layers): ModuleList(
      (0): BartEncoderLayer(
        (self_attn): BartAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (1): BartEncoder

In [48]:
print(inputs)
print(last_hidden_states.shape)
print(last_hidden_states)

{'input_ids': tensor([[    0, 31414,     6,   127,  2335,    16, 11962,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([1, 8, 768])
tensor([[[ 2.4118,  2.3732,  1.1981,  ...,  1.8372, -0.1712, -0.7264],
         [-1.4809, -0.5842, -3.3371,  ...,  1.1434, -1.9142,  1.5422],
         [ 0.8170,  1.5384, -1.3417,  ...,  0.5091, -0.9715,  1.4299],
         ...,
         [-1.5440,  0.2834, -1.0513,  ...,  0.7554, -0.3832, -0.0514],
         [ 1.0442, -0.1567,  2.8073,  ...,  1.2079, -1.3359,  0.0742],
         [-0.0903, -0.2080,  0.1134,  ...,  1.1163, -1.0827,  0.3815]]],
       grad_fn=<NativeLayerNormBackward0>)
