In [1]:
import unicodedata
import re
from w3lib.html import remove_tags
import pickle
import argparse
import os
import json
import csv
from tqdm import tqdm
from transformers import AutoTokenizer

In [2]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    # normalize unicode characters
    s = unicode_to_ascii(s)
    # remove the XML-tags
    s = remove_tags(s)
    # add white space before !.?
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    # change to lower letter
    s = s.lower()
    return s

def cutted_data(cleaned, MIN_LENGTH=4, MAX_LENGTH=30):
    ss = []
    for x in cleaned:
        if 30>=len(x.split(' '))>=3 :
            ss.append(x)
    return ss

def save_clean_sentences(sentence, save_path):
    pickle.dump(sentence, open(save_path, 'wb'))
    print('Saved: %s' % save_path)

def process(text_path):
    fop = open(text_path, 'r', encoding='utf8')
    raw_data = fop.read()
    sentences = raw_data.strip().split('\n')
    raw_data_input = [normalize_string(data) for data in sentences]
    raw_data_input = cutted_data(raw_data_input)
    fop.close()

    return raw_data_input

In [3]:
parser = argparse.ArgumentParser()
parser.add_argument('--input-data-dir', default='europarl/en', type=str)
parser.add_argument('--output-train-dir', default='SIMCSE_BERT/train_data.pkl', type=str)
parser.add_argument('--output-test-dir', default='SIMCSE_BERT/test_data.pkl', type=str)

    
args = parser.parse_args(args=[])
data_dir = '../data/'

args.input_data_dir = data_dir + args.input_data_dir
args.output_train_dir = data_dir + args.output_train_dir
args.output_test_dir = data_dir + args.output_test_dir

sentences = []
print('Preprocess Raw Text')
for fn in tqdm(os.listdir(args.input_data_dir)):
    if not fn.endswith('.txt'): continue
    process_sentences = process(os.path.join(args.input_data_dir, fn))
    sentences += process_sentences

print('Number of sentences: {}'.format(len(sentences)))


# print('Writing Data')
# train_data = results[: round(len(results) * 0.9)]
# test_data = results[round(len(results) * 0.9):]

# with open(args.output_train_dir, 'wb') as f:
#     pickle.dump(train_data, f)
# with open(args.output_test_dir, 'wb') as f:
#     pickle.dump(test_data, f)

# # spot check
# for i in range(10):
#     print(sentences[i])
#     print(results[i])

Preprocess Raw Text


100%|██████████| 9672/9672 [01:42<00:00, 94.77it/s] 

Number of sentences: 119032





In [9]:
# use BartTokenizer
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")

print('Start encoding txt')

tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

print('Writing Data')
train_data = {k: v[:round(len(sentences) * 0.9)] for k, v in tokens.items()}
test_data = {k: v[round(len(sentences) * 0.9):] for k, v in tokens.items()}

with open(args.output_train_dir, 'wb') as f:
    pickle.dump(train_data, f)
with open(args.output_test_dir, 'wb') as f:
    pickle.dump(test_data, f)

Start encoding txt
Writing Data


In [14]:
for key, value in train_data.items():
    print(f"{key}'s first three items: {value[:]}")
print(train_data['input_ids'].shape)

input_ids's first three items: tensor([[  101, 24501, 24237,  ...,     0,     0,     0],
        [  101,  1996,  2160,  ...,     0,     0,     0],
        [  101,  2008,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  7714,  1045,  ...,     0,     0,     0],
        [  101,  2057,  2031,  ...,     0,     0,     0],
        [  101,  1996, 21371,  ...,     0,     0,     0]])
token_type_ids's first three items: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
attention_mask's first three items: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
torch.Size([107129, 65])
