In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
#import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [19]:
model_type = 'SZTAKI-HLT/hubert-base-cc' # bert-base-multilingual-cased, SZTAKI-HLT/hubert-base-cc


data_path = f"C:/Users/pozmanreka/PycharmProjects/test/venv/src/neural_punctuator/data/"
os.makedirs(data_path + model_type, exist_ok=True)
file_path = data_path + "hu.txt"

with open(file_path, 'r', encoding='utf-8') as f:
    text = f.readlines()
    
len(text), text[1]

(7023366, 'Fordította:\n')

In [3]:
from collections import OrderedDict

text = list(OrderedDict.fromkeys(text))
len(text)

3288415

In [4]:
from sklearn.utils import shuffle

text = shuffle(text, random_state=0)

text[1]

'Eleget láttam!\n'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [6]:
def clean_text(text):
    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    text = text.replace('_', ' ')
    text = text.replace('\\', ' per ')

    
    reg = "(?<=[a-zA-Z])-(?=[a-zA-Z]{2,})"
    r = re.compile(reg, re.DOTALL)
    text = r.sub(' ', text)
    
    text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('(', ', ')
    text = text.replace(')', ', ')
    
    text = text.replace('...', '.')
    text = text.replace('.\"', ',')
    text = text.replace('"', ',')
    
    text = re.sub(r'\s-\s', ' , ', text)

    text = re.sub(r'--\s?--', '', text)   
    
    text = re.sub(r',\s?,', ', ', text)
    text = re.sub(r'\.[\s+.]+', '. ', text)
    
    text = re.sub(r',\s?,', ',', text)
    
    remove_space_before = [',', '?', '.', '!', '\n']
    for c in remove_space_before:
        text = text.replace(' ' + c, c)
        
    text = re.sub(r'(\w)(\n)', r'\1.\2', text)
    
    text = re.sub(r',\s+[\.,\s]+', ', ', text)
    text = re.sub(r'\.\s+[\.,\s]+', '. ', text)
    text = re.sub(r'\.[\.,]+', '.', text)
    text = re.sub(r'\,[\.,]+', ',', text)
    text = re.sub(r'\,[\.,]+', ',', text)
    text = text.lstrip('.,?')
    
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip().lower()

In [7]:
text = [clean_text(t) for t in text]
text = [t for t in text if len(t) > 0]
len(text), text[1]

(3288030, 'eleglátta')

In [8]:
# ' '.join(text[train_n:train_n+valid_n])

In [9]:
text[926886]

'gyűössz'

In [10]:
for i, t in enumerate(text):
    if 'brazil' in t:
        print(i, t)

94308 ebrazillse
582550 hotetsza 38,brazil?
599521 brazili
798321 a braziloknál ovzic
926804 brazili[a-za-z].
1047346 noéppbraziliábvoltués ő.
1881268 brazilveszteolszokkszembe
2466235 a brazilróapssebészntartmagát?
2632471 mindbizonnynéltúl átkelébraziliából
2746731 a hidfalklaáramlitalálkoza melbrazilla
2822191 venezuelábabraziliábabolíviábajamaicábaés a gaboköztársaságba
2986246 a brazilifrontellenállás vezetői hatalmkudarcszenvedtemivelfogtés kivégeztszámmagranvezetőt a magroszászlóaljbó
3031531 ezbraziliesőerdő,nesze
3205420 istenea brazila legszeembere
3210125 a brazilítapí


In [11]:
tokenizer.encode(".?,")

[2, 4575, 8308, 3576, 3]

In [12]:
target_token2id = {t: tokenizer.encode(t)[-2] for t in ".?,"}
target_token2id

{'.': 4575, '?': 8308, ',': 3576}

In [13]:
target_ids = list(target_token2id.values())
target_ids

[4575, 8308, 3576]

In [14]:
id2target = {
    0: 0,
    -1: -1,
}
for i, ti in enumerate(target_ids):
    id2target[ti] = i+1
target2id = {value: key for key, value in id2target.items()}

def create_target(text):
    encoded_words, targets = [], []
    
    words = text.split(' ')

    for word in tqdm(words):
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                word = word.rstrip(target_token)
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        
        for w in encoded_word:
            encoded_words.append(w)
        for _ in range(len(encoded_word)-1):
            targets.append(-1)
        targets.append(target)
#         print([tokenizer._convert_id_to_token(ew) for ew in encoded_word], target)
        assert(len(encoded_word)>0)

    return encoded_words, targets

In [15]:
train_n = 210_000
valid_n = 20_000
test_n = 8_000

train_text = ' '.join(text[:train_n])
valid_text = ' '.join(text[train_n:train_n+valid_n])
test_text = ' '.join(text[train_n+valid_n:test_n+valid_n+train_n])

len(train_text.split(' ')), len(valid_text.split(' ')), len(test_text.split(' '))

(397846, 37823, 15257)

In [16]:
len(text) - train_n - valid_n

3058030

In [17]:
train_text.split(' ')[154349-5:154349+5]

['hangoda',
 'lefordította',
 'vésete',
 'ritk',
 'ntudhasználerőime',
 'jobbmivelem?',
 'nöltmeszóvsegíthrajt',
 'kepusztítan',
 'és',
 'mcsinál']

In [18]:
train_tokens, train_targets = create_target(train_text)
valid_tokens, valid_targets = create_target(valid_text)
test_tokens, test_targets = create_target(test_text)

  0%|          | 0/397846 [00:00<?, ?it/s]

AssertionError: 

In [None]:
# For backward campatibility
train_tokens, train_targets = [train_tokens], [train_targets]
valid_tokens, valid_targets = [valid_tokens], [valid_targets]
test_tokens, test_targets = [test_tokens], [test_targets]

In [None]:
os.makedirs(data_path + model_type, exist_ok=True)
with open(data_path + f'{model_type}/train_data.pkl', 'wb') as f:
    pickle.dump((train_tokens, train_targets), f)
with open(data_path + f'{model_type}/valid_data.pkl', 'wb') as f:
    pickle.dump((valid_tokens, valid_targets), f)
with open(data_path + f'{model_type}/test_data.pkl', 'wb') as f:
    pickle.dump((test_tokens, test_targets), f)

In [None]:
from collections import Counter

for ds in (train_targets, valid_targets, test_targets):
    c = Counter((t for targets in ds for t in targets))
    print('\t'.join([str(c[i]) for i in (1,2,3,0,-1)]))

In [None]:
e = []
i = 0

raw_words = valid_text.split(' ')

for te, ta in zip(valid_tokens[0], valid_targets[0]):
    if ta == -1:
        e.append(te)
    else:
        e.append(te)
        print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t{raw_words[i]}")
        e = []
        i += 1

In [None]:
print(tokenizer.decode(valid_tokens[0]))

In [None]:
' '.join(text[train_n:train_n+valid_n])