## Prepare data 

### Load dataset

In [1]:
from datasets import load_dataset, load_from_disk

# NOTE(review): `dataset_carolina` is referenced by later cells of this notebook,
# but its load is commented out here, so those cells fail on a fresh kernel
# unless the next line is re-enabled.
#dataset_carolina = load_dataset("carolina-c4ai/corpus-carolina")
# Load the BrWaC dataset from a copy previously saved on disk.
dataset_brwac = load_from_disk("../data/brwac_dataset")

  from .autonotebook import tqdm as notebook_tqdm


### Experiment: generating the tokenized file directly from the datasets

In [15]:
import sys
sys.path.append('../DeBERTa')

from DeBERTa.deberta.spm_tokenizer import SPMTokenizer

from tqdm import tqdm

tokenizer_path = '../models/tokenizer/deberta-pt-carolina/'
tokenizer = SPMTokenizer(f'{tokenizer_path}m.model')

In [4]:
# Tokenize every Carolina document and append it to the combined tokenized
# corpus, one document per line (tokens separated by single spaces).
# The file is opened once for the whole loop rather than once per document;
# append mode is kept so existing content of the file is preserved.
with open('../data/carolina_brwac/carolina_brwac_tokenized.txt', 'a', encoding="utf-8") as f:
    for text in tqdm(dataset_carolina['corpus']['text']):
        f.write(' '.join(tokenizer.tokenize(text)) + '\n')

100%|██████████| 2107045/2107045 [27:16<00:00, 1287.40it/s] 


In [18]:
from itertools import chain

# Tokenize every BrWaC document and append it to the combined tokenized corpus,
# one document per line. The file is opened once for the whole loop rather than
# once per document; append mode is kept so existing content is preserved.
with open('../data/carolina_brwac/carolina_brwac_tokenized.txt', 'a', encoding="utf-8") as f:
    for example in tqdm(dataset_brwac['train']):
        # `paragraphs` is a list of lists of strings; chain flattens it in
        # linear time (sum(..., []) copies the accumulated list at every step).
        sentences = chain.from_iterable(example['text']['paragraphs'])
        f.write(' '.join(tokenizer.tokenize(' '.join(sentences))) + '\n')

100%|██████████| 3530796/3530796 [1:29:53<00:00, 654.58it/s]  


# Generate Dataset

In [None]:
from tqdm.auto import tqdm

tqdm.pandas()

In [None]:
# Dump the raw Carolina texts to a plain-text file, appending a newline after
# each document. The file is opened once for the whole loop rather than once
# per document; append mode is kept so existing content is preserved.
with open('../data/corpus-carolina/carolina.txt', 'a', encoding="utf-8") as f:
    for text in tqdm(dataset_carolina['corpus']['text']):
        f.write(text + '\n')

In [None]:
from itertools import chain

# Dump BrWaC to a plain-text file, one sentence per line.
# Every sentence is terminated with '\n'. The previous "\n".join(...) wrote no
# newline after the last sentence of a document, which fused it with the first
# sentence of the next document on the same line.
with open('../data/brwac_dataset/brwac.txt', 'a', encoding="utf-8") as f:
    for example in tqdm(dataset_brwac['train']):
        # `paragraphs` is a list of lists of strings; flatten it lazily.
        for sentence in chain.from_iterable(example['text']['paragraphs']):
            f.write(sentence + '\n')

In [None]:
from itertools import chain

# Build the combined Carolina + BrWaC plain-text corpus.
# The output file is opened once (append mode, as before) instead of once per
# written item.
with open('../data/carolina_brwac/carolina_brwac.txt', 'a', encoding="utf-8") as f:
    # Carolina: each document followed by a newline.
    for text in tqdm(dataset_carolina['corpus']['text']):
        f.write(text + '\n')

    # BrWaC: one sentence per line. Every sentence is newline-terminated; the
    # previous "\n".join(...) left the last sentence of each document without a
    # newline, fusing it with the first sentence of the next document.
    for example in tqdm(dataset_brwac['train']):
        for sentence in chain.from_iterable(example['text']['paragraphs']):
            f.write(sentence + '\n')

### Training Tokenizer

In [None]:
import sentencepiece as spm
import os

tokenizer_path = '../models/tokenizer/deberta-pt-carolina/'
#os.makedirs(tokenizer_path, exist_ok=True)
tokenizer_name = 'm'

#os.makedirs(tokenizer_path, exist_ok=True)

#spm.SentencePieceTrainer.train(input='../data/corpus-carolina/carolina.txt', model_prefix=f'{tokenizer_path}/{tokenizer_name}', vocab_size=50265)

In [None]:
import sys
sys.path.append('../DeBERTa')

from DeBERTa import deberta
from DeBERTa.deberta.spm_tokenizer import SPMTokenizer

In [None]:
tokenizer_path = '../models/tokenizer/deberta-pt-carolina/'
# p,t=deberta.load_vocab(vocab_path=tokenizer_path, vocab_type='spm', pretrained_id='deberta-v3-base')
# tokenizer=deberta.tokenizers[t](p)
tokenizer = SPMTokenizer(f'{tokenizer_path}m.model')

### Split Train/Test

In [3]:
%%time
_input_file = '../data/carolina_brwac/carolina_brwac_tokenized.txt'
_output_path = '../data/carolina_brwac/training_data/'

def get_total_lines(input_file):
    """Return the number of lines in the UTF-8 text file at `input_file`.

    The file is streamed line by line, so it never has to fit in memory.
    A final line without a trailing newline still counts as a line.
    """
    line_count = 0
    with open(input_file, mode='r', encoding='utf-8') as reader:
        # enumerate leaves the 1-based index of the last line in line_count;
        # for an empty file the loop body never runs and the count stays 0.
        for line_count, _line in enumerate(reader, start=1):
            pass
    return line_count

total_lines = get_total_lines(_input_file)

CPU times: total: 2min 20s
Wall time: 2min 21s


In [4]:
import sys
sys.path.append('../DeBERTa')

from DeBERTa import deberta
from DeBERTa.deberta.spm_tokenizer import SPMTokenizer

import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

tqdm.pandas()

tokenizer_path = '../models/tokenizer/deberta-pt-carolina/'
tokenizer = SPMTokenizer(f'{tokenizer_path}m.model')

max_seq_length = 512

os.makedirs(_output_path, exist_ok=True)

def _write_full_sequences(buffer, out, seq_len):
    """Write leading `seq_len`-token chunks of `buffer` to `out`, keeping the tail.

    A chunk is written only while strictly more than `seq_len` tokens remain,
    so a non-empty buffer always keeps between 1 and `seq_len` tokens.
    The written tokens are removed from `buffer` in place with a single `del`.
    """
    start = 0
    while len(buffer) - start > seq_len:
        out.write(' '.join(buffer[start:start + seq_len]) + '\n')
        start += seq_len
    del buffer[:start]


def generate_train_test_files(lines_tokenized_path, output_path,  total_lines, max_seq_length=512):
    """Split a tokenized corpus into fixed-length train/test sequence files.

    Each input line (space-separated tokens) is assigned to the train or test
    split by its line index, using a 95%/5% `train_test_split` with
    `random_state=42`. The tokens of each split are concatenated across lines
    and written as lines of exactly `max_seq_length - 2` tokens to
    `output_path + 'train.txt'` and `output_path + 'test.txt'`.

    Args:
        lines_tokenized_path: Path of the UTF-8 input file, one tokenized
            document per line.
        output_path: Prefix (normally a directory ending in '/') to which
            'train.txt' and 'test.txt' are appended.
        total_lines: Number of lines in the input file; defines the index range
            that is split. Input lines beyond this count go to the test split.
        max_seq_length: Sequence length budget; each written line holds
            `max_seq_length - 2` tokens.

    Raises:
        ValueError: If `max_seq_length` is not greater than 2 (the chunking
            loop could never make progress).

    Notes:
        * Output files are opened once and truncated, so re-running the
          function no longer duplicates data in existing files.
        * NOTE(review): the tokens left in each split after the last full
          write (at most `max_seq_length - 2` per split) are not written.
          This matches the previous behaviour; confirm it is intended.
    """
    seq_len = max_seq_length - 2
    if seq_len <= 0:
        raise ValueError(f'max_seq_length must be greater than 2, got {max_seq_length}')

    # A buffer is flushed only once it holds more than ten sequences.
    chunk_size = 10 * seq_len

    train_rows, _ = train_test_split(np.arange(0, total_lines), test_size=0.05, random_state=42)
    # Boolean mask for O(1) membership tests; far smaller than a dict keyed by
    # every line index.
    is_train = np.zeros(total_lines, dtype=bool)
    is_train[train_rows] = True

    train_tokens, test_tokens = [], []

    with open(lines_tokenized_path, 'r', encoding='utf-8') as rfs, \
            open(output_path + 'train.txt', 'w', encoding='utf-8') as train_out, \
            open(output_path + 'test.txt', 'w', encoding='utf-8') as test_out:

        progress_bar = tqdm(rfs, total=total_lines, desc='Generating train and test files')

        for cnt, line in enumerate(progress_bar):
            # Strip only the newline: slicing off the last character would eat
            # a real character when the final line has no trailing newline.
            line_tokens = line.rstrip('\n').split(' ')

            if cnt < total_lines and is_train[cnt]:
                tokens, out = train_tokens, train_out
            else:
                tokens, out = test_tokens, test_out

            tokens.extend(line_tokens)
            if len(tokens) > chunk_size:
                _write_full_sequences(tokens, out, seq_len)

        # Final flush of whatever full sequences are still buffered.
        _write_full_sequences(train_tokens, train_out, seq_len)
        _write_full_sequences(test_tokens, test_out, seq_len)
        
generate_train_test_files(_input_file, _output_path, total_lines, max_seq_length)

Generating train and test files:   3%|▎         | 170753/5637841 [02:17<1:23:17, 1094.06it/s]

In [None]:
# # count tokens by each line from _train_test_path
# with open(_train_test_path + 'train.txt', 'r', encoding='utf-8') as rfs:
#     for line in rfs:
#         print(f'Train tokens: {len(line.strip().split(" "))}')

### OLD OPTION

In [None]:
from tqdm import tqdm

_input = '../data/carolina_brwac/carolina_brwac.txt'
_output_train = '../data/carolina_brwac/carolina_brwac_TRAIN.txt'
_output_test = '../data/carolina_brwac/carolina_brwac_TEST.txt'
max_seq_length=512

from sklearn.model_selection import train_test_split

# Read the whole corpus into memory and split it 95%/5% by line.
# A fixed random_state makes the split reproducible across runs.
with open(_input, 'r', encoding='utf-8') as rfs:
    train, test = train_test_split(rfs.readlines(), test_size=0.05, random_state=42)

In [None]:
from sklearn.model_selection import train_test_split

# Read the whole corpus into memory and split it 95%/5% by line.
# A fixed random_state makes the split reproducible across runs.
with open(_input, 'r', encoding='utf-8') as rfs:
    train, test = train_test_split(rfs.readlines(), test_size=0.05, random_state=42)

In [None]:
# Write the train split.
# Lines returned by readlines() already end with '\n', so they are written
# as-is; appending another '\n' would insert a blank line after every record.
# Only a line lacking a trailing newline (the file's last line) gets one added.
with open(_output_train, 'w', encoding='utf-8') as wfs:
    for line in tqdm(train, desc='Processing'):
        wfs.write(line if line.endswith('\n') else line + '\n')


In [None]:
# Write the test split.
# Lines returned by readlines() already end with '\n', so they are written
# as-is; appending another '\n' would insert a blank line after every record.
# Only a line lacking a trailing newline (the file's last line) gets one added.
with open(_output_test, 'w', encoding='utf-8') as wfs:
    for line in tqdm(test, desc='Processing'):
        wfs.write(line if line.endswith('\n') else line + '\n')

In [None]:
# Write the train split.
# Lines returned by readlines() already end with '\n', so they are written
# as-is; appending another '\n' would insert a blank line after every record.
# Only a line lacking a trailing newline (the file's last line) gets one added.
with open(_output_train, 'w', encoding='utf-8') as wfs:
    for line in tqdm(train, desc='Processing'):
        wfs.write(line if line.endswith('\n') else line + '\n')

In [None]:
def tokenize_data(data, tokenizer):
    """Tokenize every text in `data` into one flat list of tokens.

    Args:
        data: Either an object indexable by the key ``'text'`` (yielding an
            iterable of strings), or a plain list/tuple of strings.
        tokenizer: Object exposing ``tokenize(text)`` that returns an iterable
            of tokens.

    Returns:
        list: The tokens of all texts, concatenated in input order. Document
        boundaries are not preserved.
    """
    # The split cells in this notebook produce plain lists of lines; indexing
    # a list with 'text' raises TypeError, so plain sequences are used directly.
    texts = data if isinstance(data, (list, tuple)) else data['text']

    tokenized_data = []
    for text in tqdm(texts, desc='Tokenizing'):
        tokenized_data.extend(tokenizer.tokenize(text))
    return tokenized_data


def write_tokenized_data(tokenized_data, output_file, max_seq_length=512):
    """Write a flat token list to `output_file` as fixed-length lines.

    Each line holds up to `max_seq_length - 2` space-separated tokens; the
    last line may be shorter. An empty token list produces an empty file.
    The number of lines written is printed when done.

    Args:
        tokenized_data: Sequence of token strings.
        output_file: Path of the UTF-8 file to create or overwrite.
        max_seq_length: Sequence length budget; must be greater than 2.

    Raises:
        ValueError: If `max_seq_length` is not greater than 2. The chunk size
            would be zero or negative and the write loop could never finish.
    """
    seq_len = max_seq_length - 2
    if seq_len <= 0:
        raise ValueError(f'max_seq_length must be greater than 2, got {max_seq_length}')

    lines = 0
    with open(output_file, 'w', encoding='utf-8') as wfs:
        for start in range(0, len(tokenized_data), seq_len):
            wfs.write(' '.join(tokenized_data[start:start + seq_len]) + '\n')
            lines += 1
    print(f'Wrote {lines} lines to {output_file}')

In [None]:
%%time

train_tokenized = tokenize_data(train, tokenizer)
test_tokenized = tokenize_data(test, tokenizer)
# valid_tokenized = tokenize_data(valid, tokenizer)

In [None]:
%%time

write_tokenized_data(train_tokenized, '../data/carolina_brwac/full_data/train.txt')
write_tokenized_data(test_tokenized, '../data/carolina_brwac/full_data/test.txt')
# write_tokenized_data(valid_tokenized, '../data/carolina_brwac/full_data/valid.txt')
# Write test as valid
write_tokenized_data(test_tokenized, '../data/carolina_brwac/full_data/valid.txt')

### Test Tokenizer

### DEBUG

In [1]:
import sys
sys.path.append('../DeBERTa')

from DeBERTa import deberta
from DeBERTa.deberta.spm_tokenizer import SPMTokenizer

tokenizer_path = '../models/tokenizer/deberta-pt-carolina/'
tokenizer = SPMTokenizer(f'{tokenizer_path}m.model')

In [4]:
from tqdm import tqdm

import os
from collections import deque
from itertools import islice

# Index of the first input line to process; earlier lines are skipped
# (presumably handled by a previous run — confirm before reusing).
checkpoint = 101539791
# Total number of lines in the input file; only used to size the progress bar.
total_line = 143946923

_output_path = '../data/carolina_brwac/full_data/lines_tokenized_p2/'
# Make sure the target directory exists before writing per-line files into it.
os.makedirs(_output_path, exist_ok=True)

with open('../data/carolina_brwac/carolina_brwac.txt', 'r', encoding='utf-8') as rfs:

    # Consume the first `checkpoint` lines without keeping them in memory.
    deque(islice(rfs, checkpoint), maxlen=0)

    progress_bar = tqdm(rfs, total=total_line-checkpoint, desc='Processing Data')

    # Each remaining line is tokenized and written to its own file, named after
    # the line's 0-based index in the input file.
    for line_count, line in enumerate(progress_bar, start=checkpoint):
        _output_file = _output_path + f'{line_count}.txt'
        # The context manager closes the file; no explicit close() is needed.
        with open(_output_file, 'w', encoding='utf-8') as wfs:
            tokens = tokenizer.tokenize(line.strip())
            wfs.write(' '.join(tokens) + '\n')

Processing Data:   3%|▎         | 1190600/42407132 [34:51<20:06:55, 569.17it/s] 


KeyboardInterrupt: 

### Sagemaker

In [None]:
import boto3

import sys
sys.path.append('../DeBERTa')

from DeBERTa.deberta.spm_tokenizer import SPMTokenizer

from tqdm import tqdm

tokenizer_path = '../models/tokenizer/deberta-pt-carolina/'
tokenizer = SPMTokenizer(f'{tokenizer_path}m.model')


bucket = 'letrusnlp'
prefix = 'letrus-lab/nlp_research/carolina_brwac/'
filename = 'carolina_brwac.txt'

s3 = boto3.client('s3')

# Stream the corpus from S3 rather than downloading it first.
response = s3.get_object(Bucket=bucket, Key=prefix+filename)

# Total number of lines in the corpus; only used to size the progress bar.
total_lines = 143946923

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(total=total_lines, desc='Processing Data') as progress_bar:
    for cnt, line in enumerate(response['Body'].iter_lines()):
        # `prefix` already ends with '/'. Adding another one would create keys
        # containing '//', which S3 treats as a distinct, empty-named level.
        output_path = f'{prefix}lines_tokenized/{cnt}.txt'
        tokens = tokenizer.tokenize(line.decode('utf-8').strip())
        # Encode explicitly so the uploaded bytes are UTF-8.
        s3.put_object(Bucket=bucket, Key=output_path, Body=' '.join(tokens).encode('utf-8'))
        progress_bar.update(1)