## Prepare data 

### Load dataset

In [1]:
from datasets import load_dataset

# Hugging Face Hub identifier of the Carolina corpus; later cells read the
# 'corpus' split from the object returned here.
CORPUS_ID = "carolina-c4ai/corpus-carolina"
dataset = load_dataset(CORPUS_ID)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset corpus-carolina (C:/Users/Matheus/.cache/huggingface/datasets/carolina-c4ai___corpus-carolina/carolina/1.2.0/60fe73ac1719891e34135322031692bf177e9323e830d620cf3304f535ee2693)
100%|██████████| 1/1 [00:00<00:00,  1.77it/s]


### Generate sample

In [2]:
# Draw a 100-document random sample from the corpus (the 'test' side of the
# split). The seed is fixed so that re-running the cell reproduces the same
# sample instead of silently replacing the saved one with a different draw.
sample = dataset['corpus'].train_test_split(test_size=100, seed=42)['test']
sample.save_to_disk('../data/carolina_sample_100')

                                                                                           

In [3]:
# Dump the sample as plain text, one document per line, for SentencePiece.
# The file is opened once in 'w' mode: the previous per-iteration append
# re-opened the file for every document and duplicated the content each time
# the cell was re-run.
with open('../data/carolina_sample_100.txt', 'w', encoding="utf-8") as f:
    for text in sample['text']:
        f.write(text + '\n')

### Training Tokenizer

In [5]:
import sentencepiece as spm
import os

tokenizer_path = '../models/tokenizer/deberta_pt_v2/'
tokenizer_name = 'm'

os.makedirs(tokenizer_path, exist_ok=True)

# Build the model prefix from tokenizer_path so the directory created above
# and the location the trainer writes to cannot drift apart.
spm.SentencePieceTrainer.train(
    input='../data/carolina_sample_100.txt',
    model_prefix=os.path.join(tokenizer_path, tokenizer_name),
    vocab_size=1000,
    # NOTE(review): 'foo'/'bar' look like placeholder symbols carried over
    # from an example; confirm whether they should stay in the vocabulary.
    user_defined_symbols=['foo', 'bar'],
)

### Test Tokenizer

In [6]:
import sys
# Add the sibling '../DeBERTa' directory to the module search path. This has
# to run before the import below.
# NOTE(review): presumably a local checkout of the DeBERTa repository that
# provides the `DeBERTa` package -- confirm.
sys.path.append('../DeBERTa')

from DeBERTa import deberta

In [11]:
# Resolve the vocabulary location and tokenizer type, then instantiate the
# tokenizer class registered under that type.
vocab_info = deberta.load_vocab(
    vocab_path=tokenizer_path,
    vocab_type='spm',
    pretrained_id='deberta-v3-base',
)
p, t = vocab_info
tokenizer_cls = deberta.tokenizers[t]
tokenizer = tokenizer_cls(p)

In [14]:
# Smoke test: show how a short Portuguese sentence is split into pieces.
example_sentence = 'Treinamento de um pequeno modelo de linguagem pt-br.'
tokenizer.tokenize(example_sentence)

['▁Tre',
 'in',
 'amento',
 '▁de',
 '▁um',
 '▁p',
 'equen',
 'o',
 '▁model',
 'o',
 '▁de',
 '▁lingua',
 'gem',
 '▁p',
 't',
 '-',
 'br',
 '.']

### prepare_data deberta

In [None]:
# coding: utf-8

import argparse
from tqdm import tqdm

def tokenize_data(input, output=None, max_seq_length=512):
  """Tokenize a text file and re-chunk the token stream into fixed-size lines.

  Each line of ``input`` is tokenized with the tokenizer that
  ``deberta.load_vocab`` resolves for ``pretrained_id='deberta-v3-base'``.
  The tokens of all lines are concatenated and written to ``output`` as
  space-separated lines holding at most ``max_seq_length - 2`` tokens each.

  Args:
    input: Path of the UTF-8 text file to tokenize. (The name shadows the
      builtin but is kept so existing keyword callers keep working.)
    output: Destination path; defaults to ``input + '.spm'``.
    max_seq_length: Sequence length budget. Two positions per line are held
      back (presumably for special tokens added later -- TODO confirm).

  Raises:
    ValueError: If ``max_seq_length`` is not greater than 2. The chunk size
      would then be zero or negative and the chunking could never finish.
  """
  chunk_size = max_seq_length - 2
  if chunk_size <= 0:
    raise ValueError(
      f'max_seq_length must be greater than 2, got {max_seq_length}')

  # NOTE(review): vocab_path=None means the vocabulary comes from the
  # pretrained id, not from a tokenizer trained elsewhere in this notebook --
  # confirm that this is intended.
  p,t=deberta.load_vocab(vocab_path=None, vocab_type='spm', pretrained_id='deberta-v3-base')
  tokenizer=deberta.tokenizers[t](p)
  if output is None:
    output=input + '.spm'

  all_tokens = []
  with open(input, encoding = 'utf-8') as fs:
    for l in tqdm(fs, ncols=80, desc='Loading'):
      # Lines read from a file always carry their newline, so a plain length
      # check never detects blank lines; test the stripped content instead.
      if l.strip():
        all_tokens.extend(tokenizer.tokenize(l))

  print(f'Loaded {len(all_tokens)} tokens from {input}')
  lines = 0
  with open(output, 'w', encoding = 'utf-8') as wfs:
    # Walk the token stream in steps of chunk_size; the last chunk may be short.
    for start in range(0, len(all_tokens), chunk_size):
      wfs.write(' '.join(all_tokens[start:start + chunk_size]) + '\n')
      lines += 1

  print(f'Saved {lines} lines to {output}')

# parser = argparse.ArgumentParser()
# parser.add_argument('-i', '--input', required=True, help='The input data path')
# parser.add_argument('-o', '--output', default=None, help='The output data path')
# parser.add_argument('--max_seq_length', type=int, default=512, help='Maximum sequence length of inputs')
# args = parser.parse_args()
# tokenize_data(args.input, args.output, args.max_seq_length)
