In [1]:
from huggingface_hub import hf_hub_download
from datasets import load_dataset, load_from_disk, Dataset
from pathlib import Path
import re
from sklearn.model_selection import train_test_split
import random
import string
import torch
from tokenizers import ByteLevelBPETokenizer

# Prefer the GPU when PyTorch reports one as available; otherwise use the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

# Root folder for the notebook's on-disk data; created here if it is missing.
BASE_DATA_DIR = Path("./data")
BASE_DATA_DIR.mkdir(parents=True, exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


## Download dataset
Download the BUT-LCC dataset from the Hugging Face Hub.

### Format
```json
{
  "id": unique identifier, 		
  "part": original source, 
  "title": source document title, 
  "text": the context,
  "ugly": bool,
  "ugly_score": float
}
```

In [None]:
# Hub coordinates of the corpus shard used in this notebook.
REPO_ID = "BUT-FIT/BUT-LCC"
FILE_NAME = "train_0.jsonl.gz"

# Fetch the shard from the Hub, then load it with the JSON loader as the 'train' split.
dataset_path = hf_hub_download(
    filename=FILE_NAME,
    repo_id=REPO_ID,
    repo_type="dataset",
)
dataset = load_dataset('json', split='train', data_files=dataset_path)

## Filter and save dataset 
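- Keep only the records that come from the Czech Wikipedia (`part == "cswiki-20230101"`).
- Save the filtered dataset to disk as a checkpoint so later runs can skip the download and filtering steps.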

In [None]:
def _is_czech_wikipedia(record):
    """Return True when the record's ``part`` field equals the Czech Wikipedia tag."""
    return record["part"] == "cswiki-20230101"


# Keep only the matching records and persist them as an on-disk checkpoint.
dataset = dataset.filter(_is_czech_wikipedia)
dataset.save_to_disk(BASE_DATA_DIR / "cs-wiki")

## Data preparation

In [4]:
# Reload the dataset saved under BASE_DATA_DIR and pull out its 'text' column.
cs_wiki_dir = BASE_DATA_DIR / "cs-wiki"
dataset = load_from_disk(cs_wiki_dir)
texts = dataset['text']

### Extract sentences
- Split the texts into sentences, strip every non-alphabetic character and lowercase the result.

In [5]:
# Split the leading documents into sentence candidates at every full stop.
# The loop stops after index 1000, i.e. it consumes 1001 documents in total.
sentences = []
for doc_index, document in enumerate(texts):
    sentences += document.split('.')
    if doc_index >= 1000:
        break

# Clean text
def clean_text(text):
    """Normalize a sentence to lowercase letters separated by single spaces.

    Args:
        text: Raw sentence fragment.

    Returns:
        The text with every character other than ASCII letters, accented
        Latin letters and whitespace removed, whitespace runs collapsed to a
        single space, leading/trailing whitespace stripped, and lowercased.
    """
    # Keep *all* whitespace in the first pass (not just the literal space):
    # deleting newlines/tabs here would glue the neighbouring words together.
    # The ranges á-ž / Á-Ž also span the non-letters '×' and '÷', so those are
    # removed explicitly.
    text = re.sub(r'[^a-zA-Zá-žÁ-Ž\s]|[×÷]', '', text)
    # Collapse every whitespace run (spaces, tabs, newlines) into one space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()  # Convert to lowercase

# Normalize every fragment and drop those that end up empty: splitting on '.'
# produces blank fragments that would otherwise become empty training pairs.
cleaned_sentences = [cleaned for cleaned in map(clean_text, sentences) if cleaned]

### Introduce errors
- Introduce random single-character substitution errors into the sentences.

In [6]:
# Function to introduce errors in a word
def introduce_errors(word):
    """Replace one randomly chosen character of ``word`` with a different letter.

    Args:
        word: The word to corrupt.

    Returns:
        ``word`` with exactly one character substituted by a lowercase ASCII
        letter that differs from the original character. Words of length 0 or
        1 are returned unchanged.
    """
    if len(word) <= 1:
        return word

    index = random.randrange(len(word))
    # Exclude the original character so the substitution always produces a
    # real error instead of occasionally returning the word unchanged.
    candidates = [letter for letter in string.ascii_lowercase if letter != word[index]]
    return word[:index] + random.choice(candidates) + word[index + 1:]

# Function to generate error-introduced sentences
def generate_error_sentences(sentences, error_rate=0.1):
    """Build a noisy copy of each sentence.

    Every sentence is split on whitespace; each word is passed through
    ``introduce_errors`` when a fresh ``random.random()`` draw is below
    ``error_rate`` and kept as-is otherwise. The words are then joined back
    with single spaces.

    Args:
        sentences: Iterable of sentence strings.
        error_rate: Threshold compared against each per-word random draw.

    Returns:
        A list of sentences in the same order as the input.
    """
    def corrupt(sentence):
        tokens = []
        for token in sentence.split():
            # One random draw per word decides whether it gets corrupted.
            if random.random() < error_rate:
                token = introduce_errors(token)
            tokens.append(token)
        return ' '.join(tokens)

    return [corrupt(sentence) for sentence in sentences]

# Seed Python's RNG so the injected errors are the same on every run; without
# this the noisy inputs change between executions.
random.seed(42)

# Threshold for the per-word random draw; adjust as needed.
error_rate = 0.1
error_introduced_sentences = generate_error_sentences(cleaned_sentences, error_rate)


# Hold out 20% of the (noisy, clean) sentence pairs for validation.
# The noisy sentences are the inputs, the clean ones the targets.
x_train, x_val, y_train, y_val = train_test_split(
    error_introduced_sentences,
    cleaned_sentences,
    test_size=0.2,
    random_state=42,
)

# Column layout shared by both splits: noisy input under "sentence", clean target under "labels".
train_data = dict(sentence=x_train, labels=y_train)
val_data = dict(sentence=x_val, labels=y_val)

# Wrap the column dictionaries as Dataset objects.
train_dataset, val_dataset = (Dataset.from_dict(columns) for columns in (train_data, val_data))

### Train tokenizer
The model consumes numbers, not text, so the sentences must first be converted into sequences of token ids. We train a byte-level BPE tokenizer on the corpus so that every character that occurs in the data has a corresponding numerical value.

In [None]:
# Special tokens passed to the tokenizer trainer.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Train a byte-level BPE tokenizer on the raw texts.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    texts,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Save the trained model into the current directory under the "example" prefix.
tokenizer.save_model(".", "example")

### Tokenize data
- Load the saved tokenizer and use it to encode the training sentences.

In [9]:
# Rebuild the tokenizer from the vocabulary and merges files in the working directory.
vocab_file = "example-vocab.json"
merges_file = "example-merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)


def tokenize(example: dict[str, list[str]]) -> dict[str, list[list[int]]]:
    """Encode a batch of sentences with the module-level ``tokenizer``.

    Intended for ``Dataset.map(..., batched=True)``, where ``example`` maps
    each column name to a list of values.

    Args:
        example: Batch whose ``"sentence"`` entry is a list of strings.

    Returns:
        A dict with one list entry per input sentence:
        ``"input_ids"`` (token ids) and ``"attention_mask"``.
    """
    encodings = tokenizer.encode_batch(example["sentence"])

    # Return the encoded columns instead of printing them: a function that
    # returns nothing gives ``map`` no columns to add, and printing every
    # encoding floods the notebook output.
    return {
        "input_ids": [encoding.ids for encoding in encodings],
        "attention_mask": [encoding.attention_mask for encoding in encodings],
    }

# Apply `tokenize` to the training split in batches.
tokenized_dataset = train_dataset.map(function=tokenize, batched=True)

KeyboardInterrupt: 