# Text datasets

In [None]:
#| default_exp text.datasets

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

In [None]:
#| export

# torch
import torch
from torch.optim import SGD
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as torchtext_vocab
from torch.utils.data import DataLoader, dataset, Dataset, random_split

# L
from lightning import LightningDataModule

# hf
import datasets
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# data 
import pandas as pd
import numpy as np

# ui
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

# param
from omegaconf import DictConfig, OmegaConf
from hydra.utils import instantiate

# python
from typing import Dict, List, Tuple, Optional, Set, Any
from collections import Counter, OrderedDict
from plum import dispatch
import random
import os
from pathlib import Path
import requests
import re

# nimrod
# from nimrod.models.lm import Vocab
from nimrod.utils import set_seed
from nimrod.data.core import DataModule

import logging
logger = logging.getLogger(__name__)


In [None]:
SEED = 42
set_seed(SEED)

## Vocab
The whole text file is read, its unique characters are extracted and added to the vocabulary. Special tokens (e.g. `<pad>`, `<unk>`, `<bos>`, `<eos>`) are added to the vocabulary too.

In [None]:
#| export
class Vocab:
    """Character-level vocabulary built from a text file.

    The file is read once, its unique characters are collected and every
    token (special tokens first, then characters in sorted order) is given
    an integer id. Ids are therefore reproducible from one interpreter run
    to the next, which is required to reload checkpoints trained earlier.
    """
    def __init__(self,
                data_path: str | os.PathLike='../data/text/tiny_shakespeare.txt', # path to text data file
                specials=['<pad>', '<unk>', '<bos>', '<eos>'], # encode special characters (never mutated here)
                ):

        self.data_path = Path(data_path)
        if not self.data_path.exists():
            self._download_data()

        logger.info(f"Vocab: read text file")
        with open(self.data_path, 'r') as f:
            text = f.read()

        # Sort the characters before numbering them: iterating a set of str
        # yields a different order in every process (hash randomization),
        # which would silently change the token ids between runs.
        tokens = sorted(set(text))
        if specials is not None:
            tokens = list(specials) + tokens
        # dict.fromkeys removes duplicates (e.g. a one-character special that
        # also occurs in the text) while keeping the first position.
        ordered = list(dict.fromkeys(tokens))

        self._stoi = {tok: i for i, tok in enumerate(ordered)}
        self._itos = dict(enumerate(ordered))
        self.voc = set(ordered)

    def _download_data(self):
        """Download the tiny-shakespeare corpus and write it to ``self.data_path``.

        The URL is fixed, so whatever ``data_path`` is, the file created is
        tiny shakespeare. Raises ``requests.HTTPError`` on a bad HTTP status.
        """
        logger.info(f"Vocab: download data from url")
        url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
        # a timeout prevents hanging forever on a stalled connection
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # the target directory may not exist yet on a fresh checkout
        self.data_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.data_path, 'w') as f:
            f.write(response.text)

    def _lookup(self, token: str) -> int:
        """Return the id of ``token``, falling back to ``'<unk>'`` when defined.

        Raises ``KeyError`` naming the offending token when it is unknown and
        the vocabulary has no ``'<unk>'`` entry.
        """
        idx = self._stoi.get(token)
        if idx is None:
            idx = self._stoi.get('<unk>')
        if idx is None:
            raise KeyError(f"token {token!r} is not in the vocabulary and no '<unk>' special token is defined")
        return idx

    @dispatch
    def stoi(self, token:str)->int:
        """Convert a single token to its id (unknown tokens map to ``'<unk>'``)."""
        return self._lookup(token)

    @dispatch
    # for list of characters
    def stoi(self, tokens:List[str])->List[int]:
        """Convert a list of tokens to ids (unknown tokens map to ``'<unk>'``)."""
        unk = self._stoi.get('<unk>')
        if unk is None:
            # no fallback available: let _lookup raise a descriptive KeyError
            return [self._lookup(tok) for tok in tokens]
        # single dict lookup per token; this path encodes the whole corpus
        return [self._stoi.get(tok, unk) for tok in tokens]

    # TODO: support nested lists (List[List[str]]) and torch tensors

    @dispatch
    def itos(self, index:int)->str:
        """Convert a single id back to its token. Raises ``KeyError`` for unknown ids."""
        return self._itos[index]

    @dispatch
    def itos(self, indices:List[int])->List[str]:
        """Convert a list of ids back to tokens. Raises ``KeyError`` for unknown ids."""
        return [self._itos[index] for index in indices]

    def __len__(self):
        """Number of distinct tokens (characters plus special tokens)."""
        return len(self.voc)

    @property
    def vocabulary(self)->List[str]:
        """All tokens of the vocabulary as a sorted list."""
        return sorted(self._stoi)


### Usage
Read text file into a pandas data frame with each row as a new line

In [None]:
v = Vocab('../data/text/tiny_shakespeare.txt', specials=['<pad>', '<unk>', '<bos>', '<eos>'])
print(v.vocabulary)

In [None]:
# egs where token * is not in vocab
print(v.stoi('*'))
print(v.itos(61))

In [None]:
print(v.vocabulary)
s = v.stoi(["<bos>","h", "e", "l", "l", "o", "*", "<eos>"])
print(s)
print(v.itos(s))

## Tiny shakespeare

### Char Dataset
Cf. https://karpathy.github.io/char-rnn/ — the text is treated as one long continuous string.

In [None]:
#| export 

class CharDataset(Dataset):
    """Character-level language-modelling dataset over one continuous text.

    The text file is encoded into a single 1-D tensor of token ids
    (``self.data``). Item ``i`` is the pair ``(x, y)`` where ``x`` holds
    ``context_length`` consecutive ids starting at ``i`` and ``y`` is the
    same window shifted by one position (next-token targets).
    """
    def __init__(self,
                data_path: str | os.PathLike='../data/text/tiny_shakespeare.txt', # path to the data file
                context_length: int=3, # context length
                specials=['<pad>', '<unk>', '<bos>', '<eos>'], # encode special characters
                add_sentence_tokens: bool = True, # add special tokens to the data
                ):
        logger.info(f"CharDataset: init")

        # vocab will download data if not found
        self.v = Vocab(data_path=data_path, specials=specials)
        self._vocab_size = len(self.v)
        self.context_length = context_length
        # capturing group so that re.split keeps <bos>/<eos> as their own parts
        self.special_token_pattern = re.compile(f'({re.escape("<bos>")}|{re.escape("<eos>")})')

        with open(data_path, 'r') as f:
            text = f.read()

        if add_sentence_tokens:
            # Split into sentences (roughly, using periods) and add special tokens.
            # Note that the periods themselves are dropped by the split.
            sentences = [s.strip() for s in text.split('.') if s.strip()]
            sentences = ['<bos>' + s + '<eos>' for s in sentences]
            # Join sentences into a single continuous text
            text = " ".join(sentences)

        tokens = self._tokenizer(text)
        self.data = torch.tensor(self.v.stoi(tokens), dtype=torch.long)

    def __len__(self) -> int:
        """Number of encoded tokens.

        This is larger than the number of distinct windows by
        ``context_length``; ``__getitem__`` wraps the trailing indices around.
        """
        return len(self.data)

    def _tokenizer(self, text: str) -> List[str]:
        """Split ``text`` into characters, keeping ``<bos>``/``<eos>`` as single tokens."""
        parts = self.special_token_pattern.split(text)
        tokens = []
        for part in parts:
            if part in ["<bos>", "<eos>"]:
                tokens.append(part)
            else:
                # extending with a str appends its characters one by one
                tokens.extend(part)
        return tokens

    @property
    def vocabulary(self)->List[str]:
        """All tokens known to the underlying vocabulary, as a sorted list."""
        return self.v.vocabulary

    @property
    def vocab_size(self)->int:
        """Number of distinct tokens in the vocabulary."""
        return self._vocab_size

    @property
    def vocab_class(self)->Vocab:
        """The underlying ``Vocab`` instance."""
        return self.v

    def __getitem__(self, i: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(input, target)`` windows starting at position ``i``.

        Indices past the last full window wrap around to the beginning, and
        negative indices count from the last full window, so every index in
        ``range(-len(self), len(self))`` yields tensors of ``context_length``
        elements.

        Raises ``IndexError`` when the data is too short to hold even one
        window of ``context_length + 1`` tokens.
        """
        n_windows = len(self.data) - self.context_length
        if n_windows <= 0:
            raise IndexError(
                f"dataset holds {len(self.data)} tokens, at least "
                f"{self.context_length + 1} are needed for context_length={self.context_length}"
            )
        # Python's modulo maps both overflowing and negative indices into [0, n_windows)
        i = i % n_windows
        chunk = self.data[i : i + self.context_length + 1]
        x = chunk[:-1]
        y = chunk[1:]
        return x, y

    def to_tokens(self, message: str) -> torch.Tensor:
        """Encode ``message`` character by character into a tensor of ids."""
        return torch.tensor([self.v.stoi(s) for s in message], dtype=torch.long)

    def from_tokens(self, tokens: torch.Tensor) -> str:
        """Decode a sequence of ids back into a string."""
        return "".join([self.v.itos(int(i)) for i in tokens])

#### Usage

In [None]:
%%time 
block_size = 3 #context_length
ds = CharDataset(data_path='../data/text/tiny_shakespeare.txt', context_length=block_size, specials=['<pad>', '<unk>', '<bos>', '<eos>'], add_sentence_tokens=True)
# just encode <unk> in case unknown characters are encountered in test set
ds = CharDataset(data_path='../data/text/tiny_shakespeare.txt', context_length=block_size, specials=['<unk>', '<pad>'], add_sentence_tokens=False)
print("vocab size: ", ds.vocab_size)
print(len(ds))
for i in range(2):
    x, y = ds[i]
    print("x:", x,  "itos: ", ds.from_tokens(x), "\ny:", y, "itos: ", ds.from_tokens(y)[-1])


In [None]:
x,y = ds[0]
print("x:", x,  "itos: ", ds.from_tokens(x), "\ny:", y, "itos: ", ds.from_tokens(y))
print("vocab size: ", ds.vocab_size)
print("vocabulary: ", ds.vocabulary)

In [None]:
# Demo: turn fractional (train, val, test) ratios into integer lengths
# that can be handed to torch's random_split.
print(len(ds))
# NOTE(review): `t` is not used anywhere below in this cell.
t = len(ds)*torch.tensor((0.8, 0.1, 0.1))
lengths = [int(p * len(ds)) for p in (0.8, 0.1, 0.1)]
# int() truncates, so give the rounding remainder to the last split
# to make the lengths sum exactly to len(ds).
lengths[-1] = len(ds) - sum(lengths[:-1])
print(lengths)

random_split(ds, lengths)

### Char Data Module

In [None]:
#| export

class CharDataModule(DataModule, LightningDataModule):
    """Lightning data module wrapping `CharDataset` with train/val/test splits.

    `setup` builds the dataset from the saved hyper-parameters and fills
    `data_train`, `data_val` and `data_test`.
    """
    def __init__(self,
            # dataset
            data_path: str | os.PathLike = '../data/text/tiny_shakespeare.txt', # passed to CharDataset
            specials=['<pad>', '<unk>', '<bos>', '<eos>'], # passed to CharDataset
            add_sentence_tokens: bool = False, # passed to CharDataset
            # data module
            train_val_test_split: Tuple[float, float, float] = (0.8, 0.1, 0.1), # fractions of the dataset
            context_size: int = 3, # used as CharDataset's context_length
            batch_size: int = 32, # forwarded to DataModule.__init__
            num_workers: int = 1, # forwarded to DataModule.__init__
            pin_memory: bool = False, # forwarded to DataModule.__init__
            persistent_workers: bool = False, # forwarded to DataModule.__init__
            random_split: bool = True # True: torch random_split, False: self._sequential_split
            ):

        logger.info(f"CharDataModule: init")

        super().__init__(
            train_val_test_split=train_val_test_split,
            batch_size=batch_size,
            num_workers=num_workers,
            pin_memory=pin_memory,
            persistent_workers=persistent_workers,
            )
        # exposes the __init__ arguments as self.hparams.<name>, which setup() reads
        self.save_hyperparameters()
        # the dataset is only created in setup()
        self.ds: Optional[CharDataset] = None
    
    def prepare_data(self) -> None:
        """Nothing to prepare here."""
        pass

    
    def setup(self, stage: Optional[str] = None) -> None:
        """Create the `CharDataset` and split it into train/val/test subsets."""
        logger.info("CharDataModule: setup, split datasets")
        # run in each GPU process. define, split DS, etc.
        self.ds = CharDataset(
            self.hparams.data_path,
            self.hparams.context_size,
            self.hparams.specials,
            self.hparams.add_sentence_tokens,
            )
        # `random_split` below is the torch function imported at module level;
        # the boolean flag of the same name is only reachable through self.hparams.
        if self.hparams.random_split:
            lengths = [int(p * len(self.ds)) for p in self.hparams.train_val_test_split]
            # int() truncates: the last split absorbs the rounding remainder
            lengths[-1] = len(self.ds) - sum(lengths[:-1])
            self.data_train, self.data_val, self.data_test = random_split(self.ds, lengths)
        else:
            # NOTE(review): _sequential_split is not defined in this class; presumably
            # inherited from nimrod's DataModule — confirm.
            self.data_train, self.data_val, self.data_test = self._sequential_split(self.ds, self.hparams.train_val_test_split)
    
    @property
    def vocab_size(self)->int:
        """Vocabulary size of the dataset; raises ValueError before `setup` has run."""
        if self.ds is None:
            raise ValueError("Dataset not initialized")
        return self.ds.vocab_size



#### Usage

In [None]:
dm = CharDataModule(
    data_path="../data/text/tiny_shakespeare.txt",
    add_sentence_tokens=False,
    specials=['<unk>', '<pad>'],
    context_size=3,
    train_val_test_split = (0.8, 0.1, 0.1),
    random_split=False,
    batch_size=64,
    num_workers=0,
    pin_memory=False,
    persistent_workers=False,
    )
dm.setup()

In [None]:
X, Y = dm.data_train[0]
print(dm.ds.from_tokens(X), dm.ds.from_tokens(Y), dm.vocab_size)

In [None]:
len(dm.data_train), len(dm.data_val), len(dm.data_test)
print(dm.data_test[0])
print(len(dm.test_dataloader()))

In [None]:
test_dl = dm.test_dataloader()
X,Y = next(iter(test_dl))
print("X (B,T): ", X.shape, "X: ", X[0], "chars: ", dm.ds.from_tokens(X[0]))
print( "Y (B): ", Y.shape, "Y: ", Y[0], "chars: ", dm.ds.from_tokens(Y[0]))

#### Init from config file

In [None]:
cfg = OmegaConf.load('../config/text/data/tinyshakespeare.yaml')
print(cfg)
dm = instantiate(cfg)
dm.setup()

In [None]:
test_dl = dm.test_dataloader()
X,Y = next(iter(test_dl))
print("X (B,T): ", X.shape, "X: ", X[0], "chars: ", dm.ds.from_tokens(X[0]))
print( "Y (B): ", Y.shape, "Y: ", Y[0], "chars: ", dm.ds.from_tokens(Y[0]))

### Hugging Face
https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt

#### Load text file

In [None]:
dataset = load_dataset("text", data_files="../data/text/tiny_shakespeare.txt") #, split=['train','dev','test'])
print(dataset)
full = dataset['train']

In [None]:
# Two-stage split of the full dataset: 80% train first, then the remaining
# 20% is halved into validation and test (10% each).
# No `seed` is passed to train_test_split in this cell.
train_test = full.train_test_split(train_size=0.8)
test_valid = train_test['test'].train_test_split(train_size=0.5)
# regroup the three subsets under explicit split names
shake = DatasetDict({
    'train': train_test['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
print(shake)

In [None]:
shake['test'][0]

#### Tokenization / Numericalization

##### Tokenize single element

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

print("vocab size: ", len(tokenizer))
print("text row 0: ", shake['test'][0]['text'])
tokens = tokenizer.tokenize(shake['test'][0]['text'])
print("tokens of row 0: ", tokens)

context_length = 10
padded = tokenizer(shake['test'][0]['text'], max_length=context_length, truncation=True, return_length=True, return_overflowing_tokens=True)
print("context block & padding for lm: ", padded)
# print(padded.keys())
print('decode single input_id: ', tokenizer.decode(849))
print([tokenizer.decode(x) for x in padded['input_ids']])

##### Tokenize whole dataset using map

In [None]:
from omegaconf import OmegaConf

In [None]:
cfg = {
    "context_length": 10,
    "truncation": True,
    "return_length": True,
    "return_overflowing_tokens": True,
}

cfg = OmegaConf.create(cfg)

# tokenizer function called via dataset map
def tokenize_function(examples:List[dict[str,str]], cfg:OmegaConf=cfg) -> dict[str, List[List[int]]]:
    """Tokenize ``examples["text"]`` with the notebook-level ``tokenizer``.

    The truncation / overflow options are read from ``cfg``
    (``context_length`` is passed as ``max_length``). For fast tokenizers a
    ``word_ids`` entry is added with one list per produced ``input_ids`` row.
    Meant to be passed to ``Dataset.map``.
    """
    tokenizer_options = dict(
        max_length=cfg.context_length,
        truncation=cfg.truncation,
        return_length=cfg.return_length,
        return_overflowing_tokens=cfg.return_overflowing_tokens,
    )
    encoded = tokenizer(examples["text"], **tokenizer_options)
    # word_ids() is only queried when the tokenizer reports itself as fast
    if not tokenizer.is_fast:
        return encoded
    encoded["word_ids"] = [encoded.word_ids(row) for row, _ in enumerate(encoded["input_ids"])]
    return encoded

In [None]:
tokenize_function(shake['test'][0])

In [None]:
shake_toked = shake.map(
    tokenize_function, batched=True,
    remove_columns=["text"],
    num_proc = 1
)

In [None]:
print(shake_toked['test'][0])
print([tokenizer.decode(x) for x in shake_toked['test'][0]['input_ids']])
print(tokenizer.decode(shake_toked['test'][0]['input_ids']))

In [None]:
for split, dset in shake_toked.items():
    print(split, dset)
    arr_len = np.sum(dset['length'], dtype=np.uint64)


#### Data Collator

In [None]:
print(tokenizer.pad_token, tokenizer.eos_token)


In [None]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator([shake_toked["test"]['input_ids'][i] for i in range(5)])
# out = default_data_collator(shake_toked['test']['input_ids'][0])
print(out)
# for key in out:
#     print(f"{key} shape: {out[key].shape}")
# print('inputs: ', out['input_ids'])
# print('labels: ', out['labels'])

# data_collator = DefaultDataCollator(tokenizer)
# out = data_collator([shake_toked["test"][i] for i in range(5)])
# print(out)



In [None]:
def my_collate(examples, block_size: int, **kwargs):
    """Concatenate tokenized examples and re-split them into fixed-size blocks.

    Args:
        examples: mapping from column name (must include ``"input_ids"``) to a
            list of token lists, one inner list per example.
        block_size: length of every produced block.
        **kwargs: accepted for call-site compatibility and ignored.

    Returns:
        A dict with the same keys as ``examples`` where every column is a list
        of ``block_size``-long lists, plus a ``"labels"`` column that is an
        independent copy of ``"input_ids"``. The tail that does not fill a
        whole block is dropped.
    """
    from itertools import chain  # local import keeps this notebook cell self-contained

    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(list_of_lists, []) copies the accumulator at every step.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Copy each block so that editing the labels cannot alter the inputs
    # (a shallow .copy() would share the inner lists).
    result["labels"] = [list(block) for block in result["input_ids"]]
    return result

In [None]:
example = shake['test'][0]
# concatenated_examples = {k: sum(example[k], []) for k in example.keys()}
print([example[k] for k in example.keys()])

In [None]:
out = data_collator([shake_toked['test']['input_ids'][i] for i in range(4)])
print(out)

In [None]:
for i in range(4):
    print([tokenizer.decode(x) for x in out['input_ids'][i]])


#### Dataloader

In [None]:
test_dl = DataLoader(
    shake_toked['test']['input_ids'],
    batch_size=128,
    collate_fn=data_collator,
    num_workers=0,
)

In [None]:
#!head ../data/text/tiny_shakespeare.txt

In [None]:
b = next(iter(test_dl))
print(b['input_ids'].shape)
print(b['input_ids'][1])
print(b['labels'][1])
# for i in range(128):
#     print([tokenizer.decode(x) for x in b['input_ids'][i]])

## Wikitext-2

### Data source from Hugging Face

In [None]:
dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

In [None]:
print(len(dataset), type(dataset), dataset)
print(dataset[100])

In [None]:
dataset[100]


### Data source from torchtext
https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [None]:
train_iter = WikiText2(root='../data/text', split='test')
tokenizer = get_tokenizer('basic_english')


In [None]:
# vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# vocab.set_default_index(vocab['<unk>'])
# len(vocab)

In [None]:
# vocab['the']

In [None]:
# vocab(tokenizer('this is a test'))

In [None]:
# # concatenate all sentences together
def data_process(raw_text_iter) -> torch.Tensor:
    """Converts raw text into a flat Tensor.

    Every item is tokenized with the notebook-level ``tokenizer`` and mapped
    to ids with the notebook-level ``vocab``; items yielding no ids are
    skipped and the rest are concatenated into one 1-D ``long`` tensor.
    ``vocab`` must be defined before calling this (the cell that builds it is
    commented out in this notebook).
    """
    pieces = []
    for item in raw_text_iter:
        ids = torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
        # empty lines produce empty tensors, which are left out
        if ids.numel() > 0:
            pieces.append(ids)
    return torch.cat(tuple(pieces))

In [None]:
# for idx, i in enumerate(train_iter):
#     print(idx, i)
#     print(vocab(tokenizer(i)))

In [None]:
# train_iter, val_iter, test_iter = WikiText2()
# train_data = data_process(train_iter)
# val_data = data_process(val_iter)
# test_data = data_process(test_iter)

In [None]:
def batchify(data: torch.Tensor, bsz: int) -> torch.Tensor:
    """Arrange a flat token tensor into ``bsz`` parallel columns.

    Trailing elements that do not fill a complete column are discarded.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size (number of columns)

    Returns:
        Contiguous Tensor of shape ``[N // bsz, bsz]``; column ``j`` holds the
        ``j``-th consecutive slice of ``data``.
    """
    rows_per_column = data.size(0) // bsz
    usable = rows_per_column * bsz
    trimmed = data[:usable]
    # lay the columns out as rows first, then transpose
    as_rows = trimmed.view(bsz, rows_per_column)
    return as_rows.t().contiguous()

In [None]:
# test_data = batchify(test_data, 10)
# print(test_data)


In [None]:
bptt = 35
def get_batch(source: torch.Tensor, i: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Slice one (input, target) pair out of a batchified tensor.

    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int, index of the first row of the input window

    Returns:
        tuple (data, target), both slices of ``source`` along the first
        dimension with at most ``bptt`` rows; ``target`` is ``data`` shifted
        by one row. The target is returned as sliced (it is not flattened).
    """
    rows_left = len(source) - 1 - i
    # the window is shortened near the end so the target still fits
    seq_len = bptt if bptt < rows_left else rows_left
    stop = i + seq_len
    return source[i:stop], source[i + 1:stop + 1]

In [None]:
# x, y = get_batch(test_data, 0)
# print("x: ", x[:2])
# print("y: ", y[:2])

### Word-based tokenization 

#### torchtext tokenizer

In [None]:
# tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
# vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# # vocab.set_default_index(vocab['<unk>'])
# tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  
# tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], 
# fn_kwargs={'tokenizer': tokenizer})
# print(tokenized_dataset['train'][88]['tokens'])

#### hugging face tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokens = tokenizer.tokenize(dataset[100]['text'])
print(tokens)

### Numericalization

#### torchtext

In [None]:
# vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], 
# min_freq=3) 
# vocab.insert_token('<unk>', 0)           
# vocab.insert_token('<eos>', 1)            
# vocab.set_default_index(vocab['<unk>'])   
# print(len(vocab))                         
# print(vocab.get_itos()[:10])  

#### hugging face

In [None]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
print(tokenizer.decode(ids))


## Hugging Face for LM without intermediary steps
https://huggingface.co/course/chapter7/6?fw=pt

In [None]:
# directly without intermediary steps
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
text = ["this is a text.", "or so it seems"]
padded = tokenizer(text, max_length=4, truncation=True, return_length=True, return_overflowing_tokens=True)

print(padded)
print(padded.keys())
print([tokenizer.decode(x) for x in padded['input_ids']])

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator(padded['input_ids'])
for key in out:
    print(f"{key} shape: {out[key].shape}")

print(out['input_ids'], out['labels'])
# Shifting the inputs and labels to align them happens inside the model, so the data collator just copies the inputs to create the labels.

### Data loader
Concatenate all data into one large string of text and then chunk it into context length chunks
- https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf
- https://www.youtube.com/watch?v=ma1TrR7gE7I&t=273s

In [None]:
def get_data(dataset, vocab, batch_size):
    """Encode a tokenized dataset into a ``[batch_size, num_batches]`` id tensor.

    Args:
        dataset: iterable of examples, each with a ``'tokens'`` list of str.
        vocab: mapping from token to integer id; must contain ``'<eos>'``.
        batch_size: number of rows of the returned tensor.

    Returns:
        ``torch.LongTensor`` of shape ``[batch_size, num_batches]`` holding the
        concatenated ids of all non-empty examples, each followed by the
        ``'<eos>'`` id. Trailing ids that do not fill a whole column are dropped.

    The examples are not modified: ``'<eos>'`` is added to a copy of the
    token list rather than appended in place.
    """
    data = []
    for example in dataset:
        if example['tokens']:
            # build a new list instead of .append(): append returns None and
            # would grow the caller's token list on every call
            tokens = example['tokens'] + ['<eos>']
            data.extend(vocab[token] for token in tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches)
    return data

In [None]:
# batch_size = 1024
# train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
# valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
# test_data = get_data(tokenized_dataset['test'], vocab, batch_size)


In [None]:
# print(tokenized_dataset.shape)
# print(tokenized_dataset['train']['tokens'][88])

## Language modeling dataset

Concatenate all the data into one big array of token ids, then cut it into inputs of `block_size` tokens. The corresponding labels are the same windows shifted by one position.

In [None]:
print(shake)

### Tokenization

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
def tokenize_function(examples:List[dict[str,str]]) -> dict[str, List[List[int]]]:
    """Tokenize ``examples["text"]`` with the notebook-level ``tokenizer``.

    No truncation or length options are passed. For fast tokenizers a
    ``word_ids`` entry is added with one list per ``input_ids`` row.
    """
    encoded = tokenizer(examples["text"])
    # word_ids() is only queried when the tokenizer reports itself as fast
    if not tokenizer.is_fast:
        return encoded
    encoded["word_ids"] = [encoded.word_ids(row) for row, _ in enumerate(encoded["input_ids"])]
    return encoded

In [None]:

tokenized = shake.map(
    tokenize_function, batched=True,
    remove_columns=["text"],
    num_proc = 1
)

In [None]:
print(tokenized['train'], type(tokenized['train']))


### Sentences concatenation

In [None]:
# Flatten the input_ids of every split (in the DatasetDict's iteration order)
# into one long Python list of token ids.
# NOTE(review): `all` shadows the Python builtin of the same name, and the loop
# variable `v` rebinds the `v` Vocab instance created earlier in this notebook.
all = []
for k,v in tokenized.items():
    print(k, v)
    for x in v['input_ids']:
        # in-place list concatenation: appends the ids of one row
        all += x
    # running total after each split
    print(len(all))
print(all[:15])

### Batchify

In [None]:
def get_batch(data, batch_size, block_size):
    """Sample ``batch_size`` random windows of ``block_size`` ids from ``data``.

    ``data`` is a 1-D numpy array of token ids. Returns ``(x, y)``, two int64
    tensors of shape ``[batch_size, block_size]`` where ``y`` is ``x`` shifted
    one position to the right in ``data``.
    """
    # start offsets are drawn so that the shifted window still fits in data
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()

    def stack_windows(shift):
        rows = []
        for start in starts:
            begin = start + shift
            window = data[begin:begin + block_size]
            rows.append(torch.from_numpy(window.astype(np.int64)))
        return torch.stack(rows)

    x = stack_windows(0)
    y = stack_windows(1)
    return x, y

In [None]:
x, y = get_batch(np.array(all), 16, 10)
print(x.shape, y.shape)
print(x[0], y[0])
print(tokenizer.decode(x[0]), tokenizer.decode(y[0]))

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()