# Text datasets

In [None]:
#| default_exp text.datasets

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

In [None]:
#| export

# torch
import torch

from torch.optim import SGD
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from torch.utils.data import DataLoader, dataset, Dataset, random_split

# pl
from lightning import LightningDataModule, seed_everything


# hf
import datasets
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, DefaultDataCollator, default_data_collator

# data 
import pandas as pd
import numpy as np

# ui
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

# param
from omegaconf import DictConfig, OmegaConf
from hydra.utils import instantiate

# python
from typing import Dict, List, Tuple, Optional, Set, Union
from collections import Counter, OrderedDict

from plum import dispatch
import urllib
import random
import os
import math

# nimrod
from nimrod.data.utils import DataModule, split_train_valid_test

# conf
from hydra.utils import instantiate
from omegaconf import OmegaConf

from typing import Dict, List, Tuple, Optional, Set, Any

# nimrod
# from nimrod.models.lm import Vocab
SEED = 42
seed_everything(SEED)


Seed set to 42


42

## Vocab
Each row of the text file is one sentence. For every row, the individual characters are extracted and added to the vocabulary. Special tokens (`<pad>`, `<unk>`, `<bos>`, `<eos>`) are handled as well.

In [None]:
#| export
class Vocab:
    """Character-level vocabulary built from a line-oriented text file.

    The file is read with `pandas.read_fwf` (one row per line), every row is
    split into characters, and the tokens are handed to a torchtext vocabulary
    ordered by decreasing frequency. `specials` are registered as whole tokens.
    """
    def __init__(self,
                data_path: str | os.PathLike, # path to text data file
                specials=['<pad>', '<unk>', '<bos>', '<eos>'], # encode special characters
                add_sentence_tokens=True, # add <bos> and <eos> tokens to each sentence
                ):
        # private copy: the default list object is shared between calls
        specials = list(specials)
        # multi-character strings that `stoi` accepts as ONE token
        self._specials = frozenset(specials)
        # read data
        df = pd.read_fwf(data_path, header=None, names=['text'])
        if add_sentence_tokens:
            data = [['<bos>'] + list(x) + ['<eos>'] for x in df['text']]
        else:
            data = list(df['text'])
        # count individual tokens
        counts = Counter()
        for row in data:
            # Count each element of the row as ONE token. Feeding a single token
            # string to Counter.update would split '<bos>' / '<eos>' into their
            # characters and pollute the vocabulary with '<', '>' and inflated
            # counts for 'b', 'o', 's', 'e'.
            counts.update(row)
        by_frequency = OrderedDict(counts.most_common())
        # leverage torchtext vocab
        self.voc = vocab(by_frequency, specials=specials)
        if '<unk>' in specials:
            self.voc.set_default_index(self.voc['<unk>'])
        else:
            self.voc.set_default_index(-1)
        # NOTE(review): the lookups below go through these plain dict / list
        # snapshots, so an unknown token raises KeyError; the default index set
        # above only applies when `self.voc` is queried directly.
        self._stoi = self.voc.get_stoi()
        self._itos = self.voc.get_itos()

    @dispatch
    def stoi(self, token:str)->int:
        """Index of a single token (one character or one registered special)."""
        if len(token) > 1 and token not in self._specials:
            raise ValueError("input should be a token or list of tokens")
        return self._stoi[token]

    @dispatch
    # for list of characters
    def stoi(self, tokens:List[str])->List[int]:
        """Indices of a list of tokens, in order."""
        return [self._stoi[tok] for tok in tokens]

    # TODO: support nested lists (List[List[str]]) and torch tensors

    @dispatch
    def itos(self, index:int)->str:
        """Token stored at `index`."""
        return self._itos[index]

    @dispatch
    def itos(self, indices:List[int])->List[str]:
        """Tokens stored at each of `indices`, in order."""
        return [self._itos[index] for index in indices]

    def __len__(self):
        """Number of entries in the underlying torchtext vocabulary."""
        return len(self.voc)

    @property
    def vocabulary(self)->List[str]:
        """All known tokens as a sorted list."""
        return sorted(self._stoi)


### Usage
Read a text file into a pandas DataFrame, with each line of the file becoming one row.

In [None]:
v = Vocab('../data/text/tiny_shakespeare.txt')

In [None]:
print(v.vocabulary)
s = v.stoi(["h", "e", "l", "l", "o"])
print(s)
print(v.itos(s))
v.stoi(['T', 'h', 'e', ' '])

[' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '<', '<bos>', '<eos>', '<pad>', '<unk>', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[12, 6, 17, 17, 5]
['h', 'e', 'l', 'l', 'o']


[34, 12, 6, 4]

## Tiny shakespeare

### Char Dataset
After Karpathy chatGPT tutorial

In [None]:
# ["a", "b", "c"]
# a, b | c


def test(row:List[str], n_context:int):
    """Print each sliding window of `n_context` items of `row` with the item that follows it.

    The number of windows is printed first.
    """
    n_windows = len(row) - n_context
    print(n_windows)
    for start in range(n_windows):
        stop = start + n_context
        # context window, then the element to predict
        print(row[start:stop], row[stop])



test(["a", "b", "c", "d"], 2)



2
['a', 'b'] c
['b', 'c'] d


In [None]:
#| export 

class CharDataset(Dataset):
    """Map-style dataset of next-character prediction windows over one text file.

    The file is read line by line, optionally wrapped in `<bos>` / `<eos>`, and
    flattened into one long token list. Sample `idx` is the window of
    `context_length` tokens starting at position `idx`; its target is the same
    window shifted by one token.
    """
    def __init__(self,
                data_path: str | os.PathLike, # path to the data file
                context_length: int, # context length
                vocab: Vocab, # vocab object
                add_sentence_tokens: bool = True, # add special tokens to the data
                verbose: bool = False, # print info
                ):

        df = pd.read_fwf(data_path, header=None, names=['text'])

        # flatten the rows into one big list of chars, with <bos>/<eos> around each row if requested
        self.data = []
        for row in df['text']:
            if add_sentence_tokens:
                self.data.append('<bos>')
            self.data.extend(row)
            if add_sentence_tokens:
                self.data.append('<eos>')

        self.context_length = context_length
        self.v = vocab
        self.vocab_size = len(self.v)

        if verbose:
            print(f"CharDataset: {len(self.data)} tokens, vocab size {self.vocab_size}, "
                  f"{len(self)} windows of context length {self.context_length}")

    def __len__(self) -> int:
        # Only start positions that leave room for `context_length` inputs plus
        # one extra target token are valid samples.
        return max(0, len(self.data) - self.context_length)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return `(x, y)`: the window starting at `idx` and the same window shifted by one."""
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"index {idx} out of range for dataset of size {n_samples}")
        # Honour `idx` instead of drawing a random position: index subsets built
        # by `random_split` and `shuffle=False` loaders only behave as expected
        # when an index always maps to the same window.
        chunk = self.data[idx : idx + self.context_length + 1]
        dix = self.v.stoi(chunk)
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

    def to_tokens(self, message: str) -> torch.Tensor:
        """Encode `message` character by character into a tensor of token ids."""
        return torch.tensor([self.v.stoi(s) for s in message], dtype=torch.long)

    def from_tokens(self, tokens: torch.Tensor) -> str:
        """Decode a sequence of token ids back into one string."""
        return "".join([self.v.itos(int(i)) for i in tokens])

#### Usage

In [None]:
# with urllib.request.urlopen("https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt") as f:
#     text = f.read().decode("utf-8")

In [None]:
block_size = 40 #context_length
ds = CharDataset('../data/text/tiny_shakespeare.txt', block_size, v)
x,y = ds[2]
print("x:",  ds.from_tokens(x), "\ny:", ds.from_tokens(y))

x:  me in my course.<eos><bos>Why I descend into th 
y: me in my course.<eos><bos>Why I descend into thi


In [None]:
print(len(ds))
t = len(ds)*torch.tensor((0.8, 0.1, 0.1))
lengths = [int(p * len(ds)) for p in (0.8, 0.1, 0.1)]
lengths[-1] = len(ds) - sum(lengths[:-1])
print(lengths)

random_split(ds, lengths)

1140888
[912710, 114088, 114090]


[<torch.utils.data.dataset.Subset>,
 <torch.utils.data.dataset.Subset>,
 <torch.utils.data.dataset.Subset>]

### Char Data Module

In [None]:
#| export

class CharDataModule(LightningDataModule):
    """Lightning data module serving `CharDataset` windows from a single text file.

    The vocabulary is built from the whole file at construction time; the
    dataset itself is created and split into train / val / test in `setup`.
    """
    def __init__(
            self,
            data_path: str | os.PathLike, # path to the text file
            train_val_test_split: Tuple[float, float, float] = (0.8, 0.1, 0.1), # fractions, must sum to 1.0
            context_size: int = 3, # context length handed to CharDataset
            batch_size: int = 32,
            num_workers: int = 0,
            pin_memory: bool = False,
            persistent_workers: bool = False,
            ):

        super().__init__()
        self.save_hyperparameters()
        # Validate before the vocabulary scan so a bad config fails fast.
        # Tolerant comparison: e.g. 0.7 + 0.2 + 0.1 != 1.0 in floating point.
        if not math.isclose(sum(train_val_test_split), 1.0):
            raise ValueError("train_val_test_split must sum to 1.0")

        self.train_ds: Optional[Dataset] = None
        self.val_ds: Optional[Dataset] = None
        self.test_ds: Optional[Dataset] = None
        self.ds: Optional[Dataset] = None
        # we extract vocab from the full char dataset
        # TODO: add option to pass in vocab
        self.v = Vocab(data_path)

    def prepare_data(self) -> None:
        # run in main process. download, tokenize, etc.
        pass

    def setup(self, stage: Optional[str] = None) -> None:
        """Build the dataset and split it once; later calls keep the existing split."""
        # run in each GPU process. define, split DS, etc.
        if self.ds is not None:
            # Re-running random_split would draw a new permutation, so the
            # val / test subsets of a later stage would no longer match the
            # ones used while fitting.
            return
        self.ds = CharDataset(self.hparams.data_path, self.hparams.context_size, self.v)
        n_samples = len(self.ds)
        lengths = [int(p * n_samples) for p in self.hparams.train_val_test_split]
        # the last split absorbs the rounding remainder so the lengths sum to n_samples
        lengths[-1] = n_samples - sum(lengths[:-1])

        self.train_ds, self.val_ds, self.test_ds = random_split(self.ds, lengths)

    def _dataloader(self, ds: Optional[Dataset], shuffle: bool) -> DataLoader:
        # single place for the loader options shared by the three splits
        return DataLoader(
            ds,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            shuffle=shuffle,
            persistent_workers=self.hparams.persistent_workers
        )

    def train_dataloader(self) -> DataLoader:
        """Shuffled loader over the training split."""
        return self._dataloader(self.train_ds, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Sequential loader over the validation split."""
        return self._dataloader(self.val_ds, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Sequential loader over the test split."""
        return self._dataloader(self.test_ds, shuffle=False)

    def teardown(self, stage: Optional[str] = None) -> None:
        return super().teardown(stage)

    def state_dict(self) -> Dict[str, Any]:
        return super().state_dict()

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        return super().load_state_dict(state_dict)
    

#### Usage

In [None]:
dm = CharDataModule(
    "../data/text/tiny_shakespeare.txt",
    batch_size=64,
    context_size=20,
    num_workers=0,
    pin_memory=False,
    )
dm.setup()

In [None]:
test_dl = dm.test_dataloader()
X,Y = next(iter(test_dl))
print("X (B,T): ", X.shape, "X: ", X[0], "chars: ", dm.ds.from_tokens(X[0]))
print( "Y (B): ", Y.shape, "Y: ", Y[0], "chars: ", dm.ds.from_tokens(Y[0]))

X (B,T):  torch.Size([64, 20]) X:  tensor([11, 17, 13,  6, 11, 18, 21, 49,  4,  5, 13,  4, 15,  7,  4, 15,  8,  4,
        24,  6]) chars:  already? or is it fe
Y (B):  torch.Size([64, 20]) Y:  tensor([17, 13,  6, 11, 18, 21, 49,  4,  5, 13,  4, 15,  7,  4, 15,  8,  4, 24,
         6, 11]) chars:  lready? or is it fea


#### Init from config file

In [None]:
# cfg = OmegaConf.load('../config/text/data/tinyshakespeare.yaml')
# print(cfg)
# dm = instantiate(cfg)
# dm.setup()

In [None]:
test_dl = dm.test_dataloader()
X,Y = next(iter(test_dl))
print("X (B,T): ", X.shape, "X: ", X[0], "chars: ", dm.ds.from_tokens(X[0]))
print( "Y (B): ", Y.shape, "Y: ", Y[0], "chars: ", dm.ds.from_tokens(Y[0]))

X (B,T):  torch.Size([64, 20]) X:  tensor([12,  6,  4, 28,  5,  5, 13,  4, 16,  6,  6,  8, 17,  6, 22,  4,  8, 12,
        11,  8]) chars:  he poor beetle, that
Y (B):  torch.Size([64, 20]) Y:  tensor([ 6,  4, 28,  5,  5, 13,  4, 16,  6,  6,  8, 17,  6, 22,  4,  8, 12, 11,
         8,  4]) chars:  e poor beetle, that 


### Hugging Face
https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt

#### Load text file

In [None]:
dataset = load_dataset("text", data_files="../data/text/tiny_shakespeare.txt") #, split=['train','dev','test'])
print(dataset)
full = dataset['train']

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 40000
    })
})


In [None]:
# 80/10/10 split: hold out 20% of the rows, then cut that held-out part in half.
# No explicit seed is passed to either train_test_split call.
train_test = full.train_test_split(train_size=0.8)
test_valid = train_test['test'].train_test_split(train_size=0.5)
# regroup the three pieces under train / test / valid keys
shake = DatasetDict({
    'train': train_test['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
print(shake)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4000
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 4000
    })
})


In [None]:
shake['test'][0]

{'text': "There's no more to be said, but he is banish'd,"}

#### Tokenization / Numericalization

##### Tokenize single element

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

print("vocab size: ", len(tokenizer))
print("text row 0: ", shake['test'][0]['text'])
tokens = tokenizer.tokenize(shake['test'][0]['text'])
print("tokens of row 0: ", tokens)

context_length = 10
padded = tokenizer(shake['test'][0]['text'], max_length=context_length, truncation=True, return_length=True, return_overflowing_tokens=True)
print("context block & padding for lm: ", padded)
# print(padded.keys())
print('decode single input_id: ', tokenizer.decode(849))
print([tokenizer.decode(x) for x in padded['input_ids']])

vocab size:  50257
text row 0:  There's no more to be said, but he is banish'd,
tokens of row 0:  ['There', "'s", 'Ġno', 'Ġmore', 'Ġto', 'Ġbe', 'Ġsaid', ',', 'Ġbut', 'Ġhe', 'Ġis', 'Ġban', 'ish', "'d", ',']
context block & padding for lm:  {'input_ids': [[1858, 338, 645, 517, 284, 307, 531, 11, 475, 339], [318, 3958, 680, 1549, 11]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], 'length': [10, 5], 'overflow_to_sample_mapping': [0, 0]}
decode single input_id:  oth
["There's no more to be said, but he", " is banish'd,"]




##### Tokenize whole dataset using map

In [None]:
from omegaconf import OmegaConf

In [None]:
cfg = {
    "context_length": 10,
    "truncation": True,
    "return_length": True,
    "return_overflowing_tokens": True,
}

cfg = OmegaConf.create(cfg)

# tokenizer function called via dataset map
def tokenize_function(examples:List[dict[str,str]], cfg:OmegaConf=cfg) -> dict[str, List[List[int]]]:
    """Tokenize the "text" field of `examples` with the notebook-level `tokenizer`.

    Chunking options (max length, truncation, length / overflow reporting) are
    read from `cfg`. With a fast tokenizer, the word ids of every produced
    chunk are added under "word_ids".
    """
    texts = examples["text"]
    options = {
        "max_length": cfg.context_length,
        "truncation": cfg.truncation,
        "return_length": cfg.return_length,
        "return_overflowing_tokens": cfg.return_overflowing_tokens,
    }
    encoded = tokenizer(texts, **options)
    if not tokenizer.is_fast:
        return encoded
    # one word-id list per produced chunk
    n_chunks = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(chunk) for chunk in range(n_chunks)]
    return encoded

In [None]:
tokenize_function(shake['test'][0])

{'input_ids': [[1858, 338, 645, 517, 284, 307, 531, 11, 475, 339], [318, 3958, 680, 1549, 11]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], 'length': [10, 5], 'overflow_to_sample_mapping': [0, 0], 'word_ids': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 11, 12, 13]]}

In [None]:
shake_toked = shake.map(
    tokenize_function, batched=True,
    remove_columns=["text"],
    num_proc = 1
)

In [None]:
print(shake_toked['test'][0])
print([tokenizer.decode(x) for x in shake_toked['test'][0]['input_ids']])
print(tokenizer.decode(shake_toked['test'][0]['input_ids']))

{'input_ids': [1858, 338, 645, 517, 284, 307, 531, 11, 475, 339], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'length': 10, 'overflow_to_sample_mapping': 0, 'word_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
['There', "'s", ' no', ' more', ' to', ' be', ' said', ',', ' but', ' he']
There's no more to be said, but he


In [None]:
# Print every tokenized split and total the lengths of its chunks.
for split, dset in shake_toked.items():
    print(split, dset)
    # sum of the per-chunk 'length' column; NOTE(review): arr_len is computed but not used in this cell
    arr_len = np.sum(dset['length'], dtype=np.uint64)


train Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 42263
})
test Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 5276
})
valid Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 5295
})


#### Data Collator

In [None]:
print(tokenizer.pad_token, tokenizer.eos_token)


None <|endoftext|>


In [None]:
# The GPT-2 tokenizer has no pad token (printed as None above), so reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: labels are a copy of the inputs, with padded positions set to -100 (see output below)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# collate the first five tokenized chunks of the test split into one padded batch
out = data_collator([shake_toked["test"]['input_ids'][i] for i in range(5)])
# out = default_data_collator(shake_toked['test']['input_ids'][0])
print(out)
# for key in out:
#     print(f"{key} shape: {out[key].shape}")
# print('inputs: ', out['input_ids'])
# print('labels: ', out['labels'])

# data_collator = DefaultDataCollator(tokenizer)
# out = data_collator([shake_toked["test"][i] for i in range(5)])
# print(out)



{'input_ids': tensor([[ 1858,   338,   645,   517,   284,   307,   531,    11,   475,   339],
        [  318,  3958,   680,  1549,    11, 50256, 50256, 50256, 50256, 50256],
        [   40,  8406,   606,   326,   750,  1842,   511,  1499,   338,   922],
        [   45, 12321,    11, 35695,   502,   284, 11906, 10846,   290, 37769],
        [   13,   314, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]), 'labels': tensor([[ 1858,   338,   645,   517,   284,   307,   531,    11,   475,   339],
        [  318,  3958,   680,  1549,    11,  -100,  -100,  -100,  -100,  -100],
        [   40,  8406,   606,   326,   750,  1842,   511,  1499,   338,   922],
        [   45, 12321,    11, 35695,   502,   284, 11906, 10846,   290, 37769],
        [   13,   314,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100]])}


In [None]:
def my_collate(examples, block_size: int, **kwargs):
    """Concatenate every column of `examples` and re-cut it into fixed-size blocks.

    `examples` maps column names to lists of sequences (it must contain
    "input_ids"). Each column is flattened into one long list and split into
    consecutive chunks of `block_size` items; the trailing remainder that does
    not fill a whole block is dropped. A "labels" column is added as a copy of
    the "input_ids" chunk list. Extra keyword arguments are accepted and ignored.
    """
    # local import: itertools is not imported at file level
    from itertools import chain

    # Concatenate all texts. chain.from_iterable is linear in the total length,
    # whereas sum(list_of_lists, []) re-copies the accumulator at every step.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # shallow copy: a new outer list that shares the chunk lists with "input_ids"
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
example = shake['test'][0]
# concatenated_examples = {k: sum(example[k], []) for k in example.keys()}
print([example[k] for k in example.keys()])

["There's no more to be said, but he is banish'd,"]


In [None]:
out = data_collator([shake_toked['test']['input_ids'][i] for i in range(4)])
print(out)

{'input_ids': tensor([[ 1858,   338,   645,   517,   284,   307,   531,    11,   475,   339],
        [  318,  3958,   680,  1549,    11, 50256, 50256, 50256, 50256, 50256],
        [   40,  8406,   606,   326,   750,  1842,   511,  1499,   338,   922],
        [   45, 12321,    11, 35695,   502,   284, 11906, 10846,   290, 37769]]), 'labels': tensor([[ 1858,   338,   645,   517,   284,   307,   531,    11,   475,   339],
        [  318,  3958,   680,  1549,    11,  -100,  -100,  -100,  -100,  -100],
        [   40,  8406,   606,   326,   750,  1842,   511,  1499,   338,   922],
        [   45, 12321,    11, 35695,   502,   284, 11906, 10846,   290, 37769]])}


In [None]:
for i in range(4):
    print([tokenizer.decode(x) for x in out['input_ids'][i]])


['There', "'s", ' no', ' more', ' to', ' be', ' said', ',', ' but', ' he']
[' is', ' ban', 'ish', "'d", ',', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>']
['I', ' bid', ' them', ' that', ' did', ' love', ' their', ' country', "'s", ' good']
['N', 'urse', ',', ' commend', ' me', ' to', ' thy', ' lady', ' and', ' mistress']


#### Dataloader

In [None]:
test_dl = DataLoader(
    shake_toked['test']['input_ids'],
    batch_size=128,
    collate_fn=data_collator,
    num_workers=0,
)

In [None]:
#!head ../data/text/tiny_shakespeare.txt

In [None]:
b = next(iter(test_dl))
print(b['input_ids'].shape)
print(b['input_ids'][1])
print(b['labels'][1])
# for i in range(128):
#     print([tokenizer.decode(x) for x in b['input_ids'][i]])

torch.Size([128, 10])
tensor([  318,  3958,   680,  1549,    11, 50256, 50256, 50256, 50256, 50256])
tensor([ 318, 3958,  680, 1549,   11, -100, -100, -100, -100, -100])


In [None]:
# from pprint import pprint
# # from config
# conf = OmegaConf.load("../recipes/lm/config/train.yaml")
# pprint(conf.datamodule)
# dm = instantiate(conf.datamodule)
# dm.prepare_data()
# dm.setup()
# print('size of data:', len(dm.dataset), "size of splits: ", [len(x) for x in [dm.data_train, dm.data_val, dm.data_test]])

{'_target_': 'nimrod.text.datasets.CharDataModule', 'text_file': '../data/text/tiny_shakespeare.txt', 'train_val_test_split': [0.8, 0.1, 0.1], 'batch_size': 64, 'num_workers': 0, 'pin_memory': True, 'persistent_workers': False}
size of data: 123933 size of splits:  [99146, 12393, 12394]


## Wikitext-2

### Data source from Hugging Face

In [None]:
dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

In [None]:
print(len(dataset), type(dataset), dataset)
print(dataset[100])

4358 <class 'datasets.arrow_dataset.Dataset'> Dataset({
    features: ['text'],
    num_rows: 4358
})
{'text': ' Du Fu \'s popularity grew to such an extent that it is as hard to measure his influence as that of Shakespeare in England : it was hard for any Chinese poet not to be influenced by him . While there was never another Du Fu , individual poets followed in the traditions of specific aspects of his work : Bai Juyi \'s concern for the poor , Lu You \'s patriotism , and Mei Yaochen \'s reflections on the quotidian are a few examples . More broadly , Du Fu \'s work in transforming the lǜshi from mere word play into " a vehicle for serious poetic utterance " set the stage for every subsequent writer in the genre . \n'}


In [None]:
dataset[100]


{'text': ' Du Fu \'s popularity grew to such an extent that it is as hard to measure his influence as that of Shakespeare in England : it was hard for any Chinese poet not to be influenced by him . While there was never another Du Fu , individual poets followed in the traditions of specific aspects of his work : Bai Juyi \'s concern for the poor , Lu You \'s patriotism , and Mei Yaochen \'s reflections on the quotidian are a few examples . More broadly , Du Fu \'s work in transforming the lǜshi from mere word play into " a vehicle for serious poetic utterance " set the stage for every subsequent writer in the genre . \n'}

### Data source from torchtext
https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [None]:
train_iter = WikiText2(root='../data/text', split='test')
tokenizer = get_tokenizer('basic_english')


In [None]:
# vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# vocab.set_default_index(vocab['<unk>'])
# len(vocab)

In [None]:
# vocab['the']

In [None]:
# vocab(tokenizer('this is a test'))

In [None]:
# # concatenate all sentences together
def data_process(raw_text_iter) -> torch.Tensor:
    """Tokenize and numericalize every item, then join the results into one flat tensor.

    Items that produce no token ids are left out of the concatenation.
    NOTE(review): relies on the module-level names `tokenizer` and `vocab`; the
    cell that would bind `vocab` to a callable vocabulary is commented out in
    this notebook - confirm what `vocab` refers to before calling.
    """
    encoded = []
    for item in raw_text_iter:
        encoded.append(torch.tensor(vocab(tokenizer(item)), dtype=torch.long))
    non_empty = tuple(ids for ids in encoded if ids.numel() > 0)
    return torch.cat(non_empty)

In [None]:
# for idx, i in enumerate(train_iter):
#     print(idx, i)
#     print(vocab(tokenizer(i)))

In [None]:
# train_iter, val_iter, test_iter = WikiText2()
# train_data = data_process(train_iter)
# val_data = data_process(val_iter)
# test_data = data_process(test_iter)

In [None]:
def batchify(data: torch.Tensor, bsz: int) -> torch.Tensor:
    """Lay a flat tensor out as ``bsz`` parallel columns.

    Elements that do not fill a complete row are discarded from the end.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size (number of columns)

    Returns:
        Contiguous Tensor of shape ``[N // bsz, bsz]``; column ``j`` holds the
        ``j``-th consecutive run of ``N // bsz`` elements of ``data``.
    """
    rows = data.size(0) // bsz
    usable = data[:rows * bsz]
    # one row per sequence first, then transpose so that sequences run down the columns
    per_sequence = usable.view(bsz, rows)
    return per_sequence.t().contiguous()

In [None]:
# test_data = batchify(test_data, 10)
# print(test_data)


In [None]:
bptt = 35  # maximum number of time steps per batch
def get_batch(source: torch.Tensor, i: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Slice an input window and its one-step-shifted target out of ``source``.

    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int, index of the first row of the window

    Returns:
        tuple (data, target); both are slices of ``source`` along the first
        dimension with ``seq_len = min(bptt, len(source) - 1 - i)`` rows, and
        ``target`` starts one row after ``data``. The target is NOT flattened.
    """
    rows_left = len(source) - 1 - i  # keep one row in reserve for the shifted target
    seq_len = min(bptt, rows_left)
    stop = i + seq_len
    return source[i:stop], source[i + 1:stop + 1]

In [None]:
# x, y = get_batch(test_data, 0)
# print("x: ", x[:2])
# print("y: ", y[:2])

### Word-based tokenization 

#### torchtext tokenizer

In [None]:
# tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
# vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# # vocab.set_default_index(vocab['<unk>'])
# tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  
# tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], 
# fn_kwargs={'tokenizer': tokenizer})
# print(tokenized_dataset['train'][88]['tokens'])

#### hugging face tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokens = tokenizer.tokenize(dataset[100]['text'])
print(tokens)

['Du', 'Fu', "'", 's', 'popularity', 'grew', 'to', 'such', 'an', 'extent', 'that', 'it', 'is', 'as', 'hard', 'to', 'measure', 'his', 'influence', 'as', 'that', 'of', 'Shakespeare', 'in', 'England', ':', 'it', 'was', 'hard', 'for', 'any', 'Chinese', 'poet', 'not', 'to', 'be', 'influenced', 'by', 'him', '.', 'While', 'there', 'was', 'never', 'another', 'Du', 'Fu', ',', 'individual', 'poets', 'followed', 'in', 'the', 'traditions', 'of', 'specific', 'aspects', 'of', 'his', 'work', ':', 'Bai', 'Ju', '##yi', "'", 's', 'concern', 'for', 'the', 'poor', ',', 'Lu', 'You', "'", 's', 'pat', '##riot', '##ism', ',', 'and', 'Mei', 'Yao', '##chen', "'", 's', 'reflections', 'on', 'the', 'q', '##uo', '##ti', '##dian', 'are', 'a', 'few', 'examples', '.', 'More', 'broadly', ',', 'Du', 'Fu', "'", 's', 'work', 'in', 'transforming', 'the', '[UNK]', 'from', 'mere', 'word', 'play', 'into', '"', 'a', 'vehicle', 'for', 'serious', 'poetic', 'utter', '##ance', '"', 'set', 'the', 'stage', 'for', 'every', 'subsequen

### Numericalization

#### torchtext

In [None]:
# vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], 
# min_freq=3) 
# vocab.insert_token('<unk>', 0)           
# vocab.insert_token('<eos>', 1)            
# vocab.set_default_index(vocab['<unk>'])   
# print(len(vocab))                         
# print(vocab.get_itos()[:10])  

#### hugging face

In [None]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
print(tokenizer.decode(ids))


[12786, 14763, 112, 188, 5587, 2580, 1106, 1216, 1126, 6102, 1115, 1122, 1110, 1112, 1662, 1106, 4929, 1117, 2933, 1112, 1115, 1104, 7647, 1107, 1652, 131, 1122, 1108, 1662, 1111, 1251, 1922, 4225, 1136, 1106, 1129, 4401, 1118, 1140, 119, 1799, 1175, 1108, 1309, 1330, 12786, 14763, 117, 2510, 11587, 1723, 1107, 1103, 7181, 1104, 2747, 5402, 1104, 1117, 1250, 131, 27900, 23915, 10279, 112, 188, 4517, 1111, 1103, 2869, 117, 14557, 1192, 112, 188, 26227, 23326, 1863, 117, 1105, 24563, 27762, 10415, 112, 188, 26906, 1113, 1103, 186, 11848, 3121, 10359, 1132, 170, 1374, 5136, 119, 3046, 14548, 117, 12786, 14763, 112, 188, 1250, 1107, 20892, 1103, 100, 1121, 8574, 1937, 1505, 1154, 107, 170, 3686, 1111, 3021, 15751, 15462, 3923, 107, 1383, 1103, 2016, 1111, 1451, 4194, 2432, 1107, 1103, 6453, 119]
Du Fu's popularity grew to such an extent that it is as hard to measure his influence as that of Shakespeare in England : it was hard for any Chinese poet not to be influenced by him. While there w

## Hugging Face for LM without intermediary steps
https://huggingface.co/course/chapter7/6?fw=pt

In [None]:
# directly without intermediary steps
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
text = ["this is a text.", "or so it seems"]
padded = tokenizer(text, max_length=4, truncation=True, return_length=True, return_overflowing_tokens=True)

print(padded)
print(padded.keys())
print([tokenizer.decode(x) for x in padded['input_ids']])

{'input_ids': [[101, 1142, 1110, 102], [101, 170, 3087, 102], [101, 119, 102], [101, 1137, 1177, 102], [101, 1122, 3093, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], 'length': [4, 4, 3, 4, 4], 'overflow_to_sample_mapping': [0, 0, 0, 1, 1]}
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'])
['[CLS] this is [SEP]', '[CLS] a text [SEP]', '[CLS]. [SEP]', '[CLS] or so [SEP]', '[CLS] it seems [SEP]']


In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator(padded['input_ids'])
for key in out:
    print(f"{key} shape: {out[key].shape}")

print(out['input_ids'], out['labels'])
# Shifting the inputs and labels to align them happens inside the model, so the data collator just copies the inputs to create the labels.

input_ids shape: torch.Size([5, 4])
labels shape: torch.Size([5, 4])
tensor([[ 101, 1142, 1110,  102],
        [ 101,  170, 3087,  102],
        [ 101,  119,  102,    0],
        [ 101, 1137, 1177,  102],
        [ 101, 1122, 3093,  102]]) tensor([[ 101, 1142, 1110,  102],
        [ 101,  170, 3087,  102],
        [ 101,  119,  102, -100],
        [ 101, 1137, 1177,  102],
        [ 101, 1122, 3093,  102]])


### Data loader
Concatenate all data into one large string of text and then chunk it into context length chunks
- https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf
- https://www.youtube.com/watch?v=ma1TrR7gE7I&t=273s

In [None]:
def get_data(dataset, vocab, batch_size):
    """Numericalize a tokenized dataset into a ``[batch_size, num_batches]`` tensor.

    Every example with a non-empty ``'tokens'`` list contributes the ids of its
    tokens followed by the id of ``'<eos>'``. The flat id stream is cut to a
    multiple of ``batch_size`` (the tail is dropped) and viewed as
    ``batch_size`` rows. ``vocab`` is indexed with ``vocab[token]``.
    The examples themselves are left unmodified.
    """
    ids = []
    for example in dataset:
        tokens = example['tokens']
        if tokens:
            # Build a new list instead of calling .append on the example:
            # append returns None and would modify the caller's data in place.
            ids.extend(vocab[token] for token in [*tokens, '<eos>'])
    data = torch.tensor(ids, dtype=torch.long)
    num_batches = data.shape[0] // batch_size
    # drop the remainder that does not fill all rows
    data = data[:num_batches * batch_size]
    return data.view(batch_size, num_batches)

In [None]:
# batch_size = 1024
# train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
# valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
# test_data = get_data(tokenized_dataset['test'], vocab, batch_size)


In [None]:
# print(tokenized_dataset.shape)
# print(tokenized_dataset['train']['tokens'][88])

## Language modeling dataset

Concatenate all the data into one big array of token ids, then cut it into inputs of `block_size` tokens. The corresponding labels are the same windows shifted by one token.

In [None]:
print(shake)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4000
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 4000
    })
})


### Tokenization

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
def tokenize_function(examples:List[dict[str,str]]) -> dict[str, List[List[int]]]:
    """Tokenize the "text" field of `examples` with the notebook-level `tokenizer`.

    No truncation or chunking options are passed here. With a fast tokenizer,
    the word ids of every encoded entry are added under "word_ids".
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded
    n_entries = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(entry) for entry in range(n_entries)]
    return encoded

In [None]:

tokenized = shake.map(
    tokenize_function, batched=True,
    remove_columns=["text"],
    num_proc = 1
)

In [None]:
print(tokenized['train'], type(tokenized['train']))


Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 32000
}) <class 'datasets.arrow_dataset.Dataset'>


### Sentences concatenation

In [None]:
# Flatten the token ids of every split into one long list.
# NOTE: `all` shadows the builtin; the name is kept because a later cell reads it.
all = []
# Loop names chosen so the module-level Vocab instance `v` is not overwritten.
for split, dset in tokenized.items():
    print(split, dset)
    for ids in dset['input_ids']:
        all.extend(ids)
    # running total after each split
    print(len(all))
print(all[:15])

train Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 32000
})
238064
test Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 4000
})
267859
valid Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 4000
})
298027
[8496, 674, 826, 46258, 2988, 318, 1716, 13, 1870, 373, 379, 938, 503, 12, 24903]


### Batchify

In [None]:
def get_batch(data, batch_size, block_size):
    """Sample `batch_size` random windows of `block_size` ids from a 1-D numpy array.

    Returns `(x, y)`, two int64 tensors of shape `[batch_size, block_size]`;
    each row of `y` is the matching row of `x` shifted one position to the right
    in `data`.
    """
    # start positions, drawn so that the shifted window still fits inside `data`
    starts = torch.randint(len(data) - block_size, (batch_size,))

    def windows(shift):
        # one int64 tensor per start position, offset by `shift`
        return [
            torch.from_numpy(data[s + shift:s + shift + block_size].astype(np.int64))
            for s in starts
        ]

    x = torch.stack(windows(0))
    y = torch.stack(windows(1))
    return x, y

In [None]:
x, y = get_batch(np.array(all), 16, 10)
print(x.shape, y.shape)
print(x[0], y[0])
print(tokenizer.decode(x[0]), tokenizer.decode(y[0]))

torch.Size([16, 10]) torch.Size([16, 10])
tensor([  790, 32460, 33769,   314,   787,   817,   272,   428,  4939,  3211]) tensor([32460, 33769,   314,   787,   817,   272,   428,  4939,  3211,    25])
 every tedious stride I makeThan this weak arm  tedious stride I makeThan this weak arm:


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()