# Text datasets

In [None]:
#| default_exp text.datasets

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export

# torch
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.optim import SGD
import torchtext
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, dataset, Dataset

# pl
from lightning import LightningDataModule

# hf
import datasets
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, DefaultDataCollator, default_data_collator

# data 
import pandas as pd
import numpy as np

# ui
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

# python
from typing import Dict, List, Tuple, Optional, Set, Union
from collections import Counter, OrderedDict
from dataclasses import dataclass, asdict
from plum import dispatch
import urllib
import math
import random

# nimrod
# from nimrod.models.lm import Vocab

## Tiny shakespeare

### Char Dataset
Adapted from Andrej Karpathy's ChatGPT tutorial.

In [None]:
#| export 

class CharDataset(Dataset):
    """Character-level dataset built from one continuous string.

    Each distinct character of `data` gets an integer id. An item is a pair
    `(x, y)` of `block_size` ids cut from a random position in the text, where
    `y` is `x` shifted by one character (next-character targets).
    """

    def __init__(self,
                data: str, # text as a long continuous string
                block_size: int # context length
                ):
        # Sort the alphabet: iterating a bare `set` of strings follows hash order,
        # which is not stable from one interpreter run to the next, so an unsorted
        # vocabulary would assign different ids to the same character across runs.
        chars = sorted(set(data))

        self.stoi = {ch: i for i, ch in enumerate(chars)} # char -> id
        self.itos = {i: ch for i, ch in enumerate(chars)} # id -> char
        self.block_size = block_size
        self.vocab_size = len(chars)
        self.data = data

    def __len__(self) -> int:
        # number of (block_size + 1)-character chunks needed to cover the text, rounded up
        return math.ceil(len(self.data) / (self.block_size + 1))

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # `idx` is not used: every access draws a window at a random offset.
        # random.randint raises ValueError when the text is shorter than block_size + 1.
        i = random.randint(0, len(self.data) - (self.block_size + 1))
        # take one extra character so inputs and shifted targets both have block_size ids
        chunk = self.data[i : i + self.block_size + 1]
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

    def to_tokens(self, message: str) -> torch.Tensor:
        # raises KeyError for a character that does not occur in the training text
        return torch.tensor([self.stoi[s] for s in message], dtype=torch.long)

    def from_tokens(self, tokens: torch.Tensor) -> str:
        return "".join([self.itos[int(i)] for i in tokens])

#### Usage

In [None]:
# with urllib.request.urlopen("https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt") as f:
#     text = f.read().decode("utf-8")

with open('../data/text/tiny_shakespeare.txt') as f:
    text = f.read()

print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [None]:
block_size = 10
ds = CharDataset(text, block_size)
x,y = ds[0]

print("x:",  ds.from_tokens(x), "\ny:", ds.from_tokens(y))

x:  speak. Yo 
y: speak. You


### Hugging Face
https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt

#### Load text file

In [None]:
# NOTE: this rebinds `dataset`, which the import cell bound to the `torch.utils.data.dataset` module.
# The printed DatasetDict below shows a single 'train' split with one 'text' feature.
dataset = load_dataset("text", data_files="../data/text/tiny_shakespeare.txt") #, split=['train','dev','test'])
print(dataset)
full = dataset['train']

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 40000
    })
})


In [None]:
# Keep 80% of the rows for training, then cut the remaining 20% in half for test / validation.
# NOTE(review): no `seed` is passed to `train_test_split`, so the rows landing in each split
# presumably differ between runs -- confirm, and pin a seed if the outputs must be reproducible.
train_test = full.train_test_split(train_size=0.8)
test_valid = train_test['test'].train_test_split(train_size=0.5)
# the second split's 'train' half is used as the validation set
shake = DatasetDict({
    'train': train_test['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
print(shake)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4000
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 4000
    })
})


In [None]:
shake['test'][0]

{'text': 'Though he divide the realm and give thee half,'}

#### Tokenization / Numericalization

##### Tokenize single element

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

print("vocab size: ", len(tokenizer))
print("text row 0: ", shake['test'][0]['text'])
tokens = tokenizer.tokenize(shake['test'][0]['text'])
print("tokens of row 0: ", tokens)

context_length = 10
padded = tokenizer(shake['test'][0]['text'], max_length=context_length, truncation=True, return_length=True, return_overflowing_tokens=True)
print("context block & padding for lm: ", padded)
# print(padded.keys())
print('decode single input_id: ', tokenizer.decode(849))
print([tokenizer.decode(x) for x in padded['input_ids']])

vocab size:  50257
text row 0:  Though he divide the realm and give thee half,
tokens of row 0:  ['Though', 'Ġhe', 'Ġdivide', 'Ġthe', 'Ġrealm', 'Ġand', 'Ġgive', 'Ġthee', 'Ġhalf', ',']
context block & padding for lm:  {'input_ids': [[10915, 339, 14083, 262, 13360, 290, 1577, 17903, 2063, 11]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'length': [10], 'overflow_to_sample_mapping': [0]}
decode single input_id:  oth
['Though he divide the realm and give thee half,']


##### Tokenize whole dataset using map

In [None]:
from omegaconf import OmegaConf

In [None]:
cfg = {
    "context_length": 10,
    "truncation": True,
    "return_length": True,
    "return_overflowing_tokens": True,
}

cfg = OmegaConf.create(cfg)

# tokenizer function called via dataset map
def tokenize_function(examples:Dict[str, Union[str, List[str]]], cfg:OmegaConf=cfg) -> dict[str, List[List[int]]]:
    """Tokenize the "text" entry of `examples` into blocks of at most `cfg.context_length` tokens.

    Uses the notebook-level `tokenizer`. `examples` is a mapping with a "text"
    key holding either one string (a single row) or a list of strings (what
    `Dataset.map(..., batched=True)` passes in) -- it is not a list of dicts.

    `cfg` supplies `context_length`, `truncation`, `return_length` and
    `return_overflowing_tokens`, which are forwarded to the tokenizer call.
    When the tokenizer is a fast one, a "word_ids" entry is added with one
    list per produced sequence.
    """
    result = tokenizer(
        examples["text"],
        max_length=cfg.context_length,
        truncation=cfg.truncation,
        return_length=cfg.return_length,
        return_overflowing_tokens=cfg.return_overflowing_tokens
        )
    if tokenizer.is_fast:
        # one word-id list per output sequence (there can be more sequences than input rows
        # when overflowing tokens are returned)
        n_sequences = len(result["input_ids"])
        result["word_ids"] = [result.word_ids(i) for i in range(n_sequences)]
    return result

In [None]:
tokenize_function(shake['test'][0])

{'input_ids': [[10915, 339, 14083, 262, 13360, 290, 1577, 17903, 2063, 11]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'length': [10], 'overflow_to_sample_mapping': [0], 'word_ids': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]}

In [None]:
shake_toked = shake.map(
    tokenize_function, batched=True,
    remove_columns=["text"],
    num_proc = 1
)

Map: 100%|██████████| 32000/32000 [00:01<00:00, 31834.27 examples/s]
Map: 100%|██████████| 4000/4000 [00:00<00:00, 58160.32 examples/s]
Map: 100%|██████████| 4000/4000 [00:00<00:00, 60424.69 examples/s]


In [None]:
print(shake_toked['test'][0])
print([tokenizer.decode(x) for x in shake_toked['test'][0]['input_ids']])
print(tokenizer.decode(shake_toked['test'][0]['input_ids']))

{'input_ids': [10915, 339, 14083, 262, 13360, 290, 1577, 17903, 2063, 11], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'length': 10, 'overflow_to_sample_mapping': 0, 'word_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
['Though', ' he', ' divide', ' the', ' realm', ' and', ' give', ' thee', ' half', ',']
Though he divide the realm and give thee half,


In [None]:
for split, dset in shake_toked.items():
    print(split, dset)
    arr_len = np.sum(dset['length'], dtype=np.uint64)


train Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 42224
})
test Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 5313
})
valid Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 5297
})


#### Data Collator

In [None]:
print(tokenizer.pad_token, tokenizer.eos_token)


None <|endoftext|>


In [None]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator([shake_toked["test"]['input_ids'][i] for i in range(5)])
# out = default_data_collator(shake_toked['test']['input_ids'][0])
print(out)
# for key in out:
#     print(f"{key} shape: {out[key].shape}")
# print('inputs: ', out['input_ids'])
# print('labels: ', out['labels'])

# data_collator = DefaultDataCollator(tokenizer)
# out = data_collator([shake_toked["test"][i] for i in range(5)])
# print(out)



{'input_ids': tensor([[10915,   339, 14083,   262, 13360,   290,  1577, 17903,  2063,    11],
        [ 3844, 22027,   616,  5848,  3181,  6071,   607,   386, 41923,    11],
        [ 1537,   910,    11,   318, 49398,  2460,   351, 19579,    30, 50256],
        [ 2601,   593,    25, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
        [11633,  3326,   813,  6842,  2910,    30, 50256, 50256, 50256, 50256]]), 'labels': tensor([[10915,   339, 14083,   262, 13360,   290,  1577, 17903,  2063,    11],
        [ 3844, 22027,   616,  5848,  3181,  6071,   607,   386, 41923,    11],
        [ 1537,   910,    11,   318, 49398,  2460,   351, 19579,    30,  -100],
        [ 2601,   593,    25,  -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [11633,  3326,   813,  6842,  2910,    30,  -100,  -100,  -100,  -100]])}


In [None]:
def my_collate(examples, block_size: int, **kwargs):
    """Concatenate tokenized rows and re-cut them into blocks of exactly `block_size` items.

    Args:
        examples: mapping from column name to a list of per-row sequences
            (e.g. ``{"input_ids": [[...], [...]], "attention_mask": [[...], [...]]}``).
            Must contain an "input_ids" column.
        block_size: length of every output block.
        **kwargs: accepted and ignored.

    Returns:
        dict with the same columns, each a list of `block_size`-long lists, plus a
        "labels" column that is a shallow copy of the "input_ids" blocks (the inner
        lists are shared). Items past the last full block are dropped.
    """
    # Concatenate all texts. A single-pass flatten is used instead of `sum(rows, [])`,
    # which re-copies the accumulated list on every addition (quadratic in the row count).
    concatenated_examples = {
        k: [item for row in rows for item in row] for k, rows in examples.items()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
example = shake['test'][0]
# concatenated_examples = {k: sum(example[k], []) for k in example.keys()}
print([example[k] for k in example.keys()])

['Though he divide the realm and give thee half,']


In [None]:
out = data_collator([shake_toked['test']['input_ids'][i] for i in range(4)])
print(out)

{'input_ids': tensor([[10915,   339, 14083,   262, 13360,   290,  1577, 17903,  2063,    11],
        [ 3844, 22027,   616,  5848,  3181,  6071,   607,   386, 41923,    11],
        [ 1537,   910,    11,   318, 49398,  2460,   351, 19579,    30, 50256],
        [ 2601,   593,    25, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]), 'labels': tensor([[10915,   339, 14083,   262, 13360,   290,  1577, 17903,  2063,    11],
        [ 3844, 22027,   616,  5848,  3181,  6071,   607,   386, 41923,    11],
        [ 1537,   910,    11,   318, 49398,  2460,   351, 19579,    30,  -100],
        [ 2601,   593,    25,  -100,  -100,  -100,  -100,  -100,  -100,  -100]])}


In [None]:
for i in range(4):
    print([tokenizer.decode(x) for x in out['input_ids'][i]])


['Though', ' he', ' divide', ' the', ' realm', ' and', ' give', ' thee', ' half', ',']
['Now', ' hath', ' my', ' soul', ' brought', ' forth', ' her', ' pro', 'digy', ',']
['But', ' say', ',', ' is', ' Warwick', ' friends', ' with', ' Margaret', '?', '<|endoftext|>']
['Cl', 'own', ':', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>']


#### Dataloader

In [None]:
test_dl = DataLoader(
    shake_toked['test']['input_ids'],
    batch_size=128,
    collate_fn=data_collator,
    num_workers=0,
)

In [None]:
#!head ../data/text/tiny_shakespeare.txt

In [None]:
b = next(iter(test_dl))
print(b['input_ids'].shape)
print(b['input_ids'][1])
print(b['labels'][1])
# for i in range(128):
#     print([tokenizer.decode(x) for x in b['input_ids'][i]])

torch.Size([128, 10])
tensor([ 3844, 22027,   616,  5848,  3181,  6071,   607,   386, 41923,    11])
tensor([ 3844, 22027,   616,  5848,  3181,  6071,   607,   386, 41923,    11])


## Wikitext-2

### Data source from Hugging Face

In [None]:
dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

In [None]:
print(len(dataset), type(dataset), dataset)
print(dataset[100])

4358 <class 'datasets.arrow_dataset.Dataset'> Dataset({
    features: ['text'],
    num_rows: 4358
})
{'text': ' Du Fu \'s popularity grew to such an extent that it is as hard to measure his influence as that of Shakespeare in England : it was hard for any Chinese poet not to be influenced by him . While there was never another Du Fu , individual poets followed in the traditions of specific aspects of his work : Bai Juyi \'s concern for the poor , Lu You \'s patriotism , and Mei Yaochen \'s reflections on the quotidian are a few examples . More broadly , Du Fu \'s work in transforming the lǜshi from mere word play into " a vehicle for serious poetic utterance " set the stage for every subsequent writer in the genre . \n'}


In [None]:
dataset[100]


{'text': ' Du Fu \'s popularity grew to such an extent that it is as hard to measure his influence as that of Shakespeare in England : it was hard for any Chinese poet not to be influenced by him . While there was never another Du Fu , individual poets followed in the traditions of specific aspects of his work : Bai Juyi \'s concern for the poor , Lu You \'s patriotism , and Mei Yaochen \'s reflections on the quotidian are a few examples . More broadly , Du Fu \'s work in transforming the lǜshi from mere word play into " a vehicle for serious poetic utterance " set the stage for every subsequent writer in the genre . \n'}

### Data source from torchtext
https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [None]:
# NOTE: despite its name, `train_iter` is created from the *test* split here.
train_iter = WikiText2(root='../data/text', split='test')
# NOTE: rebinds `tokenizer` (until now the Hugging Face GPT-2 tokenizer) to torchtext's 'basic_english' tokenizer.
tokenizer = get_tokenizer('basic_english')


In [None]:
# vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# vocab.set_default_index(vocab['<unk>'])
# len(vocab)

In [None]:
# vocab['the']

1

In [None]:
# vocab(tokenizer('this is a test'))

[35, 23, 8, 1660]

In [None]:
# # concatenate all sentences together
def data_process(raw_text_iter) -> torch.Tensor:
    """Numericalize every item of `raw_text_iter` and join the results into one flat Tensor.

    Relies on the notebook-level `tokenizer` and `vocab` callables. Items that
    produce no ids are left out of the concatenation.
    """
    encoded = []
    for item in raw_text_iter:
        encoded.append(torch.tensor(vocab(tokenizer(item)), dtype=torch.long))
    # only tensors holding at least one id are handed to torch.cat
    non_empty = tuple(ids for ids in encoded if ids.numel() > 0)
    return torch.cat(non_empty)

In [None]:
# for idx, i in enumerate(train_iter):
#     print(idx, i)
#     print(vocab(tokenizer(i)))

In [None]:
# train_iter, val_iter, test_iter = WikiText2()
# train_data = data_process(train_iter)
# val_data = data_process(val_iter)
# test_data = data_process(test_iter)

In [None]:
def batchify(data: torch.Tensor, bsz: int) -> torch.Tensor:
    """Lay a flat sequence out as ``bsz`` parallel columns.

    The trailing elements that do not fill a complete row are discarded.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size (number of columns in the result)

    Returns:
        Contiguous Tensor of shape ``[N // bsz, bsz]``; column ``j`` holds
        elements ``j * (N // bsz)`` to ``(j + 1) * (N // bsz) - 1`` of ``data``.
    """
    n_rows = data.size(0) // bsz
    trimmed = data[: n_rows * bsz]
    # view as bsz consecutive slices first, then transpose so each slice becomes a column
    as_columns = trimmed.view(bsz, n_rows).t()
    return as_columns.contiguous()

In [None]:
# test_data = batchify(test_data, 10)
# print(test_data)


tensor([[    9,   292,   591,  ...,     3,    10,  5060],
        [  632,   132, 16641,  ..., 27763, 27275,     1],
        [    0,    24,    21,  ...,  1128,     3,  2251],
        ...,
        [  746,  2291,  7225,  ...,    91,  2168,  3855],
        [    5,    16, 12691,  ...,     0,    31,     2],
        [    1,    34,     4,  ...,     0,  3416,  4415]])


In [None]:
bptt = 35
def get_batch(source: torch.Tensor, i: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Slice an input window and its one-row-shifted target out of ``source``.

    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int, index of the first row of the window

    Returns:
        tuple (data, target), both of shape ``[seq_len, batch_size]`` where
        ``seq_len = min(bptt, len(source) - 1 - i)``. ``target`` holds the rows
        of ``source`` one position after those in ``data``; it is NOT flattened.
    """
    # the window is capped at the module-level `bptt` and leaves one row for the shifted target
    seq_len = min(bptt, len(source) - 1 - i)
    stop = i + seq_len
    return source[i:stop], source[i + 1:stop + 1]

In [None]:
# x, y = get_batch(test_data, 0)
# print("x: ", x[:2])
# print("y: ", y[:2])

x:  tensor([[    9,   292,   591,  5361,     5,    46,     9,     3,    10,  5060],
        [  632,   132, 16641,     5,  1586,   270,  2113, 27763, 27275,     1]])
y:  tensor([[  632,   132, 16641,     5,  1586,   270,  2113, 27763, 27275,     1],
        [    0,    24,    21,  1401,    12,    14, 18527,  1128,     3,  2251]])


### Word-based tokenization 

#### torchtext tokenizer

In [None]:
# tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
# vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# # vocab.set_default_index(vocab['<unk>'])
# tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  
# tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], 
# fn_kwargs={'tokenizer': tokenizer})
# print(tokenized_dataset['train'][88]['tokens'])



KeyboardInterrupt: 

#### hugging face tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokens = tokenizer.tokenize(dataset[100]['text'])
print(tokens)

['Du', 'Fu', "'", 's', 'popularity', 'grew', 'to', 'such', 'an', 'extent', 'that', 'it', 'is', 'as', 'hard', 'to', 'measure', 'his', 'influence', 'as', 'that', 'of', 'Shakespeare', 'in', 'England', ':', 'it', 'was', 'hard', 'for', 'any', 'Chinese', 'poet', 'not', 'to', 'be', 'influenced', 'by', 'him', '.', 'While', 'there', 'was', 'never', 'another', 'Du', 'Fu', ',', 'individual', 'poets', 'followed', 'in', 'the', 'traditions', 'of', 'specific', 'aspects', 'of', 'his', 'work', ':', 'Bai', 'Ju', '##yi', "'", 's', 'concern', 'for', 'the', 'poor', ',', 'Lu', 'You', "'", 's', 'pat', '##riot', '##ism', ',', 'and', 'Mei', 'Yao', '##chen', "'", 's', 'reflections', 'on', 'the', 'q', '##uo', '##ti', '##dian', 'are', 'a', 'few', 'examples', '.', 'More', 'broadly', ',', 'Du', 'Fu', "'", 's', 'work', 'in', 'transforming', 'the', '[UNK]', 'from', 'mere', 'word', 'play', 'into', '"', 'a', 'vehicle', 'for', 'serious', 'poetic', 'utter', '##ance', '"', 'set', 'the', 'stage', 'for', 'every', 'subsequen



### Numericalization

#### torchtext

In [None]:
# vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], 
# min_freq=3) 
# vocab.insert_token('<unk>', 0)           
# vocab.insert_token('<eos>', 1)            
# vocab.set_default_index(vocab['<unk>'])   
# print(len(vocab))                         
# print(vocab.get_itos()[:10])  

#### hugging face

In [None]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
print(tokenizer.decode(ids))


[12786, 14763, 112, 188, 5587, 2580, 1106, 1216, 1126, 6102, 1115, 1122, 1110, 1112, 1662, 1106, 4929, 1117, 2933, 1112, 1115, 1104, 7647, 1107, 1652, 131, 1122, 1108, 1662, 1111, 1251, 1922, 4225, 1136, 1106, 1129, 4401, 1118, 1140, 119, 1799, 1175, 1108, 1309, 1330, 12786, 14763, 117, 2510, 11587, 1723, 1107, 1103, 7181, 1104, 2747, 5402, 1104, 1117, 1250, 131, 27900, 23915, 10279, 112, 188, 4517, 1111, 1103, 2869, 117, 14557, 1192, 112, 188, 26227, 23326, 1863, 117, 1105, 24563, 27762, 10415, 112, 188, 26906, 1113, 1103, 186, 11848, 3121, 10359, 1132, 170, 1374, 5136, 119, 3046, 14548, 117, 12786, 14763, 112, 188, 1250, 1107, 20892, 1103, 100, 1121, 8574, 1937, 1505, 1154, 107, 170, 3686, 1111, 3021, 15751, 15462, 3923, 107, 1383, 1103, 2016, 1111, 1451, 4194, 2432, 1107, 1103, 6453, 119]
Du Fu's popularity grew to such an extent that it is as hard to measure his influence as that of Shakespeare in England : it was hard for any Chinese poet not to be influenced by him. While there w

## Hugging Face for LM without intermediary steps
https://huggingface.co/course/chapter7/6?fw=pt

In [None]:
# directly without intermediary steps
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
text = ["this is a text.", "or so it seems"]
padded = tokenizer(text, max_length=4, truncation=True, return_length=True, return_overflowing_tokens=True)

print(padded)
print(padded.keys())
print([tokenizer.decode(x) for x in padded['input_ids']])

{'input_ids': [[101, 1142, 1110, 102], [101, 170, 3087, 102], [101, 119, 102], [101, 1137, 1177, 102], [101, 1122, 3093, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], 'length': [4, 4, 3, 4, 4], 'overflow_to_sample_mapping': [0, 0, 0, 1, 1]}
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'])
['[CLS] this is [SEP]', '[CLS] a text [SEP]', '[CLS]. [SEP]', '[CLS] or so [SEP]', '[CLS] it seems [SEP]']


In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator(padded['input_ids'])
for key in out:
    print(f"{key} shape: {out[key].shape}")

print(out['input_ids'], out['labels'])
# Shifting the inputs and labels to align them happens inside the model, so the data collator just copies the inputs to create the labels.

input_ids shape: torch.Size([5, 4])
labels shape: torch.Size([5, 4])
tensor([[ 101, 1142, 1110,  102],
        [ 101,  170, 3087,  102],
        [ 101,  119,  102,    0],
        [ 101, 1137, 1177,  102],
        [ 101, 1122, 3093,  102]]) tensor([[ 101, 1142, 1110,  102],
        [ 101,  170, 3087,  102],
        [ 101,  119,  102, -100],
        [ 101, 1137, 1177,  102],
        [ 101, 1122, 3093,  102]])


### Data loader
Concatenate all data into one large string of text and then chunk it into context length chunks
- https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf
- https://www.youtube.com/watch?v=ma1TrR7gE7I&t=273s

In [None]:
def get_data(dataset, vocab, batch_size):
    """Numericalize a tokenized dataset into a ``[batch_size, num_batches]`` LongTensor.

    Args:
        dataset: iterable of examples, each with a 'tokens' list of strings.
        vocab: mapping from token string to integer id; must contain '<eos>'.
        batch_size: number of rows of the returned tensor.

    Returns:
        LongTensor of shape ``[batch_size, num_batches]`` built from the ids of all
        non-empty examples, each followed by the '<eos>' id. Trailing ids that do
        not fill a complete column are dropped.
    """
    data = []
    for example in dataset:
        if example['tokens']:
            # Build a new list instead of `.append`-ing in place: the original mutated the
            # caller's example, so a second call on list-backed data appended '<eos>' again.
            tokens = [*example['tokens'], '<eos>']
            data.extend(vocab[token] for token in tokens)
    data = torch.tensor(data, dtype=torch.long)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches)
    return data

In [None]:
# batch_size = 1024
# train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
# valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
# test_data = get_data(tokenized_dataset['test'], vocab, batch_size)


In [None]:
# print(tokenized_dataset.shape)
# print(tokenized_dataset['train']['tokens'][88])

{'test': (4358, 1), 'train': (36718, 1), 'validation': (3760, 1)}
['this', 'ammunition', ',', 'and', 'that', 'which', 'i', 'brought', 'with', 'me', ',', 'was', 'rapidly', 'prepared', 'for', 'use', 'at', 'the', 'laboratory', 'established', 'at', 'the', 'little', 'rock', 'arsenal', 'for', 'that', 'purpose', '.', 'as', 'illustrating', 'as', 'the', 'pitiful', 'scarcity', 'of', 'material', 'in', 'the', 'country', ',', 'the', 'fact', 'may', 'be', 'stated', 'that', 'it', 'was', 'found', 'necessary', 'to', 'use', 'public', 'documents', 'of', 'the', 'state', 'library', 'for', 'cartridge', 'paper', '.', 'gunsmiths', 'were', 'employed', 'or', 'conscripted', ',', 'tools', 'purchased', 'or', 'impressed', ',', 'and', 'the', 'repair', 'of', 'the', 'damaged', 'guns', 'i', 'brought', 'with', 'me', 'and', 'about', 'an', 'equal', 'number', 'found', 'at', 'little', 'rock', 'commenced', 'at', 'once', '.', 'but', ',', 'after', 'inspecting', 'the', 'work', 'and', 'observing', 'the', 'spirit', 'of', 'the', 'm

## Language modeling dataset

Concatenate all the token ids into one long array, then cut inputs of `block_size` tokens out of it. The labels for each input are the same window shifted by one position.

In [None]:
print(shake)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4000
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 4000
    })
})


### Tokenization

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
def tokenize_function(examples:Dict[str, List[str]]) -> dict[str, List[List[int]]]:
    """Tokenize the "text" column of a batch without truncation or chunking.

    Uses the notebook-level `tokenizer`. `examples` is the mapping that
    `Dataset.map(..., batched=True)` passes in (column name -> list of values),
    not a list of dicts. When the tokenizer is a fast one, a "word_ids" entry
    is added with one list per tokenized row.

    NOTE: this definition replaces the earlier `tokenize_function` of this
    notebook, which additionally took a `cfg` argument.
    """
    # no max_length / truncation / overflow options: each row is tokenized whole
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        n_rows = len(result["input_ids"])
        result["word_ids"] = [result.word_ids(i) for i in range(n_rows)]
    return result

In [None]:

tokenized = shake.map(
    tokenize_function, batched=True,
    remove_columns=["text"],
    num_proc = 1
)

In [None]:
print(tokenized['train'], type(tokenized['train']))


Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 42185
}) <class 'datasets.arrow_dataset.Dataset'>


### Sentences concatenation

In [None]:
# NOTE: `all` shadows the Python builtin of the same name from this cell onwards.
# Concatenate the token ids of every split (train, test and valid alike) into one flat list.
all = []
for k,v in tokenized.items():
    print(k, v)
    for x in v['input_ids']:
        all += x
    # running total of ids collected so far
    print(len(all))
print(all[:15])

train Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 42185
})
237757
test Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 5335
})
268006
valid Dataset({
    features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'word_ids'],
    num_rows: 5314
})
298027
[3844, 11, 416, 262, 995, 11, 340, 318, 257, 22279, 88, 266, 24421, 26, 35]


### Batchify

In [None]:
def get_batch(data, batch_size, block_size):
    """Sample `batch_size` random windows of `block_size` ids and their next-token targets.

    `data` is a 1-D numpy array of token ids. Returns `(x, y)`, two int64 tensors
    of shape `[batch_size, block_size]`, where row `k` of `y` is the window of
    row `k` of `x` moved one position to the right.

    NOTE: this rebinds `get_batch`, replacing the earlier `(source, i)` version
    defined in the Wikitext section of this notebook.
    """
    # random start offsets; the upper bound leaves room for the shifted target window
    ix = torch.randint(len(data) - block_size, (batch_size,))

    def window(start, shift):
        lo = start + shift
        return torch.from_numpy((data[lo:lo + block_size]).astype(np.int64))

    x = torch.stack([window(i, 0) for i in ix])
    y = torch.stack([window(i, 1) for i in ix])
    return x, y

In [None]:
x, y = get_batch(np.array(all), 16, 10)
print(x.shape, y.shape)
print(x[0], y[0])
print(tokenizer.decode(x[0]), tokenizer.decode(y[0]))

torch.Size([16, 10]) torch.Size([16, 10])
tensor([  750, 16180,   290,  7898,   683,    11,    40,   423,   257,  3956]) tensor([16180,   290,  7898,   683,    11,    40,   423,   257,  3956,   318])
 did threaten and encourage him,I have a brother  threaten and encourage him,I have a brother is


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()