# Text datasets

In [2]:
#| default_exp text.datasets

In [3]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

In [34]:
#| export

# torch
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.optim import SGD
import torchtext
from torchtext.vocab import vocab

# hf
import datasets
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# data 
import pandas as pd
import numpy as np

# ui
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

# python
from typing import Dict, List, Tuple, Optional, Set
from collections import Counter, OrderedDict
from dataclasses import dataclass, asdict
from plum import dispatch

# nimrod
from nimrod.models.lm import Vocab

## Wikitext-2

### Data source from Hugging Face

In [5]:
# WikiText-2 (raw variant) from the Hugging Face hub.
dataset = load_dataset(path='wikitext', name='wikitext-2-raw-v1')

In [10]:
# Number of splits, container type and split names, then one training example.
print(f"{len(dataset)} {type(dataset)} {dataset.keys()}")
print(f"{len(dataset['train'])} {dataset['train'][88]}")

3 <class 'datasets.dataset_dict.DatasetDict'> dict_keys(['test', 'train', 'validation'])
36718 {'text': ' This ammunition , and that which I brought with me , was rapidly prepared for use at the Laboratory established at the Little Rock Arsenal for that purpose . As illustrating as the pitiful scarcity of material in the country , the fact may be stated that it was found necessary to use public documents of the State Library for cartridge paper . Gunsmiths were employed or conscripted , tools purchased or impressed , and the repair of the damaged guns I brought with me and about an equal number found at Little Rock commenced at once . But , after inspecting the work and observing the spirit of the men I decided that a garrison 500 strong could hold out against Fitch and that I would lead the remainder - about 1500 - to Gen \'l Rust as soon as shotguns and rifles could be obtained from Little Rock instead of pikes and lances , with which most of them were armed . Two days elapsed before

### Word-based tokenization 

#### torchtext tokenizer

In [None]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')


def tokenize_data(example, tokenizer):
    """Run `tokenizer` on the example's 'text' field and return it under 'tokens'."""
    return {'tokens': tokenizer(example['text'])}


# Replace the raw 'text' column with a 'tokens' column in every split.
tokenized_dataset = dataset.map(
    tokenize_data,
    remove_columns=['text'],
    fn_kwargs={'tokenizer': tokenizer},
)
print(tokenized_dataset['train'][88]['tokens'])

['this', 'ammunition', ',', 'and', 'that', 'which', 'i', 'brought', 'with', 'me', ',', 'was', 'rapidly', 'prepared', 'for', 'use', 'at', 'the', 'laboratory', 'established', 'at', 'the', 'little', 'rock', 'arsenal', 'for', 'that', 'purpose', '.', 'as', 'illustrating', 'as', 'the', 'pitiful', 'scarcity', 'of', 'material', 'in', 'the', 'country', ',', 'the', 'fact', 'may', 'be', 'stated', 'that', 'it', 'was', 'found', 'necessary', 'to', 'use', 'public', 'documents', 'of', 'the', 'state', 'library', 'for', 'cartridge', 'paper', '.', 'gunsmiths', 'were', 'employed', 'or', 'conscripted', ',', 'tools', 'purchased', 'or', 'impressed', ',', 'and', 'the', 'repair', 'of', 'the', 'damaged', 'guns', 'i', 'brought', 'with', 'me', 'and', 'about', 'an', 'equal', 'number', 'found', 'at', 'little', 'rock', 'commenced', 'at', 'once', '.', 'but', ',', 'after', 'inspecting', 'the', 'work', 'and', 'observing', 'the', 'spirit', 'of', 'the', 'men', 'i', 'decided', 'that', 'a', 'garrison', '500', 'strong', 'co

#### hugging face tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# Same training example as above, this time split into sub-word pieces.
sample_text = dataset['train'][88]['text']
tokens = tokenizer.tokenize(sample_text)
print(tokens)

['This', 'ammunition', ',', 'and', 'that', 'which', 'I', 'brought', 'with', 'me', ',', 'was', 'rapidly', 'prepared', 'for', 'use', 'at', 'the', 'Laboratory', 'established', 'at', 'the', 'Little', 'Rock', 'Arsenal', 'for', 'that', 'purpose', '.', 'As', 'ill', '##ust', '##rating', 'as', 'the', 'pit', '##iful', 'scar', '##city', 'of', 'material', 'in', 'the', 'country', ',', 'the', 'fact', 'may', 'be', 'stated', 'that', 'it', 'was', 'found', 'necessary', 'to', 'use', 'public', 'documents', 'of', 'the', 'State', 'Library', 'for', 'cartridge', 'paper', '.', 'Guns', '##mith', '##s', 'were', 'employed', 'or', 'con', '##s', '##cript', '##ed', ',', 'tools', 'purchased', 'or', 'impressed', ',', 'and', 'the', 'repair', 'of', 'the', 'damaged', 'guns', 'I', 'brought', 'with', 'me', 'and', 'about', 'an', 'equal', 'number', 'found', 'at', 'Little', 'Rock', 'commenced', 'at', 'once', '.', 'But', ',', 'after', 'inspect', '##ing', 'the', 'work', 'and', 'observing', 'the', 'spirit', 'of', 'the', 'men', '

### Numericalization

#### torchtext

In [None]:
# Build the vocabulary from the training tokens only, passing min_freq=3 as the
# frequency cut-off for keeping a token.
# NOTE(review): this rebinds `vocab`, shadowing the `vocab` factory imported
# from torchtext.vocab in the import cell.
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], 
min_freq=3) 
# Reserve ids 0 and 1 for the special tokens.
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
# Lookups of tokens that are not in the vocabulary resolve to the '<unk>' id.
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])  

29473
['<unk>', '<eos>', 'the', ',', '.', 'of', 'and', 'in', 'to', 'a']


#### hugging face

In [None]:
# Map the sub-word tokens to vocabulary ids, then decode them back to text.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids, tokenizer.decode(ids), sep='\n')


[1188, 9448, 117, 1105, 1115, 1134, 146, 1814, 1114, 1143, 117, 1108, 5223, 4029, 1111, 1329, 1120, 1103, 8891, 1628, 1120, 1103, 2743, 2977, 10503, 1111, 1115, 3007, 119, 1249, 5178, 8954, 7969, 1112, 1103, 7172, 17126, 14161, 9041, 1104, 2578, 1107, 1103, 1583, 117, 1103, 1864, 1336, 1129, 2202, 1115, 1122, 1108, 1276, 3238, 1106, 1329, 1470, 4961, 1104, 1103, 1426, 3371, 1111, 16542, 2526, 119, 18270, 17740, 1116, 1127, 4071, 1137, 14255, 1116, 13590, 1174, 117, 5537, 3310, 1137, 7351, 117, 1105, 1103, 6949, 1104, 1103, 4938, 3832, 146, 1814, 1114, 1143, 1105, 1164, 1126, 4463, 1295, 1276, 1120, 2743, 2977, 8042, 1120, 1517, 119, 1252, 117, 1170, 25151, 1158, 1103, 1250, 1105, 15639, 1103, 4840, 1104, 1103, 1441, 146, 1879, 1115, 170, 10609, 2260, 2012, 1180, 2080, 1149, 1222, 17355, 6943, 1105, 1115, 146, 1156, 1730, 1103, 6311, 118, 1164, 10204, 118, 1106, 9198, 112, 181, 155, 8954, 1112, 1770, 1112, 15210, 1116, 1105, 12385, 1180, 1129, 3836, 1121, 2743, 2977, 1939, 1104, 185, 13

## Hugging Face for LM without intermediary steps
https://huggingface.co/course/chapter7/6?fw=pt

In [None]:
# Text -> ids in one call: each input is cut into chunks of at most
# max_length ids, and the overflow is returned as extra chunks.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
text = ["this is a text.", "or so it seems"]
padded = tokenizer(
    text,
    max_length=4,
    truncation=True,
    return_length=True,
    return_overflowing_tokens=True,
)

print(padded)
print(padded.keys())
print(list(map(tokenizer.decode, padded['input_ids'])))

{'input_ids': [[101, 1142, 1110, 102], [101, 170, 3087, 102], [101, 119, 102], [101, 1137, 1177, 102], [101, 1122, 3093, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], 'length': [4, 4, 3, 4, 4], 'overflow_to_sample_mapping': [0, 0, 0, 1, 1]}
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'])
['[CLS] this is [SEP]', '[CLS] a text [SEP]', '[CLS]. [SEP]', '[CLS] or so [SEP]', '[CLS] it seems [SEP]']


In [None]:
# Causal-LM collation (mlm=False): the collator pads the batch and copies the
# inputs to create the labels. Shifting the inputs and labels to align them
# happens inside the model.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator(padded['input_ids'])
for name, tensor in out.items():
    print(f"{name} shape: {tensor.shape}")

print(out['input_ids'], out['labels'])

input_ids shape: torch.Size([5, 4])
labels shape: torch.Size([5, 4])
tensor([[ 101, 1142, 1110,  102],
        [ 101,  170, 3087,  102],
        [ 101,  119,  102,    0],
        [ 101, 1137, 1177,  102],
        [ 101, 1122, 3093,  102]]) tensor([[ 101, 1142, 1110,  102],
        [ 101,  170, 3087,  102],
        [ 101,  119,  102, -100],
        [ 101, 1137, 1177,  102],
        [ 101, 1122, 3093,  102]])


### Data loader
Concatenate all the data into one long sequence of token ids, then reshape it into `batch_size` equal-length rows (dropping the remainder) from which context-length chunks can be taken
- https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf
- https://www.youtube.com/watch?v=ma1TrR7gE7I&t=273s

In [None]:
def get_data(dataset, vocab, batch_size):
    """Flatten a tokenized dataset into a ``(batch_size, num_batches)`` tensor of ids.

    Every non-empty example contributes its tokens followed by an ``'<eos>'``
    marker; examples with no tokens are skipped. The resulting id stream is
    truncated to a multiple of ``batch_size`` and reshaped row-major.

    Args:
        dataset: iterable of examples, each a mapping with a ``'tokens'`` list.
        vocab: token-to-id lookup, indexed as ``vocab[token]``.
        batch_size: number of rows of the returned tensor.

    Returns:
        A ``torch.long`` tensor of shape ``(batch_size, total_ids // batch_size)``.
    """
    data = []
    for example in dataset:
        tokens = example['tokens']
        if tokens:
            # Build a new sequence instead of calling `.append` on the example's
            # own list: `append` returns None and would mutate the caller's data.
            data.extend(vocab[token] for token in [*tokens, '<eos>'])
    data = torch.tensor(data, dtype=torch.long)
    num_batches = data.shape[0] // batch_size
    # Drop the tail that does not fill a complete column.
    data = data[:num_batches * batch_size]
    return data.view(batch_size, num_batches)

In [None]:
batch_size = 1024
# One (batch_size, num_batches) id tensor per split.
train_data, valid_data, test_data = (
    get_data(tokenized_dataset[split], vocab, batch_size)
    for split in ('train', 'validation', 'test')
)


In [None]:
print(tokenized_dataset.shape)
# Index the row first, then the column: `['tokens'][88]` would load the whole
# 'tokens' column just to read a single example.
print(tokenized_dataset['train'][88]['tokens'])

{'test': (4358, 1), 'train': (36718, 1), 'validation': (3760, 1)}
['this', 'ammunition', ',', 'and', 'that', 'which', 'i', 'brought', 'with', 'me', ',', 'was', 'rapidly', 'prepared', 'for', 'use', 'at', 'the', 'laboratory', 'established', 'at', 'the', 'little', 'rock', 'arsenal', 'for', 'that', 'purpose', '.', 'as', 'illustrating', 'as', 'the', 'pitiful', 'scarcity', 'of', 'material', 'in', 'the', 'country', ',', 'the', 'fact', 'may', 'be', 'stated', 'that', 'it', 'was', 'found', 'necessary', 'to', 'use', 'public', 'documents', 'of', 'the', 'state', 'library', 'for', 'cartridge', 'paper', '.', 'gunsmiths', 'were', 'employed', 'or', 'conscripted', ',', 'tools', 'purchased', 'or', 'impressed', ',', 'and', 'the', 'repair', 'of', 'the', 'damaged', 'guns', 'i', 'brought', 'with', 'me', 'and', 'about', 'an', 'equal', 'number', 'found', 'at', 'little', 'rock', 'commenced', 'at', 'once', '.', 'but', ',', 'after', 'inspecting', 'the', 'work', 'and', 'observing', 'the', 'spirit', 'of', 'the', 'm

## Create custom dataset for HF
https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt

### Load text file

In [28]:
# Load a local plain-text file with the generic "text" builder. No split is
# requested, so everything lands in a single 'train' split (see the printed
# DatasetDict).
# NOTE(review): this rebinds `dataset`, which earlier held WikiText-2.
dataset = load_dataset("text", data_files="../data/text/tiny_shakespeare.txt") #, split=['train','dev','test'])
print(dataset)
# Keep a handle on the whole corpus so it can be re-split below.
full = dataset['train']

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 40000
    })
})


In [39]:
# Fixed seed so the 80/10/10 split, and every output derived from it, is
# reproducible after a kernel restart.
SPLIT_SEED = 42
train_test = full.train_test_split(train_size=0.8, seed=SPLIT_SEED)
# Split the held-out 20% in half: one half for validation, one for test.
test_valid = train_test['test'].train_test_split(train_size=0.5, seed=SPLIT_SEED)
shake = DatasetDict({
    'train': train_test['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
print(shake)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4000
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 4000
    })
})


In [40]:
# Peek at the first raw example of the test split.
shake['test'][0]

{'text': 'How doth the prince, and my young son of York?'}

### Tokenize / Numericalize

#### Tokenize single element

In [72]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
print("vocab size: ", len(tokenizer))

num_examples = 3
context_length = 10

# Work on a single line of text from the test split.
sample_text = shake['test'][0]['text']
tokens = tokenizer.tokenize(sample_text)
print(tokens)

# A line longer than context_length comes back as several chunks.
padded = tokenizer(
    sample_text,
    max_length=context_length,
    truncation=True,
    return_length=True,
    return_overflowing_tokens=True,
)
print(padded)
print(list(map(tokenizer.decode, padded['input_ids'])))

vocab size:  28996
['How', 'dot', '##h', 'the', 'prince', ',', 'and', 'my', 'young', 'son', 'of', 'York', '?']
{'input_ids': [[101, 1731, 15645, 1324, 1103, 6927, 117, 1105, 1139, 102], [101, 1685, 1488, 1104, 1365, 136, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]], 'length': [10, 7], 'overflow_to_sample_mapping': [0, 0]}
['[CLS] How doth the prince, and my [SEP]', '[CLS] young son of York? [SEP]']


#### Tokenize whole dataset

In [60]:
# tokenize the whole dataset
def tokenize_function(examples):
    """Tokenize a batch of texts into chunks of at most ``context_length`` ids.

    Uses the notebook-level ``tokenizer`` and ``context_length``. Overflowing
    tokens are returned as additional chunks, so the result can hold more rows
    than ``examples["text"]``. With a fast tokenizer, a ``"word_ids"`` entry is
    added for every chunk.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=context_length,
        truncation=True,
        return_length=True,
        return_overflowing_tokens=True,
    )
    if not tokenizer.is_fast:
        return encoded
    num_chunks = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(num_chunks)))
    return encoded

In [68]:
# batched=True lets tokenize_function return more rows than it received (long
# lines overflow into extra chunks). The original "text" column is dropped
# because it no longer lines up one-to-one with the output rows.
shake_toked = shake.map(
    tokenize_function, batched=True,
    remove_columns=["text"]
)

In [69]:
# First tokenized chunk of the test split, then its ids decoded one at a time.
first_chunk = shake_toked['test'][0]
print(first_chunk)
# Uses `shake_toked` (the dataset created above); the previous `shake_tok`
# was never defined and raised NameError on a fresh kernel.
print([tokenizer.decode(x) for x in first_chunk['input_ids']])

{'input_ids': [101, 1731, 15645, 1324, 1103, 6927, 117, 1105, 1139, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'length': 10, 'overflow_to_sample_mapping': 0, 'word_ids': [None, 0, 1, 1, 2, 3, 4, 5, 6, None]}
['[CLS]', 'How', 'dot', '##h', 'the', 'prince', ',', 'and', 'my', '[SEP]']


### Collate data into batches

In [81]:
# Show the tokenizer's padding and end-of-sequence tokens.
# NOTE(review): the recorded output shows eos_token == '[PAD]', which looks
# like leftover kernel state from a later cell; confirm on a fresh kernel.
print(tokenizer.pad_token, tokenizer.eos_token)


[PAD] [PAD]


In [82]:
# Only fall back to the EOS token when the tokenizer has no padding token.
# bert-base-cased already has '[PAD]' and, by default, no EOS token, so an
# unconditional assignment would replace a valid pad token with an unset one
# and break padding in the collator.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# out = data_collator([shake_toked["test"][i] for i in range(5)])
# for key in out:
#     print(f"{key} shape: {out[key].shape}")

# out = data_collator(padded['input_ids'])
# for key in out:
#     print(f"{key} shape: {out[key].shape}")

# print(out['input_ids'], out['labels'])


In [101]:
# Slice the first four rows once instead of re-reading the whole 'input_ids'
# column for every index.
out = data_collator(shake_toked['test'][:4]['input_ids'])
print(out)

{'input_ids': tensor([[  101,  1731, 15645,  1324,  1103,  6927,   117,  1105,  1139,   102],
        [  101,  1685,  1488,  1104,  1365,   136,   102,     0,     0,     0],
        [  101,   102,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,   152,   117,  1519,  1143, 13805,  1196,   146,  1321,   102]]), 'labels': tensor([[  101,  1731, 15645,  1324,  1103,  6927,   117,  1105,  1139,   102],
        [  101,  1685,  1488,  1104,  1365,   136,   102,  -100,  -100,  -100],
        [  101,   102,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [  101,   152,   117,  1519,  1143, 13805,  1196,   146,  1321,   102]])}


In [103]:
# Decode each collated row id by id so the padding positions are visible.
for row_idx in range(4):
    row = out['input_ids'][row_idx]
    print(list(map(tokenizer.decode, row)))


['[CLS]', 'How', 'dot', '##h', 'the', 'prince', ',', 'and', 'my', '[SEP]']
['[CLS]', 'young', 'son', 'of', 'York', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]']
['[CLS]', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[CLS]', 'O', ',', 'let', 'me', 'pray', 'before', 'I', 'take', '[SEP]']


In [None]:
#| hide
import nbdev

nbdev.nbdev_export()