## How to build the language model dataset

## Text8 dataset

text8 dataset is a long string. we should use text8 to build a langauge model task.

In [1]:
import pandas as pd 

with open("../data/text8", "r", encoding="utf-8") as f:
    data = f.read()
len(data)

100000000

In [2]:
# get the train val and test dataset
num_test_chars = 5000000
train_data = data[: -2 * num_test_chars]
valid_data = data[-2 * num_test_chars : -num_test_chars]
test_data = data[-num_test_chars:]

In [50]:
# assume the data is a long string which has to be divied into batches containing examples of
from math import ceil
from tqdm import tqdm
def _split_example(data, *, number_of_splits=None, length_of_each_split=None):
    """
    Divides the string into n parts. If it's not possible to divide the string into n EQUAL parts,
    then nth part will be smaller than the first n-1 parts(which will all be of equal lengths)
    :param data: The string to be divided into n parts
    :param number_of_splits: The number of parts to divide the string into
    """
    if not number_of_splits and not length_of_each_split:
        raise ValueError(
            "At least one of the two keyword arguments must be provided"
        )
    split_length = length_of_each_split or ceil(len(data) / number_of_splits)
    num_splits = number_of_splits or ceil(len(data) / split_length)
    return [
        data[i * split_length : min(len(data), (i + 1) * split_length)]
        for i in range(num_splits)
    ]
x = _split_example(train_data, number_of_splits=128)

In [28]:
print(len(x))
print(len(x[0]))

128
703125


In [34]:
# build vocab
import string
string.ascii_lowercase + " "

END_OF_SENTENCE_TOKEN = "<EOS>"
OUT_OF_VOCAB_TOKEN = "<OOV>"
PADDING_TOKEN = "<PAD>"
vocab = [END_OF_SENTENCE_TOKEN,OUT_OF_VOCAB_TOKEN,PADDING_TOKEN]
vocab = vocab + list(set(string.ascii_lowercase + " "))
_vocab = {token: id for id, token in enumerate(vocab)}
_inverse_vocab = {id:token for id, token in enumerate(vocab)}

def token2id(token):
    return _vocab.get(token,_vocab[OUT_OF_VOCAB_TOKEN])
def id2token(_id):
    return _inverse_vocab.get(_id, OUT_OF_VOCAB_TOKEN)
def map_tokens_to_ids(tokens):
    return [token2id(token) for token in tokens]
def tokenize(data: list):
    """Maps characters to integers"""
    return [map_tokens_to_ids(tokens) for tokens in tqdm(data)]

In [35]:
x_tokenize =  tokenize(x)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 128/128 [00:15<00:00,  8.10it/s]


In [37]:
len(x_tokenize)

128

In [44]:
# create the target sequence 
def create_target_sequences(data: list):
    return [x[1:] + [token2id(END_OF_SENTENCE_TOKEN)] for x in data]

In [46]:
y = create_target_sequences(x_tokenize)
len(y)

128

In [51]:
# now x has a single batch each batch in x is very long, so we need to split the exmple 
# into n parts. 
sequence_length = 100

x_tokenize_split = [
    _split_example(example, length_of_each_split=sequence_length)
    for example in tqdm(x_tokenize)
]


100%|██████████| 128/128 [00:06<00:00, 20.51it/s]


In [53]:
y = [
    _split_example(example, length_of_each_split=sequence_length)
        for example in tqdm(y)
    ]

100%|██████████| 128/128 [00:00<00:00, 2990.51it/s]


In [65]:
len(y[0])
len(y[67])

71

In [66]:
# build the train dataset
batch_size = 128
max_number_of_splits = max((len(i) for i in x_tokenize_split))

print(max_number_of_splits)
# X_train = []
# y_train = []
# lengths = []
# for i in range(max_number_of_splits):
#     for j in range(batch_size):
#         if i < len(x_tokenize_split[j]):
#             X_train.append(x_tokenize_split[j][i])
           
#             y_train.append(y[j][i])
#             lengths.append(len(x_tokenize_split[j][i]))

7032


In [None]:
for i in x_tokenize_split:
    print(i)