<a href="https://colab.research.google.com/github/samitha278/CoreLlama/blob/main/test_dataset_llama2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import torch
import torch.nn as nn

from datasets import load_dataset
from torch.utils.data import Dataset,DataLoader

In [67]:
from transformers import AutoTokenizer

## Dataset from Hugging Face

In [68]:
# Load the TinyStories dataset from the Hugging Face Hub by its repository id.
dataset = load_dataset("roneneldan/TinyStories")

In [69]:
# Show the available splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [70]:
# Select the training split and display its summary.
train_ds = dataset['train']
train_ds

Dataset({
    features: ['text'],
    num_rows: 2119719
})

In [71]:
# Select the validation split and display its summary.
val_ds = dataset['validation']
val_ds

Dataset({
    features: ['text'],
    num_rows: 21990
})

In [72]:
# Inspect the first training row: a dict whose 'text' key holds one story.
train_ds[0]

{'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'}

### Tokenizer - GPT-2

In [73]:
import tiktoken

In [74]:
# GPT-2 encoding from tiktoken; used in the next cell to tokenize one sample story.
tokenizer_gpt2 = tiktoken.get_encoding('gpt2')

In [75]:
# Encode the first training story with the GPT-2 encoding and report the
# token ids together with the resulting sequence length.
tokens_gpt2 = tokenizer_gpt2.encode(train_ds[0]['text'])
print(tokens_gpt2)
print(len(tokens_gpt2))

[3198, 1110, 11, 257, 1310, 2576, 3706, 20037, 1043, 257, 17598, 287, 607, 2119, 13, 1375, 2993, 340, 373, 2408, 284, 711, 351, 340, 780, 340, 373, 7786, 13, 20037, 2227, 284, 2648, 262, 17598, 351, 607, 1995, 11, 523, 673, 714, 34249, 257, 4936, 319, 607, 10147, 13, 198, 198, 43, 813, 1816, 284, 607, 1995, 290, 531, 11, 366, 29252, 11, 314, 1043, 428, 17598, 13, 1680, 345, 2648, 340, 351, 502, 290, 34249, 616, 10147, 1701, 2332, 1995, 13541, 290, 531, 11, 366, 5297, 11, 20037, 11, 356, 460, 2648, 262, 17598, 290, 4259, 534, 10147, 526, 198, 198, 41631, 11, 484, 4888, 262, 17598, 290, 384, 19103, 262, 4936, 319, 20037, 338, 10147, 13, 632, 373, 407, 2408, 329, 606, 780, 484, 547, 7373, 290, 5742, 1123, 584, 13, 2293, 484, 5201, 11, 20037, 26280, 607, 1995, 329, 7373, 262, 17598, 290, 18682, 607, 10147, 13, 1119, 1111, 2936, 3772, 780, 484, 550, 4888, 290, 3111, 1978, 13]
162


## Llama 2 tokenizer - community version

In [76]:
# Load the tokenizer published under the "NousResearch/Llama-2-7b-hf" repository.
# This `tokenizer` is the one passed to TextDataset further down.
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")

In [77]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

32000

In [78]:
# Encode the same first training story with the Llama 2 tokenizer (default
# encode() settings) and report the token ids and the sequence length.
tokens = tokenizer.encode(train_ds[0]['text'])
print(tokens)
print(len(tokens))

[1, 3118, 2462, 29892, 263, 2217, 7826, 4257, 365, 2354, 1476, 263, 817, 280, 297, 902, 5716, 29889, 2296, 6363, 372, 471, 5189, 304, 1708, 411, 372, 1363, 372, 471, 15301, 29889, 365, 2354, 5131, 304, 6232, 278, 817, 280, 411, 902, 16823, 29892, 577, 1183, 1033, 409, 29893, 263, 2826, 373, 902, 528, 2728, 29889, 13, 13, 29931, 2354, 3512, 304, 902, 16823, 322, 1497, 29892, 376, 29924, 290, 29892, 306, 1476, 445, 817, 280, 29889, 1815, 366, 6232, 372, 411, 592, 322, 409, 29893, 590, 528, 2728, 3026, 2439, 16823, 25156, 322, 1497, 29892, 376, 8241, 29892, 365, 2354, 29892, 591, 508, 6232, 278, 817, 280, 322, 2329, 596, 528, 2728, 1213, 13, 13, 29911, 12966, 29892, 896, 7258, 278, 817, 280, 322, 409, 8734, 278, 2826, 373, 365, 2354, 29915, 29879, 528, 2728, 29889, 739, 471, 451, 5189, 363, 963, 1363, 896, 892, 19383, 322, 19912, 1269, 916, 29889, 2860, 896, 7743, 29892, 365, 2354, 6452, 287, 902, 16823, 363, 19383, 278, 817, 280, 322, 27826, 902, 528, 2728, 29889, 2688, 1716, 7091, 9796,

```
Special Tokens:

<s> (BOS - Beginning of Sequence) - Marks the start of text
</s> (EOS - End of Sequence) - Marks the end of text
<unk> (Unknown) - Represents unknown/out-of-vocabulary words
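<pad> (Padding) - Used to pad sequences to the same length. Note: the Llama 2 vocabulary has no dedicated <pad> entry; the tokenizer loaded here pads with id 0 (see the trailing zeros in the batch output at the end of this notebook).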


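When tokenizing text with add_special_tokens=True:

text = "Hello world"
tokens = tokenizer.encode(text, add_special_tokens=True)
# Result: [1, 15043, 3186]
# Where: 1 = <s>, 15043 = "Hello", 3186 = "world"
# Note: by default the Llama tokenizer only prepends <s> (BOS). </s> (EOS, id 2)
# is appended only when the tokenizer is configured with add_eos_token=True.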


With add_special_tokens=False:

tokens = tokenizer.encode(text, add_special_tokens=False)
# Result: [15043, 3186]
# Just the actual words, no BOS/EOS tokens

```



## Dataset wrapper

In [79]:
class TextDataset(Dataset):
    """Map-style dataset producing fixed-length next-token-prediction pairs.

    Each item is built from one row of ``dataset``: the row's ``'text'`` field
    is tokenized, truncated or right-padded to exactly ``max_len`` tokens, and
    split into an input ``x`` (all tokens but the last) and a target ``y``
    (all tokens but the first). Both tensors therefore have length
    ``max_len - 1``.

    Args:
        dataset: Indexable collection whose items are mappings with a
            ``'text'`` key (e.g. a Hugging Face dataset split).
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
            returning a list of token ids, and optionally ``pad_token_id`` /
            ``eos_token_id`` attributes.
        max_len: Number of tokens kept per row before the input/target shift.
            Must be at least 2 so that ``x`` and ``y`` are non-empty.
        ignore_index: Optional label written into the padded positions of
            ``y`` (e.g. ``-100``, the default ``ignore_index`` of
            ``torch.nn.CrossEntropyLoss``) so padding does not contribute to
            the loss. ``None`` (default) leaves the pad id in ``y``.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """

    def __init__(self, dataset, tokenizer, max_len, ignore_index=None):
        if max_len < 2:
            raise ValueError(
                f"max_len must be >= 2 to build shifted (x, y) pairs, got {max_len}"
            )

        self.dataset = dataset

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.ignore_index = ignore_index

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def _pad_id(self):
        """Return the id used for padding, falling back to EOS when no pad token is set."""
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            # Many causal-LM tokenizers ship without a pad token; reusing EOS
            # is the usual substitute.
            pad_id = getattr(self.tokenizer, 'eos_token_id', None)
        if pad_id is None:
            raise ValueError(
                "tokenizer defines neither pad_token_id nor eos_token_id; "
                "set tokenizer.pad_token before padding sequences"
            )
        return pad_id

    def __getitem__(self, idx):
        """Return the ``(x, y)`` LongTensor pair for row ``idx``.

        Raises:
            ValueError: If the row needs padding but the tokenizer provides
                neither a pad nor an EOS token id.
        """
        text = self.dataset[idx]['text']

        # tokenize
        tokens = self.tokenizer.encode(text, add_special_tokens=True)

        # number of real (non-pad) tokens kept, and how many pads are needed
        n_real = min(len(tokens), self.max_len)
        n_pad = self.max_len - n_real

        # truncate long rows, right-pad short ones
        tokens = tokens[:self.max_len]
        if n_pad > 0:
            tokens = tokens + [self._pad_id()] * n_pad

        # convert to tensor; explicit dtype keeps ids integral in every case
        tokens = torch.tensor(tokens, dtype=torch.long)
        x = tokens[:-1]
        y = tokens[1:]

        if self.ignore_index is not None:
            # y[i] == tokens[i + 1], so targets from index n_real - 1 onwards
            # are padding. Mask by position (not by value) because the pad id
            # may coincide with a real token id. Clone first: y is a view that
            # shares storage with x.
            y = y.clone()
            y[max(n_real - 1, 0):] = self.ignore_index

        return x, y

## Data Loader

In [80]:
# Tokens kept per story before the input/target shift; TextDataset slices off
# one token on each side, so x and y each have length max_len - 1.
max_len = 512

In [81]:
# Hugging Face dataset: re-bind the raw splits (rows are dicts with a 'text' field).
train_ds= dataset['train']
val_ds = dataset['validation']

In [82]:
# Dataset wrapper
# Wrap the raw Hugging Face splits directly rather than rebinding
# `train_ds`/`val_ds` onto themselves: this keeps the cell idempotent, so
# re-running it never wraps an already-wrapped TextDataset.

train_ds = TextDataset(dataset['train'], tokenizer, max_len)
val_ds = TextDataset(dataset['validation'], tokenizer, max_len)

In [83]:
# Each batch is built by indexing the dataset 4 times (batch_size) and stacking
# the returned (x, y) tensors along a new leading batch dimension.
train_loader = DataLoader(dataset=train_ds, batch_size=4, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=4, shuffle=True)

In [84]:
# Pull a single batch to eyeball the output: a pair of stacked tensors (inputs, targets).
next(iter(train_loader))

[tensor([[    1,   612, 18358,  ...,     0,     0,     0],
         [    1,  9038,  2501,  ...,     0,     0,     0],
         [    1,  9038,  2501,  ...,     0,     0,     0],
         [    1,  9038,   727,  ...,     0,     0,     0]]),
 tensor([[  612, 18358, 29892,  ...,     0,     0,     0],
         [ 9038,  2501,   263,  ...,     0,     0,     0],
         [ 9038,  2501,   263,  ...,     0,     0,     0],
         [ 9038,   727,   471,  ...,     0,     0,     0]])]