In [39]:
import time

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from sentencepiece import SentencePieceProcessor
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

In [40]:
# Path of the SentencePiece model file that is loaded into the tokenizer below.
tokenizer_path = "tokenizer.model"
# Create an empty processor, then load the model file into it.
tokenizer = SentencePieceProcessor()
tokenizer.load(tokenizer_path)
# Vocabulary size as reported by the loaded tokenizer.
vocab_size = tokenizer.vocab_size()

In [36]:
file_path = 'data/TinyStories-valid.txt'

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Drop the lines that consist solely of the end-of-text marker, then glue the
# remaining lines back together into one long string (`lines` ends up a str).
string_to_remove = '<|endoftext|>\n'
result_list = [x for x in lines if x != string_to_remove]
lines = "".join(result_list)


In [37]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
vocab = sorted(set(lines))
# index -> character
itos = dict(enumerate(vocab))
# character -> index (inverse of `itos`)
stoi = {ch: idx for idx, ch in itos.items()}


In [47]:
MASTER_CONFIG = {
    # The corpus is encoded with the SentencePiece tokenizer, so the model must be
    # sized for the tokenizer's vocabulary, not for the character-level `vocab`
    # (token ids far exceed the number of distinct characters).
    "vocab_size": tokenizer.vocab_size(),
}


In [49]:
# Encode the whole corpus with the tokenizer and hold the ids in a 1-D tensor.
dataset = torch.tensor(tokenizer.encode(lines))
# Display the number of tokens.
dataset.size()

torch.Size([5085262])

In [61]:
## Data Loader:
from torch.utils.data import Dataset
class StoryDataset(Dataset):
    """Random-window dataset over a tokenized text corpus.

    The raw text is encoded once with the supplied tokenizer. The resulting
    token ids are cut into three contiguous splits: train (first 80%),
    validation (next 10%) and test (last 10%). ``set_split`` selects which
    split is served; the train split is active after construction.

    Note: ``__getitem__`` ignores its index and returns a whole randomly
    sampled batch, while ``__len__`` reports the number of *tokens* in the
    active split.
    """

    def __init__(self, data, tokenizer, context_window=100, batch_size=32):
        """
        Args:
            data (str): the raw corpus text
            tokenizer: object whose ``encode(str)`` returns a list of int token ids
            context_window (int): number of tokens in each sampled sequence
            batch_size (int): number of sequences returned by each ``__getitem__`` call
        """
        self._tokenizer = tokenizer
        self.context_window = context_window
        self.batch_size = batch_size

        # Encode the whole corpus once; every split is a slice of this tensor.
        self.data = torch.tensor(self._tokenizer.encode(data))

        n_tokens = len(self.data)
        train_end = int(.8 * n_tokens)
        val_end = int(.9 * n_tokens)

        self.train_data = self.data[:train_end]
        self.train_size = len(self.train_data)

        self.val_data = self.data[train_end:val_end]
        self.val_size = len(self.val_data)

        self.test_data = self.data[val_end:]
        self.test_size = len(self.test_data)

        # split name -> (token tensor, number of tokens)
        self._lookup_dict = {'train': (self.train_data, self.train_size),
                             'val': (self.val_data, self.val_size),
                             'test': (self.test_data, self.test_size)}
        self.set_split('train')

    @classmethod
    def load_dataset_and_tokenizer(cls, data_path, tokenizer_path,
                                   context_window=100, batch_size=32):
        """Read the corpus from disk, load the tokenizer and build the dataset.

        Args:
            data_path (str): location of the text corpus
            tokenizer_path (str): location of the SentencePiece model file
            context_window (int): forwarded to the constructor
            batch_size (int): forwarded to the constructor
        Returns:
            an instance of StoryDataset
        """
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding.
        with open(data_path, 'r', encoding='utf-8') as file:
            raw_lines = file.readlines()

        # Drop the lines that consist solely of the end-of-text marker.
        string_to_remove = '<|endoftext|>\n'
        text = "".join(line for line in raw_lines if line != string_to_remove)

        # load the tokenizer
        tokenizer = SentencePieceProcessor()
        tokenizer.load(tokenizer_path)

        return cls(text, tokenizer, context_window=context_window, batch_size=batch_size)

    def get_tokenizer(self):
        """ returns the tokenizer used to encode the corpus """
        return self._tokenizer

    def set_split(self, split="train"):
        """ selects which split subsequent ``len()`` / indexing calls operate on
        Args:
            split (str): one of "train", "val", or "test"
        Raises:
            KeyError: if ``split`` is not one of the three known names
        """
        self._target_split = split
        self._target_data, self._target_size = self._lookup_dict[split]

    def __len__(self):
        """ number of tokens in the active split """
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        Args:
            index (int): accepted for API compatibility but ignored; every call
                draws fresh random windows from the active split
        Returns:
            a tuple ``(x, y)`` of int64 tensors, each of shape
            ``(batch_size, context_window)``; ``y`` is ``x`` shifted one token
            to the right (the next-token targets)
        Raises:
            ValueError: if the active split has fewer than
                ``context_window + 1`` tokens
        """
        n_tokens = self._target_data.size(0)
        # A window starting at i reads tokens i .. i + context_window (the extra
        # one is the last target), so valid starts are 0 .. n_tokens - context_window - 1.
        num_starts = n_tokens - self.context_window
        if num_starts < 1:
            raise ValueError(
                f"split '{self._target_split}' has {n_tokens} tokens, but at least "
                f"{self.context_window + 1} are needed for context_window={self.context_window}"
            )

        # pick random starting points (torch.randint's upper bound is exclusive)
        starts = torch.randint(0, num_starts, (self.batch_size,)).tolist()
        x = torch.stack([self._target_data[i:i + self.context_window] for i in starts]).long()
        y = torch.stack([self._target_data[i + 1:i + self.context_window + 1] for i in starts]).long()
        return x, y

    def get_num_batches(self):
        """Number of ``batch_size``-sized groups that fit in ``len(self)``
        Returns:
            ``len(self) // self.batch_size`` for the active split
        """
        return len(self) // self.batch_size

In [62]:
#generate batches:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
    ensure each tensor is on the right device location.

    Args:
        dataset: a map-style dataset accepted by ``DataLoader``
        batch_size (int): number of dataset items collated into each batch
        shuffle (bool): forwarded to ``DataLoader``
        drop_last (bool): forwarded to ``DataLoader``
        device: target passed to ``Tensor.to`` for every tensor in the batch
    Yields:
        the collated batch with every tensor moved to ``device``: a dict when
        the dataset items are dicts, a tuple when the items are tuples/lists
        (e.g. the ``(x, y)`` pairs returned by ``StoryDataset.__getitem__``),
        otherwise the single collated tensor
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    for batch in dataloader:
        if hasattr(batch, "items"):
            # dict-style items: keep the keys, move each value
            yield {name: tensor.to(device) for name, tensor in batch.items()}
        elif isinstance(batch, (list, tuple)):
            # tuple/list items are collated field by field into a sequence of tensors
            yield tuple(tensor.to(device) for tensor in batch)
        else:
            yield batch.to(device)

In [63]:
# Build the dataset from the raw text file and the tokenizer model file.
# This rebinds `dataset` to a StoryDataset instance.
dataset = StoryDataset.load_dataset_and_tokenizer(
    data_path=file_path,
    tokenizer_path=tokenizer_path,
)

In [83]:
# Draw one (x, y) batch; the index is not used by StoryDataset.__getitem__.
out = dataset[0]

In [89]:
# First element of the (x, y) pair returned by the dataset: the input token ids.
out[0]

tensor([[  526,   366, 20413,  ...,   322,  1494,   372],
        [ 6788,   322,   278,  ...,  4111,   526,  3252],
        [  278, 12580, 29880,  ..., 22804,   471,   263],
        ...,
        [29889,  2688,   437,  ...,   940,   947,   451],
        [  367, 26230, 29889,  ...,   304,  1708,   411],
        [  508,  2125,   372,  ...,   472,  1009,  1856]])