In [59]:
import numpy as np

class TextDataloader:
    def __init__(self, dataset, max_seq_len, batch_size, shuffle=True):
        self.dataset = dataset
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.dataset_len = len(dataset)
        
        # shuffle logic
        self.shuffle = shuffle
        self.chunk_len = max_seq_len * batch_size
        num_batches = math.ceil(self.dataset_len/self.chunk_len)
        self.batch_order = np.array(range(num_batches))
        
        if shuffle:
            np.random.shuffle(self.batch_order)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        if self.index > len(self.batch_order) - 1:
            raise StopIteration

        i = self.batch_order[self.index]
        chunk_pos = i * self.chunk_len
        data = self.dataset[chunk_pos: chunk_pos + self.chunk_len]
        target = self.dataset[(chunk_pos) + 1: (chunk_pos + self.chunk_len) + 1]
        if(len(data) != len(target)):
            # remove mismatched batch sizes
            data = data.narrow(0, 0, self.max_seq_len * (self.batch_size - 1))
            target = target.narrow(0, 0, self.max_seq_len * (self.batch_size - 1))

        self.index += 1

        return self.batchify(data, target)

    
    def batchify(self, data, target):
        # Evenly divide the data across the batch_size batches.
        data = data.view(self.batch_size, -1).contiguous()
        target = target.view(self.batch_size, -1).contiguous()

        # shuffle data
        if self.shuffle:
            permutation = torch.randperm(data.size(0))
            data = data[permutation]
            target = target[permutation]
       
        # flatten targets
        target = target.reshape(-1)
        return data, target.reshape(-1)

In [61]:
import torch
import math
import numpy as np

length = 300
batch_size = 5
max_seq_len = 10

dataset = torch.arange(0, length)
dataloader = TextDataloader(dataset, max_seq_len, batch_size)
for batch in dataloader:
    data, targets = batch
    print("Data/Target Shapes", data.shape, targets.shape)
    print(data)
    print(targets)
    # break


Data/Target Shapes torch.Size([5, 8]) torch.Size([40])
tensor([[266, 267, 268, 269, 270, 271, 272, 273],
        [282, 283, 284, 285, 286, 287, 288, 289],
        [250, 251, 252, 253, 254, 255, 256, 257],
        [258, 259, 260, 261, 262, 263, 264, 265],
        [274, 275, 276, 277, 278, 279, 280, 281]])
tensor([267, 268, 269, 270, 271, 272, 273, 274, 283, 284, 285, 286, 287, 288,
        289, 290, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262,
        263, 264, 265, 266, 275, 276, 277, 278, 279, 280, 281, 282])
Data/Target Shapes torch.Size([5, 10]) torch.Size([50])
tensor([[180, 181, 182, 183, 184, 185, 186, 187, 188, 189],
        [150, 151, 152, 153, 154, 155, 156, 157, 158, 159],
        [190, 191, 192, 193, 194, 195, 196, 197, 198, 199],
        [170, 171, 172, 173, 174, 175, 176, 177, 178, 179],
        [160, 161, 162, 163, 164, 165, 166, 167, 168, 169]])
tensor([181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 151, 152, 153, 154,
        155, 156, 157, 158, 159, 1

In [12]:
length = 300
batch_size = 5
seq_len = 10
chunk_len = seq_len * batch_size

num_batches = math.ceil(length/chunk_len)
num_batches
# t = torch.rand(num_batches)
# print('Original Tensor:', t)

order = np.array(range(5))
np.random.shuffle(order)
print('Order:', order)
# order[]

# in-place changing of values
data[np.array(range(5))] = data[order]
# print('New Tensor:', t)

# chunk_len = seq_len * batch_size
# print(chunk_len)
# data = torch.arange(0, length)
# data = batchify(data, batch_size)
# print(data.shape)

Order: [4 2 0 3 1]


In [52]:
data

5

In [58]:
r=torch.randperm(data.size(0))
d2 = data[r]
d2


tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
        [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
        [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]])