In [26]:
import numpy as np
import torch

class TextDataloader:
    """Serve a flat token sequence as fixed-shape (data, target) batches.

    The sequence is cut into consecutive chunks of ``max_seq_len * batch_size``
    elements. For each chunk the iterator yields

    * ``data``   -- the chunk viewed as ``(batch_size, max_seq_len)``
    * ``target`` -- the same span shifted one element to the right, flattened
      to ``(batch_size * max_seq_len,)``

    Only chunks whose shifted target fits completely inside the sequence are
    served; trailing elements that do not fill a whole chunk are dropped, so
    every batch has the same shape.

    Args:
        dataset: 1-D sequence supporting ``len()``, slicing and ``.view``
            (the code is written against a torch tensor).
        max_seq_len: number of elements per row of a batch.
        batch_size: number of rows per batch.
        shuffle: when true, the chunk order is shuffled once at construction
            and the rows of every batch are permuted in ``batchify``.

    Raises:
        ValueError: if ``max_seq_len`` or ``batch_size`` is not positive.
    """

    def __init__(self, dataset, max_seq_len, batch_size, shuffle=True):
        if max_seq_len <= 0 or batch_size <= 0:
            raise ValueError("max_seq_len and batch_size must be positive")

        self.max_seq_len = max_seq_len
        self.batch_size = batch_size

        # shuffle logic vars
        self.shuffle = shuffle
        self.chunk_len = max_seq_len * batch_size

        # A chunk starting at `pos` is usable only when its one-step-shifted
        # target also fits: pos + chunk_len + 1 <= len(dataset). That leaves
        # (len - 1) // chunk_len usable chunks (never negative).
        total_len = len(dataset)
        num_batches = max(0, (total_len - 1) // self.chunk_len)

        # trim dataset, fix for multigpu batching bugs: keep the usable chunks
        # plus the single extra element needed by the last target.
        trimmed_dataset_size = min(total_len, num_batches * self.chunk_len + 1)
        self.dataset = dataset[0:trimmed_dataset_size]
        self.dataset_len = trimmed_dataset_size

        # One entry per *usable* chunk. Listing the trailing partial chunk here
        # would end iteration early whenever shuffling moved it off the end.
        self.batch_order = np.arange(num_batches)
        if shuffle:
            np.random.shuffle(self.batch_order)

        # Defined up front so next() works even before iter() has been called.
        self.index = 0

    def __iter__(self):
        """Restart iteration from the first entry of ``batch_order``."""
        self.index = 0
        return self

    def __next__(self):
        """Return the next ``(data, target)`` batch, or raise StopIteration."""
        if self.index >= len(self.batch_order):
            raise StopIteration

        chunk_pos = int(self.batch_order[self.index]) * self.chunk_len
        chunk_end = chunk_pos + self.chunk_len

        data = self.dataset[chunk_pos:chunk_end]
        # Same span shifted right by one element.
        target = self.dataset[chunk_pos + 1:chunk_end + 1]

        self.index += 1

        # Every chunk listed in batch_order is full, so it always splits into
        # exactly batch_size rows.
        return self.batchify(data, target, self.batch_size)

    def batchify(self, data, target, num_batches):
        """Reshape a flat chunk into rows and flatten the matching targets.

        Args:
            data: flat chunk whose length is a multiple of ``num_batches``.
            target: flat chunk of the same length as ``data``.
            num_batches: number of rows to split the chunk into.

        Returns:
            Tuple ``(data, target)`` where ``data`` has ``num_batches`` rows
            and ``target`` is 1-D. When ``self.shuffle`` is set, the same
            random row permutation is applied to both before flattening.
        """
        # Evenly divide the data across the rows of the batch.
        data = data.view(num_batches, -1).contiguous()
        target = target.view(num_batches, -1).contiguous()

        # shuffle rows, keeping data and target aligned
        if self.shuffle:
            permutation = torch.randperm(data.size(0))
            data = data[permutation]
            target = target[permutation]

        # flatten targets
        return data, target.reshape(-1)

In [31]:
import torch
import math
import numpy as np

# Smoke test: run the loader (shuffling off) over the ramp 0..299 and print
# the shape of every batch it yields.
length, batch_size, max_seq_len = 300, 5, 10

# Expected sizes for these settings: chunk_len = 10 * 5 = 50,
# ceil(300 / 50) = 6 chunks, trimmed length (6 - 1) * 50 + 1 = 251,
# and 251 // 50 = 5 full batches.
dataset = torch.arange(length)
dataloader = TextDataloader(
    dataset,
    max_seq_len=max_seq_len,
    batch_size=batch_size,
    shuffle=False,
)

# `data` / `targets` stay bound to the last batch after the loop; later cells
# read `data`.
for batch in dataloader:
    data, targets = batch
    print("data.shape - targets.shape: ", data.shape, targets.shape)

    # print(data)
    # print(targets)
    # break


data.shape - targets.shape:  torch.Size([5, 10]) torch.Size([50])
data.shape - targets.shape:  torch.Size([5, 10]) torch.Size([50])
data.shape - targets.shape:  torch.Size([5, 10]) torch.Size([50])
data.shape - targets.shape:  torch.Size([5, 10]) torch.Size([50])
data.shape - targets.shape:  torch.Size([5, 10]) torch.Size([50])


In [46]:
# Scratch check: reorder the rows of `data` using a shuffled index array.
length, batch_size, seq_len = 300, 5, 10
chunk_len = seq_len * batch_size

num_batches = math.ceil(length / chunk_len)

order = np.arange(5)
np.random.shuffle(order)
print('Order:', order)

# in-place changing of values
# NOTE(review): `data` is not defined in this cell -- presumably the last batch
# left over from the loader loop; confirm. The assignment mutates it, so
# re-running the cell permutes the rows again.
data[np.arange(5)] = data[order]
# print('New Tensor:', t)

# chunk_len = seq_len * batch_size
# print(chunk_len)
# data = torch.arange(0, length)
# data = batchify(data, batch_size)
# print(data.shape)

Order: [3 4 1 2 0]


In [52]:
# NOTE(review): the saved output for this cell is `5`, not a tensor, although
# the next cell calls `data.size(0)`. Presumably `data` was rebound by a cell
# that is not shown or was run out of order -- re-run top to bottom to confirm.
data

5

In [58]:
# Draw a random row order and gather the rows of `data` in that order into a
# new tensor; `data` itself is left untouched.
r = torch.randperm(data.size(0))
d2 = data.index_select(0, r)
d2


tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
        [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
        [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]])