In [1]:
from data import init_datasets, _init_corpora
from typing import List, Dict, Tuple
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [15]:
# Toy hyper-parameters used throughout the walkthrough below.
time_steps, batch_size = 7, 3
# Stand-in corpus: the token ids 1..74 as a 1-D int64 tensor.
data = torch.arange(1, 75)

In [16]:
data

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
        55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
        73, 74])

In [17]:
class Sequence_Data(Dataset):
    """
    Map-style dataset pairing each input row with the target row at the same index.

    :param x: input sequences; the size of its first dimension is the dataset length
    :param y: target sequences, indexed with the same indices as ``x``
    """

    def __init__(self, x: torch.LongTensor, y: torch.LongTensor):
        self.x, self.y = x, y
        # The length is taken from ``x`` alone; ``y`` is not checked against it.
        self.len = x.shape[0]

    def __len__(self):
        """Return the number of rows in ``x``."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        sample, label = self.x[idx], self.y[idx]
        return sample, label

In [18]:
def _generate_io_sequences(sequence: np.ndarray, time_steps: int) -> Tuple:
    """
    :param sequence: sequence of integer representation of words
    :param time_steps: number of time steps in LSTM cell
    :return: Tuple of torch tensors of shape (n, time_steps)
    """
    sequence = torch.LongTensor(sequence)

    # from seq we generate 2 copies.
    inputs, targets = sequence, sequence[1:]

    # split seq into seq of of size time_steps
    inputs = torch.split(tensor=inputs, split_size_or_sections=time_steps)
    targets = torch.split(tensor=targets, split_size_or_sections=time_steps)

    # note: word2index['<pad>'] = 0
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=0)

    return (inputs_padded, targets_padded)

In [19]:
def _old_generate_io_sequences(data:np.ndarray, time_steps:int) -> Tuple:# -> List[Tuple]:
    """
    :param data: sequence of integer representation of words
    :param time_steps: number of time steps in LSTM cell
    :return: Tuple of torch tensors of shape (n, time_steps, 1)
    """
    data = torch.LongTensor(data)
    # split tensor into tensors of of size time_steps
    data = torch.split(tensor=data, split_size_or_sections=time_steps)

    # note: word2index['<pad>'] = 0
    sequences = pad_sequence(data, batch_first=True, padding_value=0)

    # from seq we generate 2 copies.
    # inputs=seq[:-1], targets=seq[1:]
    sequences_inputs = sequences.narrow_copy(1, 0, sequences.shape[1] - 1)
    sequences_targets = sequences.narrow_copy(1, 1, sequences.shape[1] - 1)

    return (sequences_inputs, sequences_targets)

In [20]:
# Run both variants on the same toy sequence so their outputs can be compared side by side.
# i / t   : inputs and targets from the current implementation
# i2 / t2 : inputs and targets from the old implementation
i, t = _generate_io_sequences(data, 7)
i2, t2 = _old_generate_io_sequences(data, 7)

In [21]:
# Inputs from both variants; the old variant's rows are one column narrower
# because it slices off a column after chunking.
print(i)
print(i2)

tensor([[ 1,  2,  3,  4,  5,  6,  7],
        [ 8,  9, 10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19, 20, 21],
        [22, 23, 24, 25, 26, 27, 28],
        [29, 30, 31, 32, 33, 34, 35],
        [36, 37, 38, 39, 40, 41, 42],
        [43, 44, 45, 46, 47, 48, 49],
        [50, 51, 52, 53, 54, 55, 56],
        [57, 58, 59, 60, 61, 62, 63],
        [64, 65, 66, 67, 68, 69, 70],
        [71, 72, 73, 74,  0,  0,  0]])
tensor([[ 1,  2,  3,  4,  5,  6],
        [ 8,  9, 10, 11, 12, 13],
        [15, 16, 17, 18, 19, 20],
        [22, 23, 24, 25, 26, 27],
        [29, 30, 31, 32, 33, 34],
        [36, 37, 38, 39, 40, 41],
        [43, 44, 45, 46, 47, 48],
        [50, 51, 52, 53, 54, 55],
        [57, 58, 59, 60, 61, 62],
        [64, 65, 66, 67, 68, 69],
        [71, 72, 73, 74,  0,  0]])


In [22]:
# Targets from both variants; each row is its input row shifted one token ahead.
print(t)
print(t2)

tensor([[ 2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15],
        [16, 17, 18, 19, 20, 21, 22],
        [23, 24, 25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34, 35, 36],
        [37, 38, 39, 40, 41, 42, 43],
        [44, 45, 46, 47, 48, 49, 50],
        [51, 52, 53, 54, 55, 56, 57],
        [58, 59, 60, 61, 62, 63, 64],
        [65, 66, 67, 68, 69, 70, 71],
        [72, 73, 74,  0,  0,  0,  0]])
tensor([[ 2,  3,  4,  5,  6,  7],
        [ 9, 10, 11, 12, 13, 14],
        [16, 17, 18, 19, 20, 21],
        [23, 24, 25, 26, 27, 28],
        [30, 31, 32, 33, 34, 35],
        [37, 38, 39, 40, 41, 42],
        [44, 45, 46, 47, 48, 49],
        [51, 52, 53, 54, 55, 56],
        [58, 59, 60, 61, 62, 63],
        [65, 66, 67, 68, 69, 70],
        [72, 73, 74,  0,  0,  0]])


In [30]:
# Number of complete mini-batches; integer division discards the remainder rows.
num_batches = len(i) // batch_size

In [31]:
len(i)

11

In [32]:
num_batches*batch_size

9

In [33]:
i[:9]

tensor([[ 1,  2,  3,  4,  5,  6,  7],
        [ 8,  9, 10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19, 20, 21],
        [22, 23, 24, 25, 26, 27, 28],
        [29, 30, 31, 32, 33, 34, 35],
        [36, 37, 38, 39, 40, 41, 42],
        [43, 44, 45, 46, 47, 48, 49],
        [50, 51, 52, 53, 54, 55, 56],
        [57, 58, 59, 60, 61, 62, 63]])

In [36]:
# Keep only the leading rows that fill complete mini-batches; with 11 rows and
# batch_size = 3 the last two rows (including the padded one) are dropped.
inputs = i[:num_batches*batch_size]
targets = t[:num_batches*batch_size]
inputs2 = i2[:num_batches*batch_size]
targets2 = t2[:num_batches*batch_size]

In [38]:
print(inputs)
print(inputs2)

tensor([[ 1,  2,  3,  4,  5,  6,  7],
        [ 8,  9, 10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19, 20, 21],
        [22, 23, 24, 25, 26, 27, 28],
        [29, 30, 31, 32, 33, 34, 35],
        [36, 37, 38, 39, 40, 41, 42],
        [43, 44, 45, 46, 47, 48, 49],
        [50, 51, 52, 53, 54, 55, 56],
        [57, 58, 59, 60, 61, 62, 63]])
tensor([[ 1,  2,  3,  4,  5,  6],
        [ 8,  9, 10, 11, 12, 13],
        [15, 16, 17, 18, 19, 20],
        [22, 23, 24, 25, 26, 27],
        [29, 30, 31, 32, 33, 34],
        [36, 37, 38, 39, 40, 41],
        [43, 44, 45, 46, 47, 48],
        [50, 51, 52, 53, 54, 55],
        [57, 58, 59, 60, 61, 62]])


In [39]:
print(targets)
print(targets2)

tensor([[ 2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15],
        [16, 17, 18, 19, 20, 21, 22],
        [23, 24, 25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34, 35, 36],
        [37, 38, 39, 40, 41, 42, 43],
        [44, 45, 46, 47, 48, 49, 50],
        [51, 52, 53, 54, 55, 56, 57],
        [58, 59, 60, 61, 62, 63, 64]])
tensor([[ 2,  3,  4,  5,  6,  7],
        [ 9, 10, 11, 12, 13, 14],
        [16, 17, 18, 19, 20, 21],
        [23, 24, 25, 26, 27, 28],
        [30, 31, 32, 33, 34, 35],
        [37, 38, 39, 40, 41, 42],
        [44, 45, 46, 47, 48, 49],
        [51, 52, 53, 54, 55, 56],
        [58, 59, 60, 61, 62, 63]])


In [40]:
# Wrap each (inputs, targets) pair in a Dataset so rows can be fetched by index.
dataset = Sequence_Data(x=inputs, y=targets)
dataset2 = Sequence_Data(x=inputs2, y=targets2)

In [42]:
# One loader per variant; rows are drawn in shuffled order, batch_size rows at a time.
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
data_loader2 = DataLoader(dataset=dataset2, batch_size=batch_size, shuffle=True)

In [43]:
# Pull a single shuffled mini-batch: x[0] holds the input rows, x[1] the target rows.
x = next(iter(data_loader))
# NOTE(review): the recorded output below is a torch.Size, which matches
# x[0].shape rather than x[0] — confirm which expression was intended.
x[0]

torch.Size([3, 7])

In [47]:
print(f"batch size : {batch_size}")
print(f"input sequences : {x[0]}")
print(f"target sequences : {x[1]}")

batch size : 3
input sequences : tensor([[36, 37, 38, 39, 40, 41, 42],
        [29, 30, 31, 32, 33, 34, 35],
        [50, 51, 52, 53, 54, 55, 56]])
target sequences : tensor([[37, 38, 39, 40, 41, 42, 43],
        [30, 31, 32, 33, 34, 35, 36],
        [51, 52, 53, 54, 55, 56, 57]])


In [48]:
x2 = next(iter(data_loader2))
x2[0]

tensor([[36, 37, 38, 39, 40, 41],
        [15, 16, 17, 18, 19, 20],
        [50, 51, 52, 53, 54, 55]])

In [49]:
print(f"batch size : {batch_size}")
print(f"input sequences : {x2[0]}")
print(f"target sequences : {x2[1]}")

batch size : 3
input sequences : tensor([[36, 37, 38, 39, 40, 41],
        [15, 16, 17, 18, 19, 20],
        [50, 51, 52, 53, 54, 55]])
target sequences : tensor([[37, 38, 39, 40, 41, 42],
        [16, 17, 18, 19, 20, 21],
        [51, 52, 53, 54, 55, 56]])


In [23]:
def _intlist_to_dataloader(data:np.ndarray, time_steps:int, batch_size:int) -> DataLoader:
    """
    Turn a flat list of word ids into a shuffled DataLoader of (input, target) batches.

    Rows that would not fill a complete mini-batch are discarded, so every batch
    yielded by the loader holds exactly ``batch_size`` rows.

    :param data: input list of integers
    :param batch_size: hyper parameter, for minibatch size
    :param time_steps: hyper parameter for sequence length for bptt
    :return: DataLoader for SGD
    """
    # given int list, generate input and output sequences.
    # The helper's first parameter is named `data`; the keyword has to match it,
    # otherwise the call fails with a TypeError before any work is done.
    inputs, targets = _old_generate_io_sequences(data=data, time_steps=time_steps)

    # cut off any data that will create incomplete batches
    num_batches = len(inputs) // batch_size
    inputs = inputs[:num_batches*batch_size]
    targets = targets[:num_batches*batch_size]

    # create Dataset object
    dataset = Sequence_Data(x=inputs, y=targets)

    # create dataloader
    data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

    return data_loader