In [1]:
#### Synthetic Regression Data

In [2]:
%matplotlib inline
import random
import torch
from pytorch.d2l import torch as d2l
import random

In [3]:
class SyntheticRegressionData(d2l.DataModule):  #@save
    """Synthetic data for linear regression."""
    def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000,
                 batch_size=32):
        super().__init__()
        self.save_hyperparameters()
        n = num_train + num_val
        # torch.randn returns a tensor filled with random numbers from a standard normal distribution
        # the resulting tensor is n by len(w)
        self.X = torch.randn(n, len(w))
        # n by 1 tensor, with a noise multiplier to squish it
        noise = torch.randn(n, 1) * noise
        self.y = torch.matmul(self.X, w.reshape((-1, 1))) + b + noise

In [4]:
# the get_dataloader method which will be in the SyntheticRegressionData class, which inherits from the DataModel class. It takes batch_size, matrix of features, and vector of labels, and generates minibatches of size batch_size. Each minibatch consists of a tuple of features and labels. 
@d2l.add_to_class(SyntheticRegressionData)
def get_dataloader(self, train):
    if train:
        indices = list(range(0, self.num_train))
        # the examples are read in random order
        random.shuffle(indices)
    else:
        indices = list(range(self.num_train, self.num_train+self.num_val))
    for i in range(0, len(indices), self.batch_size):
        batch_indices = torch.tensor([indices[i: i+self.batch_size]])
        # yield suspends a functions execution and sends a value back to the caller. it retains its state such that when the function resumes, it continues execution immediately after the last yield run. this allows its code to produce a series of values over time, rather than computing them at once and sending them back like a list.
        yield self.X[batch_indices], self.y[batch_indices]

In [5]:
# the true parameters 
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)
# the first minibatch of data
X, y = next(iter(data.train_dataloader()))
print('X shape:', X.shape, '\ny shape:', y.shape)

X shape: torch.Size([1, 32, 2]) 
y shape: torch.Size([1, 32, 1])


In [6]:
# concise implementation of the dataloader

@d2l.add_to_class(d2l.DataModule) #@save
def get_tensorloader(self, tensors, train, indices=slice(0, None)):
    tensors = tuple(a[indices] for a in tensors)
    dataset = torch.utils.data.TensorDataset(*tensors)
    return torch.utils.data.DataLoader(dataset, self.batch_size, shuffle=train)

@d2l.add_to_class(SyntheticRegressionData)
def get_dataloader(self, train):
    i = slice(0, self.num_train) if train else slice(self.num_train, None)
    return self.get_tensorloader((self.X, self.y), train, i)