In [29]:
%matplotlib inline
import random
import torch
from d2l import torch as d2l

class SyntheticRegressionData(d2l.DataModule):  #@save
    """Synthetic linear-regression dataset: y = Xw + b + Gaussian noise.

    Args:
        w: Weight tensor; ``len(w)`` sets the number of features.
        b: Bias added to every label.
        noise: Standard deviation of the additive Gaussian label noise.
        num_train: Number of training examples.
        num_val: Number of validation examples.
        batch_size: Minibatch size for the data loaders.
    """
    def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000,
                 batch_size=32):
        super().__init__()
        # NOTE(review): keep this call first. It presumably stores the
        # constructor arguments as attributes (other cells read
        # self.num_train and self.batch_size) -- confirm against d2l.
        self.save_hyperparameters()
        total = num_train + num_val
        num_features = len(w)
        # Features are drawn before the noise so the RNG stream is unchanged.
        self.X = torch.randn(total, num_features)
        label_noise = noise * torch.randn(total, 1)
        self.y = self.X @ w.reshape(-1, 1) + b + label_noise


1. **What will happen if the number of examples cannot be divided by the batch size? How would you change this behavior by specifying a different argument by using the framework's API?**

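By default, the remaining examples form a final batch that contains fewer examples than the batch size. To drop this incomplete batch instead, pass `drop_last=True` to `torch.utils.data.DataLoader`. For example, with 1000 training examples and a batch size of 32, the loader yields 31 full batches instead of 32 (the last of which would hold only 8 examples).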

In [30]:
@d2l.add_to_class(d2l.DataModule)  #@save
def get_tensorloader(self, tensors, train, indices=slice(0, None),
                     drop_last=True):
    """Wrap a selection of in-memory tensors in a minibatch ``DataLoader``.

    Args:
        tensors: Iterable of tensors that are indexed along their first
            dimension with ``indices``.
        train: If true, the loader reshuffles the examples every epoch.
        indices: Index applied to every tensor to select the examples
            (defaults to all of them).
        drop_last: If true (the default, which matches the previously
            hard-coded behavior), discard a final batch that is smaller
            than ``self.batch_size``. Pass ``False`` to keep every example,
            e.g. when evaluating on a validation split.

    Returns:
        A ``torch.utils.data.DataLoader`` over the selected examples.
    """
    selected = tuple(t[indices] for t in tensors)
    dataset = torch.utils.data.TensorDataset(*selected)
    return torch.utils.data.DataLoader(dataset, batch_size=self.batch_size,
                                       shuffle=train, drop_last=drop_last)
@d2l.add_to_class(SyntheticRegressionData)  #@save
def get_dataloader(self, train):
    """Return the loader for the training or the validation split.

    The first ``self.num_train`` rows of ``X``/``y`` form the training
    split; all remaining rows form the validation split.

    Args:
        train: Selects the training split when true, validation otherwise.
            It is also forwarded to ``get_tensorloader``.
    """
    if train:
        split = slice(0, self.num_train)
    else:
        split = slice(self.num_train, None)
    return self.get_tensorloader((self.X, self.y), train, split)

# The defaults apply here: num_train=1000 and batch_size=32. Since the
# incomplete final batch is dropped, the length is 1000 // 32 = 31.
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)
train_loader = data.train_dataloader()
len(train_loader)

31

2. **Suppose that we want to generate a huge dataset, where both the size of the parameter vector `w` and the number of examples `num_examples` are large.**
    1. What happens if we cannot hold all data in memory?
    1. How would you shuffle the data if it is held on disk? Your task is to design an *efficient* algorithm that does not require too many random reads or writes. Hint: [pseudorandom permutation generators](https://en.wikipedia.org/wiki/Pseudorandom_permutation) allow you to design a reshuffle without the need to store the permutation table explicitly.

<font color = red>(uncertain)</font>

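A. If the data cannot be held in memory, it has to stay on disk and be read (or generated) one chunk at a time. We also cannot shuffle it by building a very long list of shuffled indices: for a huge dataset, the index list itself would still take too much memory.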

B. We can generate a pseudorandom permutation with a method such as the one in Naor, M., & Reingold, O. (1999), *On the construction of pseudorandom permutations: Luby–Rackoff revisited*. Such a generator computes the shuffled position of any index on demand from a small key (seed), so the indices can be produced without storing the whole permutation table. (I have not studied this method carefully.)

3. **Implement a data generator that produces new data on the fly, every time the iterator is called.**

In [28]:
@d2l.add_to_class(SyntheticRegressionData)  #@save
def data_generator(self):
    """Return the next training minibatch, restarting after each epoch.

    An iterator over ``self.train_dataloader()`` is cached on the instance
    as ``self.iter`` on first use. Once that iterator is exhausted, a fresh
    one is created, so the method can be called indefinitely instead of
    raising ``StopIteration`` after the last batch of the first pass.

    NOTE(review): batches appear to come from the dataset built in
    ``__init__``; no new examples are sampled here -- confirm whether the
    exercise requires freshly generated data.

    Returns:
        Whatever the training loader yields for one batch (unpacked as
        ``X, y`` by the calling cell).

    Raises:
        StopIteration: Only if a freshly created loader yields no batch at
            all (e.g. fewer examples than one full batch).
    """
    if not hasattr(self, 'iter'):
        self.iter = iter(self.train_dataloader())
    try:
        return next(self.iter)
    except StopIteration:
        # The previous pass is finished: start a new one.
        self.iter = iter(self.train_dataloader())
        return next(self.iter)
    
# A batch size of 8 keeps each printed batch of labels short.
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2, batch_size=8)
for step in range(3):
    # Each call hands back the next minibatch; only the labels are shown.
    X, y = data.data_generator()
    print(y)

tensor([[ 7.6491],
        [ 5.3203],
        [ 1.4828],
        [-2.1962],
        [ 1.0877],
        [ 0.4515],
        [ 2.3016],
        [ 5.2006]])
tensor([[ 0.6757],
        [ 1.5848],
        [ 8.9078],
        [ 2.7623],
        [ 7.8832],
        [-7.9047],
        [ 9.1790],
        [ 2.8393]])
tensor([[-0.7600],
        [ 2.4155],
        [ 3.5540],
        [-0.9344],
        [ 2.0684],
        [11.7490],
        [ 4.8920],
        [ 7.3684]])


4. **How would you design a random data generator that generates *the same* data each time it is called?**

In [23]:
@d2l.add_to_class(SyntheticRegressionData)  #@save
def get_dataloader(self, train):
    """Return the train/validation loader after fixing the torch seed.

    The seed is reset every time a loader is built, so the shuffled order
    of a freshly created training loader is the same on every call.

    Args:
        train: Selects the training split (first ``self.num_train`` rows)
            when true, otherwise the remaining rows. It is also forwarded
            to ``get_tensorloader``.
    """
    # NOTE: this reseeds torch's global default generator, so random
    # numbers drawn elsewhere after this call are affected as well.
    torch.manual_seed(2)
    if train:
        split = slice(0, self.num_train)
    else:
        split = slice(self.num_train, None)
    return self.get_tensorloader((self.X, self.y), train, split)

data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2, batch_size=8)
for i in range(3):
    # A new loader is built on every pass; because building it resets the
    # seed, the first batch is identical each time.
    loader = data.train_dataloader()
    X, y = next(iter(loader))
    print("Time", i, "y=", y)

Time 0 y= tensor([[ 4.7769],
        [ 4.4085],
        [10.4552],
        [ 5.6137],
        [ 9.8315],
        [ 6.0097],
        [ 1.0880],
        [ 1.1387]])
Time 1 y= tensor([[ 4.7769],
        [ 4.4085],
        [10.4552],
        [ 5.6137],
        [ 9.8315],
        [ 6.0097],
        [ 1.0880],
        [ 1.1387]])
Time 2 y= tensor([[ 4.7769],
        [ 4.4085],
        [10.4552],
        [ 5.6137],
        [ 9.8315],
        [ 6.0097],
        [ 1.0880],
        [ 1.1387]])
