## Recall from previous notebook on linear regression using PyTorch

In [2]:
# import libraries

import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torchviz import make_dot

In [4]:
# generate data 

# initialize your random seed to ensure reproducibility of your result
np.random.seed(42) 
# randomly generate x which is avector of 100 points 
x = np.random.rand(100, 1)
# define exact linear function y = 1 + 2x + epsilon where epsilon (0.1*random numbers)
# 1 = y-intercept 
# 2 = slope 
y = 1 + 2 * x + .1 * np.random.randn(100, 1)

In [5]:
# split data into 80% train 20% validation 

idx = np.arange(100)
# Shuffles the indices
idx = np.arange(100)
np.random.shuffle(idx)
# Uses first 80 random indices for train
train_idx = idx[:80]
# Uses the remaining indices for validation
val_idx = idx[80:]
# Generates train and validation sets
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]

In [6]:
# Devices and CUDA
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
# Convert Numpy arrays into PyTorch's Tensors, and then send them to the chosen device
x_train_tensor = torch.from_numpy(x_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).float().to(device)

tensor([[0.7713],
        [0.0636],
        [0.8631],
        [0.0254],
        [0.7320],
        [0.0740],
        [0.1987],
        [0.3110],
        [0.4722],
        [0.9696],
        [0.1220],
        [0.7751],
        [0.8022],
        [0.7296],
        [0.0977],
        [0.1849],
        [0.1560],
        [0.0206],
        [0.9869],
        [0.6233],
        [0.7081],
        [0.5979],
        [0.9219],
        [0.6376],
        [0.2809],
        [0.2588],
        [0.1196],
        [0.7290],
        [0.9489],
        [0.6075],
        [0.5613],
        [0.4938],
        [0.1818],
        [0.2713],
        [0.9699],
        [0.2123],
        [0.1834],
        [0.8662],
        [0.3745],
        [0.2912],
        [0.8084],
        [0.0581],
        [0.8324],
        [0.5427],
        [0.7722],
        [0.8872],
        [0.0885],
        [0.0452],
        [0.5924],
        [0.6842],
        [0.7132],
        [0.0344],
        [0.6011],
        [0.8155],
        [0.4402],
        [0

In [38]:
# initial guess a and b
torch.manual_seed(42)
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a, b)

tensor([0.3367], requires_grad=True) tensor([0.1288], requires_grad=True)


In [39]:
lr = 1e-1  # learning rate
n_epochs = 1000  # num of epochs

# Defines a MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

# Defines a SGD optimizer to update the parameters
optimizer = optim.SGD([a, b], lr=lr)

for epoch in range(n_epochs):
    yhat = a + b * x_train_tensor  # compute the predicted values yhat
    loss = loss_fn(y_train_tensor, yhat)  # track the loss
    loss.backward() # compute the gradient using auto_grad 
    optimizer.step() 
    optimizer.zero_grad() # update the parameters a and b
    
print(a, b)

tensor([1.0235], requires_grad=True) tensor([1.9690], requires_grad=True)


## Model
In PyTorch, a model is represented by a regular Python class that inherits from the Module class.

The most fundamental methods are:
    
* ```__init__(self)```: it defines two parameters, a and b.

* forward(self, x): it performs the actual computation, that is, it outputs a prediction, given the input x.


You should NOT call the forward(x) method, though. You should call the whole model itself, as in model(x) to perform a forward pass and output predictions.


### build a proper (yet simple) model for our regression task.

In [40]:
class ManualLinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        # To make "a" and "b" real parameters of the model, we need to wrap them with nn.Parameter
        self.a = nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float))
        self.b = nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float))
        
    def forward(self, x):
        # Computes the outputs / predictions
        return self.a + self.b * x

In the ```__init__``` method, we define our two parameters, a and b, using the Parameter() class, to tell PyTorch these tensors should be considered parameters of the model they are an attribute of.

They used to be assigned as follows:

a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)

b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)

### get the current values for all parameters using our model’s state_dict() method.

In [41]:
torch.manual_seed(42)
# Now we can create a model and send it at once to the device
model = ManualLinearRegression().to(device)
# We can also inspect its parameters using its state_dict
print(model.state_dict())

OrderedDict([('a', tensor([0.3367])), ('b', tensor([0.1288]))])


In [42]:
lr = 1e-1  # learning rate
n_epochs = 1000  # num of epochs

# Defines a MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

# Defines a SGD optimizer to update the parameters
optimizer = optim.SGD([a, b], lr=lr)

In [43]:
for epoch in range(n_epochs):
    
    # yhat = a + b * x_train_tensor  # compute the predicted values yhat
    # What is this?!?
    model.train()

    yhat = model(x_train_tensor)
    
    loss = loss_fn(y_train_tensor, yhat)
    loss.backward()    
    optimizer.step()
    optimizer.zero_grad()
    
print(model.state_dict())

OrderedDict([('a', tensor([0.3367])), ('b', tensor([0.1288]))])


#### comment “What is this?!?” — model.train().

In PyTorch, models have a train() method which, somewhat disappointingly, does NOT perform a training step. Its only purpose is to set the model to training mode. 

## Nested Models
In our previous model, we manually created two parameters to perform a linear regression. That is: 
    
self.a = nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float))

self.b = nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float))
        
Let’s use PyTorch’s Linear model as an attribute of our own, thus creating a nested model.


In [75]:
class LayerLinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        # self.a = nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float))
        # self.b = nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float))
        # Instead of our custom parameters, we use a Linear layer with single input and single output
        self.linear = nn.Linear(1, 1) # 1 input feature (x value), 1 output feature (y value)
                
    def forward(self, x):

        # return self.a + self.b * x
        
        # Now it only takes a call to the layer to make predictions
        return self.linear(x)

## Sequential Models
Our model was simple enougn. You may be thinking: “why even bother to build a class for it?!” Well, you have a point. 
    
Since the output of a layer is sequentially fed as an input to the next, we can use a, er… Sequential model :-)
    
In our case, we would build a Sequential model with a single argument, that is, the Linear layer we used to train our linear regression. 

In [48]:
# Alternatively, you can use a Sequential model
model = nn.Sequential(nn.Linear(1, 1)).to(device)

#### Training Step

In [49]:
def make_train_step(model, loss_fn, optimizer):
    # Builds function that performs a step in the train loop
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss
        loss = loss_fn(y, yhat)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()
    
    # Returns the function that will be called inside the train loop
    return train_step

# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)  # recurrent network 
losses = []

# For each epoch... we train ... see how tiny the training loop is now?
for epoch in range(n_epochs):
    # Performs one train step and returns the corresponding loss
    loss = train_step(x_train_tensor, y_train_tensor)
    losses.append(loss)
    
# Checks model's parameters
print(model.state_dict())

OrderedDict([('0.weight', tensor([[-0.4869]])), ('0.bias', tensor([0.5873]))])


### Dataset

In PyTorch, a dataset is represented by a regular Python class that inherits from the Dataset class. 


The most fundamental methods are:

* ```__init__(self)``` : it takes whatever arguments needed to build a list of tuples — it may be the name of a CSV file that will be loaded and processed; it may be two tensors, one for features, another one for labels; or anything else, depending on the task at hand.

* ```__get_item__(self, index)```: it allows the dataset to be indexed, so it can work like a list (dataset[i]) — it must return a tuple (features, label) corresponding to the requested data point. We can either return the corresponding slices of our pre-loaded dataset or tensors or, as mentioned above, load them on demand (like in this example).

* ```__len__(self)```: it should simply return the size of the whole dataset so, whenever it is sampled, its indexing is limited to the actual size.


Let’s build a simple custom dataset that takes 2 tensors as arguments: 

* one for the features, 
* one for the labels. 

For any given index, our dataset class will return the corresponding slice of each of those tensors. 

In [50]:
from torch.utils.data import Dataset, TensorDataset

class CustomDataset(Dataset):
    def __init__(self, x_tensor, y_tensor):
        self.x = x_tensor
        self.y = y_tensor
        
    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.x)

In [51]:
# Wait, is this a CPU tensor now? Why? Where is .to(device)?
x_train_tensor = torch.from_numpy(x_train).float()
y_train_tensor = torch.from_numpy(y_train).float()

In [52]:
train_data = CustomDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

(tensor([0.7713]), tensor([2.4745]))


In [53]:
len(train_data)

80

In [54]:
train_data = TensorDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

(tensor([0.7713]), tensor([2.4745]))


In [55]:
len(train_data)

80

Did you notice we built our training tensors out of Numpy arrays but we did not send them to a device? So, they are CPU tensors now! Why?

We don’t want our whole training data to be loaded into GPU tensors, as we have been doing in our example so far, because it takes up space in our graphics card’s RAM.

### DataLoader
Until now, we have used the whole training data at every training step. It has been batch gradient descent all along. This is fine for our ridiculously small dataset (80 pair of points), sure, but if we want to go serious about all this, we must use mini-batch gradient descent. Thus, we need mini-batches. 

We use PyTorch’s DataLoader class for this job. We tell it which dataset to use (the one we just built in the previous section), the desired mini-batch size and if we’d like to shuffle it or not. That’s it!

Our loader will behave like an iterator, so we can loop over it and fetch a different mini-batch every time.

In [56]:
from torch.utils.data import DataLoader

train_loader = DataLoader(dataset=train_data, batch_size=10, shuffle=True)

In [57]:
len(train_loader)  # total 80 points, divide into 10 bathes, each batch has 8 points 

8

In [58]:
# To retrieve a sample mini-batch, 
next(iter(train_loader))

[tensor([[0.1834],
         [0.0452],
         [0.8022],
         [0.1395],
         [0.7132],
         [0.4722],
         [0.0885],
         [0.0055],
         [0.6011],
         [0.7320]]),
 tensor([[1.4637],
         [0.9985],
         [2.6229],
         [1.3051],
         [2.6162],
         [1.9857],
         [1.0708],
         [1.0632],
         [2.1214],
         [2.4732]])]

it will return a list containing two tensors, one for the features, another one for the labels.

In [59]:
# Let's put mini-batch into the training 

losses = []
train_step = make_train_step(model, loss_fn, optimizer)

for epoch in range(n_epochs):
    for x_batch, y_batch in train_loader:
        # the dataset "lives" in the CPU, so do our mini-batches
        # therefore, we need to send those mini-batches to the
        # device where the model "lives"
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        
        loss = train_step(x_batch, y_batch)
        losses.append(loss)
        
print(model.state_dict())

OrderedDict([('0.weight', tensor([[-0.4869]])), ('0.bias', tensor([0.5873]))])


Two things are different now: 

* we have an inner loop to load each and every mini-batch from our DataLoader 
* we are now sending only one mini-batch to the device.

### Random Split

PyTorch’s random_split() method is an easy way of performing a training-validation split. Remeber that we need to apply it to the whole dataset (not the training dataset we built). 

In [60]:
from torch.utils.data.dataset import random_split

# convert numpy array to tensor CPU
x_tensor = torch.from_numpy(x).float()
y_tensor = torch.from_numpy(y).float()

# load the data CPU
dataset = TensorDataset(x_tensor, y_tensor)

# split the data into train and validation CPU
train_dataset, val_dataset = random_split(dataset, [80, 20])

# mini-batch for the train dataset
train_loader = DataLoader(dataset=train_dataset, batch_size=16)

# mini-batch for the validation dataset
val_loader = DataLoader(dataset=val_dataset, batch_size=20)

### Evaluation (Last part YAY!)

We need to change the training loop to include the evaluation of our model, that is, computing the validation loss. 

The first step is to include another inner loop to handle the mini-batches that come from the validation loader , sending them to the same device as our model. 

Next, we make predictions using our model and compute the corresponding loss.

And there are TWO things need to consider:

* torch.no_grad(): in the validation inner loop, we shall disable any gradient calculation;

* eval(): the only thing it does is setting the model to evaluation mode (just like its train() counterpart did)

here is our new training part 

In [62]:
losses = []
val_losses = []

train_step = make_train_step(model, loss_fn, optimizer) # recurrent

for epoch in range(n_epochs):
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        loss = train_step(x_batch, y_batch)
        losses.append(loss)
        
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.to(device)
            y_val = y_val.to(device)
            
            model.eval()

            yhat = model(x_val)
            val_loss = loss_fn(y_val, yhat)
            val_losses.append(val_loss.item())

print(model.state_dict())

OrderedDict([('0.weight', tensor([[-0.4869]])), ('0.bias', tensor([0.5873]))])


### the full program

In [64]:
torch.manual_seed(42)

x_tensor = torch.from_numpy(x).float()
y_tensor = torch.from_numpy(y).float()


In [65]:
# Builds dataset with ALL data
dataset = TensorDataset(x_tensor, y_tensor)
# Splits randomly into train and validation datasets
train_dataset, val_dataset = random_split(dataset, [80, 20])
# Builds a loader for each dataset to perform mini-batch gradient descent
train_loader = DataLoader(dataset=train_dataset, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, batch_size=20)

In [66]:
# Builds a simple sequential model
model = nn.Sequential(nn.Linear(1, 1)).to(device)
print(model.state_dict())

OrderedDict([('0.weight', tensor([[-0.9676]])), ('0.bias', tensor([-0.5727]))])


In [67]:
# Sets hyper-parameters
lr = 1e-1
n_epochs = 150

In [68]:
# Defines loss function and optimizer
loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)

In [69]:
losses = []
val_losses = []
# Creates function to perform train step from model, loss and optimizer
train_step = make_train_step(model, loss_fn, optimizer)

In [71]:
# Training loop
for epoch in range(n_epochs):
    # Uses loader to fetch one mini-batch for training
    for x_batch, y_batch in train_loader:
        # NOW, sends the mini-batch data to the device
        # so it matches location of the MODEL
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        # One stpe of training
        loss = train_step(x_batch, y_batch)
        losses.append(loss)
        
    # After finishing training steps for all mini-batches,
    # it is time for evaluation!
        
    # We tell PyTorch to NOT use autograd...
    with torch.no_grad(): # with statement will ensure that you never leave any resource open
        # Uses loader to fetch one mini-batch for validation
        for x_val, y_val in val_loader:
            # Again, sends data to same device as model
            x_val = x_val.to(device)
            y_val = y_val.to(device)
            
            # What is that?!
            model.eval()
            # Makes predictions
            yhat = model(x_val)
            # Computes validation loss
            val_loss = loss_fn(y_val, yhat)
            val_losses.append(val_loss.item())


In [72]:
print(model.state_dict())

OrderedDict([('0.weight', tensor([[1.9625]])), ('0.bias', tensor([1.0147]))])


In [73]:
print(np.mean(losses))

0.028543626425166925


In [74]:
print(np.mean(val_losses))

0.008306218379487595


### Check the link for more details:

https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e