# PyTorch Tutorial 

In [None]:
import torch
import numpy as np

In [None]:
torch.cuda.is_available()

**Contents**
<br> 1. [Tensors](#tensors)
<br> 2. [Introduction to Autograd](#autograd)
<br> 3. [A simple illustration of Backpropagation using Autograd](#backpropagation)
<br> 4. [Illustrative linear regression: A manual implementation](#linear_regression_manual)
<br> 5. [Illustrative linear regression: A Pytorch implementation](#linear_regression_pytorch)
<br> 6. [Linear Regression](#linear_regression)
<br> 7. [Logistic Regression](#logistic_regression)
<br> 8. [Batch Training: Dataset and DataLoader Classes](#dataset_dataloader)
<br> 9. [Dataset Transforms](#dataset_transforms)
<br> 10. [Softmax and Cross Entropy](#softmax_cross_entropy)
<br> 11. [Activation Functions](#activation_functions)

<a id='tensors'></a>
## 1. Tensors

Similar to NumPy where everything is based on arrays and vectors, everything is based on tensor operations in PyTorch. 
<br>Tensors can have different dimensions, 1d, 2d, 3d or higher...

**Creating Tensors**

In [None]:
x = torch.tensor(1)          # creating a simple tensor
print(x)

In [None]:
x = torch.empty(1)           # empty tensor with value not initialized
print(x)

x = torch.empty(3)           # one dimensional empty tensor, with three elements
print(x)

x = torch.empty(2, 3)        # two dimensional tensor
print(x)

x = torch.empty(2, 3, 4)     # three dimensional tensor
print(x)

In [None]:
x = torch.zeros(2, 2)         # two dimensional tensor with all zeros
print(x)
print('----------')

x = torch.ones(2, 2)          # two dimensional tensor with all ones 
print(x)
print('----------')

x = torch.rand(2, 2)          # two dimensional tensor with random values
print(x)

In [None]:
x = torch.ones(2, 2)            # size 
print(x.size())

**Datatype**

In [None]:
x = torch.ones(2, 2)                           # default datatype is float32
print(x.dtype)

x = torch.ones(2, 2, dtype = torch.int)        # we can define the data type
print(x.dtype)

x = torch.ones(2, 2, dtype = torch.double)     # double: float64
print(x.dtype)

x = torch.ones(2, 2, dtype = torch.float16)    # float16
print(x.dtype)

In [None]:
x = torch.tensor(1)                 # default dtype is int64
y = torch.tensor(1.0)               # default dytpe is float32
print(x, y)
print(x.dtype, y.dtype)

**Basic Operations**

In [None]:
x = torch.rand(2, 2)            # element wise addition
y = torch.rand(2, 2)
z = x + y

print(x)
print(y) 
print(z)

z = torch.add(x, y)             # another way for element wise addition
print(z)

In [None]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)

print(x)
print(y)
print('----------')

z = torch.sub(x, y)            # element-wise subtraction
print(z)

z = torch.mul(x, y)            # element-wise multiplication
print(z)

z = torch.div(x, y)            # element-wise division
print(z)

In [None]:
x = torch.ones(2, 2)
y = torch.zeros(2, 2)

print(y)
y.add_(x)       # in place addition: y will be updated to the elementwise sum of x and y
print(y)        # in pytorch, every function with trailing underscore does an inplace operation

**Slicing and Resizing**

In [None]:
x = torch.rand(2, 3)        # slicing

print(x)
print('-----------')

print(x[:, 0])      # all the rows but column 0 (first column): returns in row format, not in column format
print(x[1, :])      # second row and all the columns

print(x[1, 1])         # returns the element at the indices as a tensor
print(x[1, 1].item())  # we can use item() method to get the actual value, instead of the tensor
                       # can be used only when we have one element in the tensor

In [None]:
x = torch.rand(4, 4)         # reshaping 
print(x)

y = x.view(16)               # reshaped the two dimensional array into one dimension
print(y)

y = x.view(-1, 8)            # -1 lets the pytorch find the first dimension automatically
print(y)

#y = x.view(8)               # wont run because of size mismatch 
#print(y)

**Conversion to other data structures**

In [None]:
xl = [1, 2, 3, 4]               # converting list to tensor
xt = torch.tensor(xl)
print(xt)

In [None]:
x = torch.ones(5)           # converting tensor into a numpy array
print(x)
print(type(x))

y = x.numpy()
print(y)
print(type(y))

print('----------')

x.add_(1)       # we added 1 to each element of x 
print(x)
print(y)        # y also got modified as both share the same memory location (if the tensors are on cpu, not gpu)

In [None]:
a = np.ones(5)                  # converting numpy array into tensor
print(a)
print(type(a))

b = torch.from_numpy(a)
print(b)
print(type(b))

print('----------')

a += 1                          # incrementing all elements by 1
print(a)
print(b)                        # b also got modified, when modifying a

In [None]:
if torch.cuda.is_available():

    device = torch.device("cuda")         # specifying the cuda device

    x = torch.ones(5, device = device)    # creates the tensor directly on the gpu

    y = torch.ones(5)                     # second option is to first create it on the cpu
    y = y.to(device)                      # and then move it to the gpu

    z = x + y                             # this operation will be performed on the gpu

    # z.numpy()         # would raise a TypeError: numpy can only handle cpu tensors,
                        # so a gpu tensor cannot be converted directly

    z = z.to("cpu")     # to convert the tensor to numpy, we first move it to the cpu
    print(z.numpy())    # now the conversion works

<a id='autograd'></a>
## 2.  Introduction to Autograd

Let's see how to use the autograd package of PyTorch to compute the gradients.
<br>We need to compute gradients wrt model parameters, in order to optimize the model.

In [None]:
import torch

x = torch.randn(3, requires_grad = True)   # we want to calculate gradient of some function wrt x
print(x)               # think of x as model parameter (theta), and the function as the cost function J
print(x.grad)          # whenever we do operations with this tensor, pytorch will create a computational graph

y = x + 2      # forward propagation: since we specified x with requires_grad = True,
print(y)       # Pytorch creates a gradient function, grad_fn = AddBackward (since the step is addition)
               # This function is then used in backpropagation to calculate gradients: gradient of y wrt x

z = y * y * 2
print(z)           # grad_fn is MulBackward (previous step operation is multiplication)

z = z.mean()
print(z)           # grad_fn in this case is MeanBackward

z.backward()       # calculates dz/dx: gradient of z with respect to x
print(x.grad)      # Note: no argument is required in backward() since z is a scalar

The variables for which requires_grad = True, and the variables which are functions of these variables directly or indirectly define the computation graph.

Note 1: If we do not specify requires_grad = True, it defaults to False; y and z above will then have no grad_fn attribute, and we will not be able to call backward() on the output. (Give it a try!)

Note 2: In z.backward(), no argument is required only when z is a scalar. If z is a vector, we must pass a gradient argument (a tensor with the same shape as z) to backward(). Try calling z.backward() without the mean step: it will not work, because z is a vector in that case.

There are three ways in which we can prevent PyTorch from creating the gradient functions (grad_fn) and tracking the history in the computational graph.
<br>For example, during training loop, when we update our weights, then this operation should not be part of gradient computation.

In [None]:
x = torch.randn(3, requires_grad = True)
print(x)
x.requires_grad_(False)       # option 1: call requires_grad function and set it to False
print(x)                      # any function with trailing underscores make the changes in-place
print('----------')

x = torch.randn(3, requires_grad = True)
print(x)
y = x.detach()                # option 2: create new tensor by detaching the gradient part
print(y)
print('----------')

with torch.no_grad():         # option 3: wrap under with using no_grad function
    y = x + 2
    print(x)                  
    print(y)                  # even though x has requires_grad = True, y has no attribute grad_fn


Thus, if we don't want a variable that is a function of a tensor with requires_grad = True to be part of the computational graph, we have to use one of the above methods.

Whenever we call the backward function, the gradient for the tensor is accumulated into (added to) its .grad attribute, so it has to be reset before the next backward pass. 
<br>Let's create a dummy training example to understand this.

In [None]:
weights = torch.ones(4, requires_grad = True)

for epoch in range(3):
    
    model_output = (weights*3).sum()    # forward propagation
    model_output.backward()             # gradient computation
    print(weights.grad)
    
    weights.grad.zero_()                 # we need to use this to stop accumulation                   

<a id='backpropagation'></a>
## 3. A simple illustration of Backpropagation using Autograd

Let's do one step of forward and backward propagation to understand how it works. 
<br>We take a dummy model with one example (one input x, one output y), and one weight (one feature).

With the model y_hat = w * x and the loss Loss = (y_hat - y)^2, the gradient of the loss with respect to the weight w is: dw = d(Loss)/dw = (d(Loss)/dy_hat) * (dy_hat/dw) = 2(y_hat - y) * x = 2(1 - 2) * 1 = -2

In [None]:
x = torch.tensor(1.0)       # input feature: value 1
y = torch.tensor(2.0)       # target feature: value 2

w = torch.tensor(1.0, requires_grad = True)     # weight (initial value 1): 
                                                # model parameter so we set requires_grad = True

y_hat = w * x               # forward pass
loss = (y_hat - y)**2
print(loss)

loss.backward()             # backward pass
print(w.grad)               # we have the gradient value after the first pass

The output w.grad = -2, matching with our exact calculation for this dummy problem

<a id='linear_regression_manual'></a>
## 4. Illustrative linear regression: A manual implementation

Before we learn to develop a machine learning model using PyTorch, let's do it manually first, taking the case of linear regression.
<br> Here, we manually construct the steps for prediction, loss calculation and parameter update, using Autograd package only for computing gradients.
<br> In the next step, we replace these manual updates using PyTorch utilities.

In [None]:
import torch

X = torch.tensor([1, 2, 3, 4], dtype = torch.float32)   # Our dataset is such that we expect weight w to be 2
y = torch.tensor([2, 4, 6, 8], dtype = torch.float32) 

w = torch.tensor(0.0, dtype = torch.float32, requires_grad = True)

def forward(x):
    """Linear model without bias: scale the input x by the module-level weight w."""
    prediction = w * x
    return prediction

def loss(y, y_pred):
    """Mean squared error between the targets y and the predictions y_pred."""
    residual = y - y_pred                # element-wise prediction error
    return residual.pow(2).mean()

learning_rate = 0.01
n_epochs = 100

for epoch in range(n_epochs):
    
    y_pred = forward(X)             # forward propagation
    l = loss(y, y_pred)             # calculate loss
    l.backward()                    # backprop to calculate gradient: d(loss)/dw
    
    with torch.no_grad():            # to prevent this calculation to be part of computational graph
        w -= learning_rate * w.grad  # update weight
        
    w.grad.zero_()                     # zero gradients
      
    if (epoch+1) % 10 == 0:
        print(f'epoch: {epoch+1}; loss: {l:.6f}; weight: {w:.6f}')

print(f'Prediction after training: f(5) = {forward(5):.3f}')

<a id='linear_regression_pytorch'></a>
## 5. Illustrative linear regression: A Pytorch implementation

Now, lets move to the next step, 
 - replacing manually computed loss and parameter updates by using Loss and Optimizer classes.
 - replacing the manually computed model prediction by implementing a PyTorch model. 

In general, training a machine learning model has four steps:
 - forward propagation to compute the prediction: we define a model for this step
 - loss computation: we define a loss function for this step
 - backpropagation for gradient computation: we just do loss.backward()
 - parameters update: we need to define an optimizer, with model parameters and learning rate 

In [None]:
import torch
import torch.nn as nn       # we import the neural network to use some functions

X = torch.tensor([[1], [2], [3], [4]], dtype = torch.float32)   # we change X, y to be 2d arrays:
y = torch.tensor([[2], [4], [6], [8]], dtype = torch.float32)   # one row per sample, one column per feature/target
n_samples, n_features = X.shape

# we do not need to define the weight w explicitly, as our PyTorch model knows the parameters

# model designing
input_size = n_features
output_size = y.shape[1]                    # one output per target column (independent of n_features)
model = nn.Linear(input_size, output_size)  # we define layer here: we have only one layer
                                            # we use the built-in layer, Linear

# loss and optimizer
learning_rate = 0.01
loss = nn.MSELoss()                 # replacing the manually defined loss, MSE: Mean Squared Error
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)   # defining an optimizer

# training loop
n_epochs = 1000
for epoch in range(n_epochs):

    y_pred = model(X)           # forward propagation
    l = loss(y_pred, y)         # calculate loss: PyTorch losses expect (prediction, target) in this order
    l.backward()                # backprop to calculate gradient: d(loss)/dw
    optimizer.step()            # replacing the manual update weight

    optimizer.zero_grad()       # setting the gradient zero, otherwise gradients accumulate

    if (epoch+1) % 50 == 0:
        w, b = model.parameters()       # unpack the parameters: w has shape (output_size, input_size)
        print(f'epoch: {epoch+1}; loss: {l.item():.6f}; weight: {w[0][0].item():.6f}')

X_test = torch.tensor([5], dtype = torch.float32)
print(f'Prediction after training: f(5) = {model(X_test).item():.3f}')

**Custom Model**

In the above implementation, we did not have to come up with a model for ourselves. We needed only one layer, and it was provided by PyTorch. 
<br> In most cases, we will need to build a custom model. Let's implement the code again by building a custom model.

Let's first define the class for custom model

In [None]:
import torch
import torch.nn as nn   

class LinearRegression(nn.Module):
    """Custom linear regression model: a thin wrapper around a single nn.Linear layer."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # the only layer of the model: maps input_dim features to output_dim outputs
        self.lin = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the prediction of the linear layer for the input x."""
        prediction = self.lin(x)
        return prediction

Now, we define our model by using this custom defined class

In [None]:
X = torch.tensor([[1], [2], [3], [4]], dtype = torch.float32)   # one row per sample, one column per feature
y = torch.tensor([[2], [4], [6], [8]], dtype = torch.float32)   # one row per sample, one column per target
n_samples, n_features = X.shape

# model designing
input_size = n_features
output_size = y.shape[1]                             # one output per target column (independent of n_features)
model = LinearRegression(input_size, output_size)    # defining the model with our custom class

# loss and optimizer
learning_rate = 0.01
loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

# training loop
n_epochs = 1000
for epoch in range(n_epochs):

    y_pred = model(X)           # forward propagation
    l = loss(y_pred, y)         # calculate loss: PyTorch losses expect (prediction, target) in this order
    l.backward()                # backprop to calculate gradient: d(loss)/dw
    optimizer.step()            # replacing the manual update weight

    optimizer.zero_grad()       # setting the gradient zero, otherwise gradients accumulate

    if (epoch+1) % 50 == 0:
        w, b = model.parameters()       # unpack the parameters: weight matrix and bias of the linear layer
        print(f'epoch: {epoch+1}; loss: {l.item():.6f}; weight: {w[0][0].item():.6f}')

X_test = torch.tensor([5], dtype = torch.float32)
print(f'Prediction after training: f(5) = {model(X_test).item():.3f}')

<a id='linear_regression'></a>
## 6. Linear Regression

Now, lets build a regression model using datasets provided by sklearn.

In [None]:
import torch
import torch.nn as nn
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline

# generating regression data
X_numpy, y_numpy = datasets.make_regression(n_samples = 100, 
                                            n_features = 1, 
                                            noise = 20, 
                                            random_state = 1)

X = torch.from_numpy(X_numpy.astype(np.float32))  # converting numpy array into tensors
y = torch.from_numpy(y_numpy.astype(np.float32))
y = y.view(y.shape[0], 1)                         # convert y into one column (it is one row)

print(X.shape, y.shape)

n_samples, n_features = X.shape

# model designing
input_size = n_features
output_size = 1
model = nn.Linear(input_size, output_size)

# loss and optimizer
learning_rate = 0.01
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

# training loop
n_epochs = 100
for epoch in range(n_epochs):
    
    y_pred = model(X)
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    if (epoch+1) % 10 == 0:
        print(f'epoch = {epoch+1}; loss = {loss.item():.4f}')
        
        
predicted = model(X).detach().numpy()
plt.plot(X_numpy, y_numpy, 'ro')
plt.plot(X_numpy, predicted, 'b')
plt.show()

<a id='logistic_regression'></a>
## 7. Logistic Regression

Let's build a logistic regression model to classify breast tumors as malignant or benign, using the breast cancer dataset provided by sklearn.

In [None]:
import torch
import torch.nn as nn
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

bc = datasets.load_breast_cancer()        # generating logistic regression data
X, y = bc.data, bc.target

n_samples, n_features = X.shape
print(n_samples, n_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

sc = StandardScaler()                 # to scale our features having zero mean and unit variance
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32))
y_test = torch.from_numpy(y_test.astype(np.float32))

print('X shape: ', X_train.shape, X_test.shape)

print('y shape: ', y_train.shape, y_test.shape)

y_train = y_train.view(y_train.shape[0], 1)     # making the tensor two dimensional
y_test = y_test.view(y_test.shape[0], 1)

print('y shape: ', y_train.shape, y_test.shape)

# model designing: here we write our own class 
class LogisticRegression(nn.Module):
    """Logistic regression: one linear layer with a single output, followed by a sigmoid."""

    def __init__(self, n_input_features):
        super().__init__()
        self.linear = nn.Linear(n_input_features, 1)    # single output node

    def forward(self, x):
        """Return the sigmoid of the linear layer output, a value between 0 and 1."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

model = LogisticRegression(n_features)

# loss and optimizer
learning_rate = 0.01
criterion = nn.BCELoss()            # BCE: Binary Cross Entropy loss
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

# training loop
n_epochs = 100
for epoch in range(n_epochs):
    
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    if (epoch+1) % 10 == 0:
        print(f'epoch: {epoch+1}; loss = {loss.item():.4f}')
        
with torch.no_grad():                   # to avoid this part of the computational graph
    y_pred = model(X_test)
    y_pred_class = y_pred.round()
    accuracy = y_pred_class.eq(y_test).sum()/float(y_test.shape[0])
    print(f'accuracy = {accuracy:.4f}')

<a id='dataset_dataloader'></a>
## 8. Batch Training: Dataset and DataLoader Classes

If we use our whole training data at each step for optimizing our model, the computation will be time consuming. 
<br>So, a better way to deal with large datasets is to divide the whole sample into small batches and optimize our model on batches one at a time. 
<br>For this purpose, we have Dataset and DataLoader classes in PyTorch.

Let's define some terms for clarity:
 - epoch: one forward and backward pass of all training samples
 - batch_size: number of training samples in one forward and backward pass
 - number of iterations: number of passes, each pass using [batch size] number of samples
 - For example, with 100 training samples, and batch_size = 20, we have 100/20 = 5 iterations for one epoch
 
Info about the dataset wine.csv:
 - first column is class labels: three categories of wine labeled 1, 2, 3
 - rest of the columns are features

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

# implementing our custom dataset
class WineDataset(Dataset):              # inherits Dataset
    """Dataset read from a comma separated file with one header row.

    The first column is used as the target and the remaining columns as the features;
    both are stored as float32 tensors.

    Args:
        path: location of the csv file; defaults to 'wine.csv' in the working directory.
    """

    def __init__(self, path = 'wine.csv'):    # load dataset, split into x and y
        # skiprows = 1 drops the header row
        # ndmin = 2 keeps the data two dimensional even if the file has a single data row
        xy = np.loadtxt(path, delimiter = ",", dtype = np.float32, skiprows = 1, ndmin = 2)
        self.x = torch.from_numpy(xy[:, 1:])  # all the rows, all the columns except first one
        self.y = torch.from_numpy(xy[:, [0]]) # all the rows, first column only (kept as a column)
        self.n_samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the (features, target) pair stored at the given index."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples (data rows) in the file."""
        return self.n_samples

dataset = WineDataset()
print('feature/target shape:', dataset.x.size(), dataset.y.size())
print('----------')

# verifying the WineDataset class
first_data = dataset[0]              # get the first row
print(first_data)                    # tuple of two tensors: feature and target
features, labels = first_data        # unpack
print(features, labels)
print('----------')

batch_size = 4
dataloader = DataLoader(dataset = dataset, 
                        batch_size = batch_size, 
                        shuffle = True, 
                        num_workers = 2)
datatiter = iter(dataloader)                 # converting the object to an iterator
data = next(datatiter)                       # obtaining the next batch of data with the builtin next();
                                             # the .next() method is not available in recent PyTorch versions
features, labels = data                      # unpack
print(features, labels)                      # four examples, as our batch_size is 4

# let's create a dummy training loop to iterate over the whole dataloader
n_epochs = 2
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/batch_size)

for epoch in range(n_epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        if (i+1) % 5 == 0:
            print(f'epoch: {epoch+1}/{n_epochs}, step: {i+1}/{n_iterations}, inputs: {inputs.shape} ')

<a id='dataset_transforms'></a>
## 9. Dataset Transforms


Transforms help us apply transformations to datasets. PyTorch has a lot of built-in transform classes which we can use. Additionally, we can define our own custom transformations.

In the previous section, we implemented a custom WineDataset. Let's extend this class to support data transformation, and define custom transforms.

In [None]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np

class WineDataset(Dataset):
    """Dataset read from a comma separated file with one header row, with optional transform.

    The first column is used as the target and the remaining columns as the features.
    The data is kept as numpy arrays, so that a transform can decide how to convert it.

    Args:
        transform: optional callable applied to each (features, target) sample.
        path: location of the csv file; defaults to 'wine.csv' in the working directory.
    """

    def __init__(self, transform = None, path = 'wine.csv'):
        # skiprows = 1 drops the header row
        # ndmin = 2 keeps the data two dimensional even if the file has a single data row
        xy = np.loadtxt(path, delimiter = ',', dtype = np.float32, skiprows = 1, ndmin = 2)
        self.x = xy[:, 1:]       # we do not convert x, y to tensors
        self.y = xy[:, [0]]
        self.n_samples = xy.shape[0]
        self.transform = transform

    def __getitem__(self, index):
        """Return the (features, target) sample at index, transformed if a transform was given."""
        sample = self.x[index], self.y[index]
        if self.transform is not None:             # apply transformation if available
            sample = self.transform(sample)
        return sample

    def __len__(self):
        """Return the number of samples (data rows) in the file."""
        return self.n_samples
    
# defining a custom transform class 
class ToTensor:
    """Custom transform: converts a (inputs, targets) pair of numpy arrays into tensors."""

    def __call__(self, sample):
        features, labels = sample
        # from_numpy does not copy: each tensor shares memory with its source array
        converted = tuple(torch.from_numpy(array) for array in (features, labels))
        return converted
    
# defining another custom tranform class
class MulTransform:                                # multiplication transform
    """Custom transform: multiplies the inputs of a sample by a constant factor.

    Args:
        factor: the number each input value is multiplied with.
    """

    def __init__(self, factor):
        self.factor = factor

    def __call__(self, sample):
        """Return (inputs * factor, target); the sample passed in is left unchanged."""
        inputs, target = sample
        # Multiply out of place: the inputs may share memory with the dataset
        # (row views of the data array, tensors created by torch.from_numpy), so an
        # in-place `*=` would rescale the stored data again on every access.
        scaled = inputs * self.factor
        return scaled, target
    
dataset = WineDataset()                            # not calling the ToTensor transform
first_data = dataset[0]
features, labels = first_data
print(type(features), type(labels))                # class numpy.ndarray
print(features)
print('----------')

dataset = WineDataset(transform = ToTensor())      # calling the ToTensor transform 
first_data = dataset[0] 
features, labels = first_data
print(type(features), type(labels))                # class: torch.Tensor
print(features)
print('----------')

composed = torchvision.transforms.Compose([ToTensor(), MulTransform(2)])  # defining a composed transform
dataset = WineDataset(transform = composed)
first_data = dataset[0] 
features, labels = first_data
print(type(features), type(labels))
print(features)                           # each feature value got doubled

<a id='softmax_cross_entropy'></a>
## 10. Softmax and Cross-Entropy 

Now, let's discuss softmax function and the cross-entropy loss, most commonly used functions in neural networks.

**Softmax**

In [None]:
import torch
import numpy as np

def softmax(x):
    """Softmax along axis 0: exponentiate the values and normalize them to sum to one.

    The maximum is subtracted before exponentiating. This does not change the result
    (the common factor cancels out), but it prevents overflow for large input values.
    """
    shifted = x - np.max(x, axis = 0, keepdims = True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis = 0, keepdims = True)

x = np.array([2.0, 1.0, 0.1])             # using numpy
outputs = softmax(x)
print(outputs)
print('----------')

x = torch.tensor([2.0, 1.0, 0.1])         # using torch             
outputs = torch.softmax(x, dim = 0)
print(outputs)

**Cross-Entropy Loss**

Now, let's discuss cross-entropy loss, which measures the performance in multi-class classification problems. 
The loss increases as the predicted probability diverges from the actual label.

Let's assume we have one sample, and three classes

In [None]:
import torch
import torch.nn as nn
import numpy as np

def cross_entropy(predicted, actual):
    """Cross-entropy loss between predicted probabilities and one-hot encoded labels.

    Args:
        predicted: array of predicted probabilities (numbers after applying the softmax).
        actual: one-hot encoded array of the true class, same shape as predicted.

    Returns:
        The loss -sum(actual * log(predicted)).
    """
    eps = 1e-15
    # clip from below: a probability of exactly 0 would give log(0) = -inf,
    # and 0 * -inf = nan for the classes that are not the true class
    clipped = np.clip(predicted, eps, None)
    loss = -np.sum(actual * np.log(clipped))
    return loss

y = np.array([1, 0, 0])                             # must be one-hot encoded: three classes here
y_pred_good = np.array([0.6590, 0.2424, 0.0986])    # numbers after applying the softmax
y_pred_bad = np.array([0.1, 0.3, 0.6])
loss_good = cross_entropy(y_pred_good, y)
loss_bad = cross_entropy(y_pred_bad, y)
print('Numpy: ', loss_good, loss_bad)

# In PyTorch implementation, there are two differences
# first, we provide the correct class label, not the one-hot encoded
# second, softmax is not needed, it is already implemented

loss = nn.CrossEntropyLoss()
y = torch.tensor([0])                # correct class label which is 0, not the one-hot encoded
y_pred_good = torch.tensor([[2.0, 1.0, 0.1]])   # CrossEntropyLoss already has inbuilt softmax
y_pred_bad = torch.tensor([[0.5, 2.0, 0.3]])    # we pass the output without applying the softmax
loss_good = loss(y_pred_good, y)    # y_pred dimension should be n_samples * n_classes
loss_bad = loss(y_pred_bad, y)
print('PyTorch:', loss_good.item(), loss_bad.item())

# to get the actual predictions
_, prediction1 = torch.max(y_pred_good, 1)         # 1: along the first dimensions
_, prediction2 = torch.max(y_pred_bad, 1)
print(prediction1, prediction2)                    # chooses the class with highest probability

The loss in PyTorch allows for multiple samples. Let's assume we have 3 samples and 3 classes.

In [None]:
import torch
import torch.nn as nn

loss = nn.CrossEntropyLoss()

y = torch.tensor([2, 0, 1])                   # the actual labels for the three samples
y_pred = torch.tensor([[0.1, 1.0, 2.0], 
                       [2.0, 1.0, 0.1], 
                       [0.1, 2.0, 1.0]])      # highest value for the correct labels

closs = loss(y_pred, y)
_, predictions = torch.max(y_pred, 1)
print(closs, predictions)

**Note:** In multi-class classification problem, we use nn.CrossEntropyLoss() at the end of the neural network to compute the loss. Since nn.CrossEntropyLoss() automatically implements the softmax, we do not implement the softmax layer before the loss calculation, to convert the numbers into probabilities.

However, in case of binary classification, we use nn.BCELoss() to calculate the loss. In binary classification, we have only one node in the end, and we apply sigmoid function on the output of this node, and then feed it to nn.BCELoss(). The loss function nn.BCELoss() does not automatically implement the sigmoid function.

<a id='activation_functions'></a>
## 11. Activation Functions

Some of the most popular activation functions are:
 - Sigmoid: Typically used in the last layer of a binary classification problem
 - Tanh: A good choice for hidden layers
 - ReLU: Most popular choice for hidden layers
 - Leaky ReLU: Improved version of ReLU; it keeps a small non-zero slope for negative inputs, which addresses the "dying ReLU" problem (units whose gradient is stuck at zero)
 - Softmax: Good in last layer in multiclass classification problem
 
Let's write an illustrative code showing how to use these activation functions

In [None]:
import torch
import torch.nn as nn

class NeuralNet(nn.Module):
    """Two-layer network for binary classification, with the activations stored as modules."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # hidden part: linear layer followed by the ReLU activation
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        # output part: linear layer with one node, followed by the Sigmoid activation
        self.linear2 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Chain the layers defined above: linear1 -> relu -> linear2 -> sigmoid."""
        hidden = self.relu(self.linear1(x))
        return self.sigmoid(self.linear2(hidden))

Another way to write the above code is by using the activation functions directly in forward pass

In [None]:
import torch
import torch.nn as nn

class NeuralNet(nn.Module):
    """Two-layer network for binary classification, with the activations applied as functions."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # only the layers with parameters are defined here
        self.linear1 = nn.Linear(input_size, hidden_size)      # hidden layer
        self.linear2 = nn.Linear(hidden_size, 1)               # output layer with one node

    def forward(self, x):
        """Apply linear1 -> relu -> linear2 -> sigmoid, calling the activations from torch directly."""
        hidden = torch.relu(self.linear1(x))
        logits = self.linear2(hidden)
        return torch.sigmoid(logits)

Both versions behave the same; it's just a matter of taste which one to write.