In [1]:
import torch

In [2]:
print(torch.__version__) # to check the pytorch version

2.2.2


In [3]:
print(torch.backends.mps.is_available()) # to check if the mps is available

True


In [4]:
# creating pytorch tensors

tensor0d = torch.tensor(1)
print(tensor0d)

tensor1d = torch.tensor([1,2,3])
print(tensor1d)

tensor2d = torch.tensor([[1,2,3],[4,5,6]])
print(tensor2d)

tensor3d = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
print(tensor3d)

tensor(1)
tensor([1, 2, 3])
tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])


In [5]:
# to check the dataype of the tensor
tensor0d.dtype

torch.int64

In [6]:
float_tensor = torch.tensor([1.0,2.0])

In [7]:
# when the values are int then the default type is always torch.int64, when the values are float the default is torch.float32
print(float_tensor.dtype)

torch.float32


In [8]:
# to change the datatype we use .to function
tensor0d_float = tensor0d.to(torch.float32)
print(tensor0d_float.dtype)

torch.float32


In [9]:
# to get the shape we use .shape
tensor2d.shape

torch.Size([2, 3])

In [10]:
# to change the shape of the tensor we .reshape and we can also use the .view
tensor2d_reshaped = tensor2d.reshape([3,2])

In [11]:
tensor2d_reshaped.shape

torch.Size([3, 2])

In [12]:
# .T is used to get the transpose of a matrix
m1 = torch.tensor([[1,2],[3,4]])
m2 = m1.T

print("Orginal matrix")
print(m1)
print("transposed matrix")
print(m2)

Orginal matrix
tensor([[1, 2],
        [3, 4]])
transposed matrix
tensor([[1, 3],
        [2, 4]])


In [13]:
# to do matrix multiplication we can use .matmul and also @
m3 = m2.matmul(m1)
m4 = m2@m1
print(m3, "matrix multiplication using matmul")
print(m4, "matrix multiplication using @")

tensor([[10, 14],
        [14, 20]]) matrix multiplication using matmul
tensor([[10, 14],
        [14, 20]]) matrix multiplication using @


#### Computing Gradients in Torch

In [14]:
import torch.nn.functional as F
from torch.autograd import grad

y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = w1*x1 + b
a = torch.sigmoid(z)

loss = F.binary_cross_entropy(a,y)

gradient_loss_w1 = grad(loss,w1,retain_graph=True)
gradient_loss_b = grad(loss,b,retain_graph=True)

In [15]:
print(gradient_loss_w1, " This is the gradient of loss W.R.T w1")
print(gradient_loss_b, " This is the gradient of loss W.R.T b")

(tensor([-0.0898]),)  This is the gradient of loss W.R.T w1
(tensor([-0.0817]),)  This is the gradient of loss W.R.T b


#### The above process we have done everything manually it is useful for debugging but pytroch has something very simple

In [16]:
import torch.nn.functional as F
from torch.autograd import grad

y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = w1*x1 + b
a = torch.sigmoid(z)

loss = F.binary_cross_entropy(a,y)

loss.backward()
print(w1.grad)
print(b.grad)

tensor([-0.0898])
tensor([-0.0817])


### Code implementing a classic multilayer perceptron with two hidden layers.

1) We use the torch.nn.Module to build our own architecture.
2) We use the init constructor to define the network layers and forward method to see how the inputs pass and interact.
3) We use the .backwards method to inside of the training loop to calculate the gradients.

In [17]:
class NeuralNetwork(torch.nn.Module):

    def __init__(self, num_inputs, num_outputs):
        super().__init__()

        self.layers = torch.nn.Sequential(
            torch.nn.Linear(num_inputs, 30),
            torch.nn.ReLU(),

            torch.nn.Linear(30, 20),
            torch.nn.ReLU(),

            torch.nn.Linear(20, num_outputs),
        )

    def forward(self,x):
        logits = self.layers(x)
        return logits

In [18]:
model = NeuralNetwork(50,3)

In [19]:
print(model) # this is used to see the summary of the model

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)


In [20]:
# To check the number of trainable parameter's

num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_params)

2213


In [21]:
# NeuralNetwork(
#   (layers): Sequential(
#     (0): Linear(in_features=50, out_features=30, bias=True)
#     (1): ReLU()
#     (2): Linear(in_features=30, out_features=20, bias=True)
#     (3): ReLU()
#     (4): Linear(in_features=20, out_features=3, bias=True)
#   )
# )

# To access the paramerters for any layer the above network 
print(model.layers[0].weight)

Parameter containing:
tensor([[ 0.0329,  0.0665,  0.0982,  ...,  0.1395,  0.1274, -0.0592],
        [-0.1240, -0.0373, -0.0909,  ..., -0.1108, -0.0905, -0.0663],
        [-0.0901, -0.1250,  0.0210,  ..., -0.0275, -0.0608, -0.1197],
        ...,
        [ 0.0226,  0.0968, -0.0249,  ..., -0.0012,  0.0424, -0.0984],
        [ 0.0661,  0.0175,  0.1297,  ...,  0.0275,  0.1058,  0.1107],
        [-0.1247,  0.0631, -0.0927,  ..., -0.0347, -0.0386,  0.0465]],
       requires_grad=True)


In [22]:
print(model.layers[0].weight.shape)

torch.Size([30, 50])


In [23]:
# For reproducability purposes we can use manual_seed

torch.manual_seed(123)
model = NeuralNetwork(50,3)
print(model.layers[0].weight)

Parameter containing:
tensor([[-0.0577,  0.0047, -0.0702,  ...,  0.0222,  0.1260,  0.0865],
        [ 0.0502,  0.0307,  0.0333,  ...,  0.0951,  0.1134, -0.0297],
        [ 0.1077, -0.1108,  0.0122,  ...,  0.0108, -0.1049, -0.1063],
        ...,
        [-0.0787,  0.1259,  0.0803,  ...,  0.1218,  0.1303, -0.1351],
        [ 0.1359,  0.0175, -0.0673,  ...,  0.0674,  0.0676,  0.1058],
        [ 0.0790,  0.1343, -0.0293,  ...,  0.0344, -0.0971, -0.0509]],
       requires_grad=True)


In [24]:
x = torch.rand((1,50))
out = model(x)
print(out)

tensor([[-0.1670,  0.1001, -0.1219]], grad_fn=<AddmmBackward0>)


In [25]:
# why we use this during inference is because with grad it will slow down the process. This tells pytroch to not keep track of the 
# gradients hence faster inference.

with torch.no_grad():
    out = model(x)
print(out)

tensor([[-0.1670,  0.1001, -0.1219]])


In [26]:
# we used softmax to get the class membership

with torch.no_grad():
    out = torch.softmax(model(x), dim=1)
print(out)

tensor([[0.2983, 0.3896, 0.3121]])


## Setting up Efficient Data Loaders
1) The custom dataset class is used to instantiate objects that define how each data record is loaded
2) The dataloader class is used to assemble and shuffle the data into batches etc.

In [27]:
# Creating a toy dataset
x_train = torch.tensor([
    [-1.2,3.1],
    [-0.9,2.9],
    [-0.5,2.6],
    [2.3,-1.1],
    [2.7,-1.5]
])

y_train = torch.tensor([0,0,0,1,1])

x_test = torch.tensor([
    [-0.8,2.8],
    [2.6,-1.6]
])

y_test = torch.tensor([0,1])

## Custom dataset class
1) The init constructor is used to take the features and labels it maybe filepath, objects, database connectors etc
2) The getitem method is used to return exactly one feature and it's corresponding label at a particular index.
3) The len method is used to return the length of you train or test dataset.

In [28]:
# Custom dataset class
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    def __init__(self,x,y):
        self.features = x
        self.labels = y

    def __getitem__(self,index):
        one_x = self.features[index]
        one_y = self.labels[index]
        return one_x, one_y

    def __len__(self):
        return self.labels.shape[0]

train_ds = ToyDataset(x_train, y_train)
test_ds = ToyDataset(x_test, y_test)

In [29]:
print(len(train_ds))

5


In [30]:
# DataLoader
from torch.utils.data import DataLoader

torch.manual_seed(123)

train_loader = DataLoader(
    dataset = train_ds,
    batch_size=2,
    shuffle = True,
    num_workers=0,
)

test_loader = DataLoader(
    dataset = test_ds,
    batch_size = 2,
    shuffle = False,
    num_workers=0,
)

In [31]:
for idx, (x,y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x,y)

Batch 1: tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2: tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])
Batch 3: tensor([[ 2.7000, -1.5000]]) tensor([1])


In [32]:
## In the above we can see in the 3rd batch there is only one tensor but we need to 2, because the training dataset has odd number of 
## Records it will not help in convering of training, hence we will drop the last batch, drop_last = True.

train_loader = DataLoader(
    dataset = train_ds,
    batch_size=2,
    shuffle = True,
    num_workers= 0, # This is used for parallel data preprocessing, if it is zero then the model has to wait for the data batch to be 
    # available, if it is greater than one the the x,y is always ready and in queue since multiple workers are working on processing data.
    drop_last= True
)

In [33]:
for idx, (x,y) in enumerate(train_loader):
    print(f"Batch {idx+1}:",x,y)

Batch 1: tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])
Batch 2: tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])


## Training Loop

In [34]:
import torch.nn.functional as F

torch.manual_seed(123)
model = NeuralNetwork(num_inputs = 2, num_outputs = 2)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.5)

num_epochs = 3

for epoch in range(num_epochs):

    model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)

        loss = F.cross_entropy(logits, labels)

        optimizer.zero_grad() # this is used to prevent gradient accumulation
        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
             f" | {batch_idx:03d}/{len(train_loader):03d}"
             f" | Train Loss: {loss:.2f}"
        )

    # model.eval() this is optional

Epoch: 001/003 | 000/002 | Train Loss: 0.75
Epoch: 001/003 | 001/002 | Train Loss: 0.65
Epoch: 002/003 | 000/002 | Train Loss: 0.44
Epoch: 002/003 | 001/002 | Train Loss: 0.13
Epoch: 003/003 | 000/002 | Train Loss: 0.03
Epoch: 003/003 | 001/002 | Train Loss: 0.00


### Why do we need to add model.train() and model.eval()
Ans) Becuase the model behaves differently while training and during evaluation, while training we need dropouts and normalizations but while
inference or testing we do not need those and it is always a better practice to include them in our code, to avoid unexpected behaviour.

## Evaluation Loop

In [35]:
model.eval()
with torch.no_grad():
    outputs = model(x_train)

print(outputs)

tensor([[ 2.8569, -4.1618],
        [ 2.5382, -3.7548],
        [ 2.0944, -3.1820],
        [-1.4814,  1.4816],
        [-1.7176,  1.7342]])


In [36]:
# to obtain the class membership probabilities we apply softmax
torch.set_printoptions(sci_mode=False)
probas = torch.softmax(outputs, dim=1)
print(probas)

tensor([[    0.9991,     0.0009],
        [    0.9982,     0.0018],
        [    0.9949,     0.0051],
        [    0.0491,     0.9509],
        [    0.0307,     0.9693]])


In [37]:
predictions = torch.argmax(probas, dim=1)
print(predictions)

tensor([0, 0, 0, 1, 1])


## Computer Accuracy

In [38]:
def compute_accuracy(model, dataloader):
    model = model.eval()
    correct = 0.0
    total_examples = 0.0

    for idx, (features,labels) in enumerate(dataloader):
        with torch.no_grad():
            logits = model(features)

        predictions = torch.argmax(logits, dim=1)
        compare = labels == predictions
        correct = correct + torch.sum(compare)
        total_examples += len(compare)

    return (correct / total_examples).item()

In [39]:
print(compute_accuracy(model, train_loader))

1.0


In [40]:
print(compute_accuracy(model, test_loader))

1.0


## Saving and loading models

In [41]:
torch.save(model.state_dict(), "model.pth")
# what does state_dict() do is that it will map the layers with respect to their trainable parameters

In [42]:
model.load_state_dict(torch.load("model.pth"))
# what does load_state_dict do is that it will apply the parameters obtained from torch.load to the model.

<All keys matched successfully>

## Pytorch Computations on a GPU

In [43]:
print(torch.backends.mps.is_available())

True


In [44]:
tensor1 = torch.tensor([1,2,3,4])
tensor2 = torch.tensor([1,2,3,4,])
print(tensor1 + tensor2)

tensor([2, 4, 6, 8])


In [45]:
# We use .to() function to place these tensors on to the mps or gpu device.
tensor1 = torch.tensor([1,2,3,4]).to("mps")
tensor2 = torch.tensor([1,2,3,4]).to("mps")
print(tensor1 + tensor2)

tensor([2, 4, 6, 8], device='mps:0')


### Training loop on single GPU

In [46]:
torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2)

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr = 0.5)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()

    for idx, (features, labels) in enumerate(train_loader):
        features, labels = features.to(device), labels.to(device)
        logits = model(features)
        loss = F.cross_entropy(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
             f" | {batch_idx:03d}/{len(train_loader):03d}"
             f" | Train Loss: {loss:.2f}"
        )

Epoch: 001/003 | 001/002 | Train Loss: 0.75
Epoch: 001/003 | 001/002 | Train Loss: 0.65
Epoch: 002/003 | 001/002 | Train Loss: 0.44
Epoch: 002/003 | 001/002 | Train Loss: 0.13
Epoch: 003/003 | 001/002 | Train Loss: 0.03
Epoch: 003/003 | 001/002 | Train Loss: 0.00


In [47]:
# Excerise A.4 to compare the matrix multiplication timings on gpu and cpu
a = torch.rand(100,200).to(torch.float32)
b = torch.rand(100,200).to(torch.float32).T
%timeit a @ b

8.87 µs ± 37 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [48]:
a_mps = torch.rand(100,200).to(torch.float32).to("mps")
b_mps = torch.rand(100,200).to(torch.float32).to("mps").T
%timeit a_mps @ b_mps

21.6 µs ± 252 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### Training with multiple GPU'S
##### Q) Why do we need multiple GPU'S
Ans) We can reduce the training time significantly, it is useful in model development stage because we have to finetune the model
architecture and also parameters.

In [51]:
# DDP (distributed data parellel) :- it is used to split the input data across multiple available devices and process the subsets of the
# data simultaneously.
# Let's consider we have 2 gpu's each of the gpu will have model, and then in every training iteration each model will receive a mini-batch 
# from the data loader we use DistributedSampler(to avoid non-duplplication of the minbatches).