# **Classify ImageNet classes with ResNet50**

In [None]:
import torch
torch.hub._validate_not_a_forked_repo=lambda a,b,c: True
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=True)
model.eval()

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
!unzip images.zip

Archive:  images.zip
replace images/baboon1.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: images/baboon1.jpg      
  inflating: images/baboon2.jpg      
  inflating: images/beagle1.jpg      
  inflating: images/beagle2.jpg      
  inflating: images/boxturtle1.jpg   
  inflating: images/boxturtle2.jpg   
  inflating: images/carbonara1.jpg   
  inflating: images/carbonara2.jpg   
  inflating: images/robin1.jpg       
  inflating: images/robin2.jpg       


In [None]:
import numpy as np
from PIL import Image
from torchvision import transforms

images = ["" for i in range(10)]
images[0] = "images/baboon1.jpg"
images[1] = "images/baboon2.jpg"
images[2] = "images/beagle1.jpg"
images[3] = "images/beagle2.jpg"
images[4] = "images/boxturtle1.jpg"
images[5] = "images/boxturtle2.jpg"
images[6] = "images/carbonara1.jpg"
images[7] = "images/carbonara2.jpg"
images[8] = "images/robin1.jpg"
images[9] = "images/robin2.jpg"

print(images)

['images/baboon1.jpg', 'images/baboon2.jpg', 'images/beagle1.jpg', 'images/beagle2.jpg', 'images/boxturtle1.jpg', 'images/boxturtle2.jpg', 'images/carbonara1.jpg', 'images/carbonara2.jpg', 'images/robin1.jpg', 'images/robin2.jpg']


In [None]:
#Array to store our output classifier tensors
probabilities = [[] for i in range(10)]

# We will now run through each image through the ResNet50 model
for i in range(10):
  
    image = Image.open(images[i])

  # sample execution (requires torchvision)
    preprocess = transforms.Compose([
      transforms.Resize(256),
      transforms.CenterCrop(224),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

    # move the input and model to GPU for speed if available
    if torch.cuda.is_available():
      input_batch = input_batch.to('cuda')
      model.to('cuda')

    with torch.no_grad():
      output = model(input_batch)

    # Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes
    #print(output[0])
    # The output has unnormalized scores. To get probabilities, you can run a softmax on it.
    probabilities[i] = torch.nn.functional.softmax(output[0], dim=0)
    #print(probabilities)  

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [None]:
# Download ImageNet labels
!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt

--2021-11-08 22:24:54--  https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10472 (10K) [text/plain]
Saving to: ‘imagenet_classes.txt.2’


2021-11-08 22:24:54 (60.3 MB/s) - ‘imagenet_classes.txt.2’ saved [10472/10472]



In [None]:
# Read the categories
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]

predictions = []
# Show top categories per image
for j in range(10):
  top5_prob, top5_catid = torch.topk(probabilities[j], 5)
  print('\n')
  for i in range(5):
    #prints top 5 probabilities for each image
    print(categories[top5_catid[i]], top5_prob[i].item())

print('\n')
for j in range(10):
  top_prob, top_catid = torch.topk(probabilities[j], 1)
  predictions.append(categories[top_catid[0]])
  print("{} is a {}".format(images[j],categories[top_catid]))



baboon 0.9308262467384338
guenon 0.051713727414608
macaque 0.0075723351910710335
patas 0.004145955201238394
spider monkey 0.002711112145334482


baboon 0.9265565276145935
guenon 0.022168591618537903
mongoose 0.01639094203710556
patas 0.01306508295238018
macaque 0.004343266598880291


beagle 0.53274005651474
English foxhound 0.2597598433494568
Walker hound 0.15612702071666718
basset 0.008131670765578747
Saint Bernard 0.005750237964093685


beagle 0.9708995223045349
Walker hound 0.015566343441605568
English foxhound 0.007846702821552753
basset 0.004350425209850073
bluetick 0.0005355976754799485


box turtle 0.9461554884910583
mud turtle 0.030782021582126617
terrapin 0.02285844273865223
leatherback turtle 6.119655154179782e-05
loggerhead 4.823731069336645e-05


box turtle 0.875287652015686
terrapin 0.08031152188777924
mud turtle 0.04425995424389839
loggerhead 0.00012548848462756723
leatherback turtle 1.2701329069386702e-05


carbonara 0.9974258542060852
plate 0.000661137280985713
brocco

Accuracy = (# correct)/(# trials)

In 10 trials, the classifier was correct in labelling the input images 10 times, so:

**Accuracy = 1**



Precision: fraction of detections that are correct

**Precision** = TP / (TP + FP)
          = 10 / (10 + 0)
          = **1**

**Recall** = TP / (FN + TP)
       = 10 / (0 + 10)
       = **1**




![](https://drive.google.com/uc?export=view&id=1F19HBt-sRBwz3FPsPgEX00sibSir_QLO)

In [None]:
from sklearn import metrics
actual = ['baboon', 'baboon', 'beagle', 'beagle', 'box turtle', 'box turtle', 'carbonara', 'carbonara', 
          'robin', 'robin']
pic_labels = ['baboon', 'beagle', 'box turtle', 'carbonara', 'robin']

print(metrics.confusion_matrix(actual, predictions, labels = pic_labels))
print(metrics.classification_report(actual, predictions, labels = pic_labels))

[[2 0 0 0 0]
 [0 2 0 0 0]
 [0 0 2 0 0]
 [0 0 0 2 0]
 [0 0 0 0 2]]
              precision    recall  f1-score   support

      baboon       1.00      1.00      1.00         2
      beagle       1.00      1.00      1.00         2
  box turtle       1.00      1.00      1.00         2
   carbonara       1.00      1.00      1.00         2
       robin       1.00      1.00      1.00         2

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



ResNet, short for Residual Network, is a specific type of neural network that is very useful for training large networks with hundreds or even thousands of layers. ResNets are made up of Residual Blocks, which contain a direct connection that may skip some layers in between (identity shortcut connection). These skip connections in ResNet help to solve the vanishing gradient problem in deep neural networks by allowing the shortcut path for the gradient flow through. The connections also allow the model to learn the identity functions which can ensure that the higher layer will always perform at least as good as the lower layer and never perform worse. ResNet contains many pre-trained models that can can be used to classify images, one of which (ResNet 50) I used in this problem to classify 10 images from 5 different classes. For each input image, ResNet outputted the labels with the highest probabilities of matching the image and as seen from the results above, ResNet 50 always correctly classified the input image.

For this problem, I used the following template for ResNet 50: https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/pytorch_vision_resnet.ipynb#scrollTo=bronze-variance

# **Classify MNIST classes with ResNet18**

In [1]:
from torchvision.models import resnet18
from torch import nn
from torch.utils.data import DataLoader

In [2]:
from torchvision.models.resnet import ResNet, BasicBlock
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from tqdm.autonotebook import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import inspect
import time
from torch import nn, optim
import torch
from torchvision.transforms import Compose, ToTensor, Normalize, Resize

In [6]:
model = resnet18(num_classes = 10)
model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

if torch.cuda.is_available():
  model.cuda()

# params you need to specify:
epochs = 5
train_ds = MNIST("mnist", train=True, download=True, transform=ToTensor())
val_ds = MNIST("mnist", train=False, download=True, transform=ToTensor())
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)
loss_function = nn.CrossEntropyLoss() # your loss function, cross entropy works well for multi-class problems

actual = []
predicted = []

# optimizer, I've used Adadelta, as it wokrs well without any magic numbers
optimizer = optim.Adadelta(model.parameters())

start_ts = time.time()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

losses = []
batches = len(train_loader)
val_batches = len(val_loader)

# loop for every epoch (training + evaluation)
for epoch in range(epochs):
    total_loss = 0

    # progress bar (works in Jupyter notebook too!)
    progress = tqdm(enumerate(train_loader), desc="Loss: ", total=batches)

    # ----------------- TRAINING  -------------------- 
    # set model to training
    model.train()
    
    for i, data in progress:
        X, y = data[0].to(device), data[1].to(device)
        
        # training step for single batch
        model.zero_grad()
        outputs = model(X)
        loss = loss_function(outputs, y)
        loss.backward()
        optimizer.step()

        # getting training quality data
        current_loss = loss.item()
        total_loss += current_loss

        # updating progress bar
        progress.set_description("Loss: {:.4f}".format(total_loss/(i+1)))
        
    # releasing unceseccary memory in GPU
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    # ----------------- VALIDATION  ----------------- 
    val_losses = 0
    precision, recall, f1, accuracy = [], [], [], []
    
    # set model to evaluating (testing)
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(val_loader):
            X, y = data[0].to(device), data[1].to(device)

            outputs = model(X) # this gets the prediction from the network

            val_losses += loss_function(outputs, y)

            predicted_classes = torch.max(outputs, 1)[1] # get class from network's prediction
            
            for i in data[1]:
              actual.append(int(i))

            for j in torch.max(outputs, 1)[1]:
              predicted.append(int(j))


Loss:   0%|          | 0/938 [00:00<?, ?it/s]

Loss:   0%|          | 0/938 [00:00<?, ?it/s]

Loss:   0%|          | 0/938 [00:00<?, ?it/s]

Loss:   0%|          | 0/938 [00:00<?, ?it/s]

Loss:   0%|          | 0/938 [00:00<?, ?it/s]

In [7]:
from sklearn import metrics

category_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
print(metrics.confusion_matrix(actual, predicted, labels = category_labels))
print(metrics.classification_report(actual, predicted, labels = category_labels))

[[4883    0    4    1    0    1    5    4    2    0]
 [   0 5664    0    1    0    0    2    5    2    1]
 [   6   10 5091   31    2    0    1   17    2    0]
 [   0    0    3 5021    0   20    0    3    3    0]
 [   0    0    2    0 4878    0    8    0    3   19]
 [   3    0    0   31    0 4415    6    2    2    1]
 [  25   14    1    1    5   15 4715    0   14    0]
 [   0   13   16    3    3    0    0 5069    4   32]
 [   6    6    5   22    5    8   13    5 4788   12]
 [   5    2    0    5   32   23    0   21   14 4943]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4900
           1       0.99      1.00      1.00      5675
           2       0.99      0.99      0.99      5160
           3       0.98      0.99      0.99      5050
           4       0.99      0.99      0.99      4910
           5       0.99      0.99      0.99      4460
           6       0.99      0.98      0.99      4790
           7       0.99      0.99   

For this problem, I used the ResNet 18 network to classify MNIST classes. The MNIST dataset is a dataset of 60,000 small square 28x28 pixel grayscale images of handwritten single digits between 0 and 9. In order to properly classify MNIST classes with ResNet 18, I started by initializing the model similar to my approach in the first problem. Then, I defined the number of epochs for training (5) and defined a train and validation loader along with a loss function. In each epoch, the gradient is computed and a parameter update is done on the current gradient. With each epoch, the total loss decreases and after running through 5 epochs, the results above display the extremely high accuracy (99%)of the ResNet 18 network classifying MNIST classes.

For this problem, I used the following template to classify MNIST classes with ResNet 18: https://zablo.net/blog/post/using-resnet-for-mnist-in-pytorch-tutorial/