In [1]:
!python --version

Python 3.9.18


In [2]:
import torch
torch.__version__

  from .autonotebook import tqdm as notebook_tqdm


'1.12.1'

In [3]:
import matplotlib.pyplot as plt
import torch
import torchvision

from torch import nn
from torchvision import transforms

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
import os
class_names=['garlic_bread', 'hot_dog', 'ice_cream', 'omelette', 'pizza']
from torchvision import datasets, transforms

In [6]:
# 1. Create a class which subclasses nn.Module
class PatchEmbedding(nn.Module):
    """Turns a 2D input image into a 1D sequence learnable embedding vector.

    The image is split into non-overlapping square patches by a strided
    convolution (kernel size == stride == ``patch_size``); each patch becomes
    one ``embedding_dim``-sized vector.

    Args:
        in_channels (int): Number of color channels for the input images. Defaults to 3.
        patch_size (int): Size of patches to convert input image into. Defaults to 16.
        embedding_dim (int): Size of embedding to turn image into. Defaults to 768.
    """
    def __init__(self,
                 in_channels:int=3,
                 patch_size:int=16,
                 embedding_dim:int=768):
        super().__init__()

        # Keep the configured patch size so forward() validates inputs against
        # the value the convolution was actually built with.
        self.patch_size = patch_size

        # Convolution whose kernel and stride both equal the patch size:
        # every output position corresponds to exactly one image patch.
        self.patcher = nn.Conv2d(in_channels=in_channels,
                                 out_channels=embedding_dim,
                                 kernel_size=patch_size,
                                 stride=patch_size,
                                 padding=0)

        # Collapse the two spatial feature-map dimensions (dims 2 and 3) into one.
        self.flatten = nn.Flatten(start_dim=2,
                                  end_dim=3)

    def forward(self, x):
        """Embed a batch of images as a sequence of patch vectors.

        Args:
            x: Image batch whose last dimension is divisible by ``patch_size``.

        Returns:
            The flattened convolution output with its last two dimensions
            swapped, so the embedding dimension comes last.

        Raises:
            AssertionError: If the last dimension of ``x`` is not divisible by
                the configured patch size.
        """
        patch_size = self.patch_size
        # Check that inputs are the correct shape
        image_resolution = x.shape[-1]
        assert image_resolution % patch_size == 0, f"Input image size must be divisble by patch size, image shape: {image_resolution}, patch size: {patch_size}"

        # Patchify, then flatten the spatial grid into a single sequence axis
        x_patched = self.patcher(x)
        x_flattened = self.flatten(x_patched)

        # Move the embedding dimension to the end: [batch, embedding, patches] -> [batch, patches, embedding]
        return x_flattened.permute(0, 2, 1)

In [7]:
# 1. Create a class that inherits from nn.Module
class MultiheadSelfAttentionBlock(nn.Module):
    """Layer-normalised multi-head self-attention block ("MSA block").

    The input is layer-normalised and then used as query, key and value of a
    single ``nn.MultiheadAttention`` layer. No residual connection is added
    here.

    Args:
        embedding_dim (int): Size of each token embedding. Defaults to 768.
        num_heads (int): Number of attention heads. Defaults to 12.
        attn_dropout (float): Dropout passed to the attention layer. Defaults to 0.
    """
    def __init__(self,
                 embedding_dim:int=768,
                 num_heads:int=12,
                 attn_dropout:float=0):
        super().__init__()

        # Normalisation applied to the tokens before attention
        self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)

        # Attention layer; batch_first=True means inputs are (batch, sequence, embedding)
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=num_heads,
            dropout=attn_dropout,
            batch_first=True,
        )

    def forward(self, x):
        """Return the self-attention output for ``x`` (attention weights are not computed)."""
        normed = self.layer_norm(x)
        # Query, key and value are all the same normalised tokens (self-attention);
        # element 0 of the returned pair is the attention output.
        return self.multihead_attn(query=normed,
                                   key=normed,
                                   value=normed,
                                   need_weights=False)[0]

In [8]:
# import torch.nn as nn
# class BN_bnc(nn.BatchNorm1d):
#     """
#     BN_bnc: BatchNorm1d on hidden feature with (B,N,C) dimension
#     """

#     def forward(self, x):
#         B, N, C = x.shape
#         x = x.reshape(B * N, C)  # (B,N,C) -> (B*N,C)
#         x = super().forward(x)   # apply batch normalization
#         x = x.reshape(B, N, C)   # (B*N,C) -> (B,N,C)
#         return x



# 1. Create a class that inherits from nn.Module
class MLPBlock(nn.Module):
    """Creates a layer normalized multilayer perceptron block ("MLP block" for short).

    The input is layer-normalised and passed through
    Linear -> GELU -> Dropout -> Linear -> Dropout. No residual connection is
    added here.

    Args:
        embedding_dim (int): Input and output feature size. Defaults to 768.
        mlp_size (int): Hidden feature size between the two linear layers. Defaults to 3072.
        dropout (float): Dropout probability applied after each linear layer. Defaults to 0.3.
    """
    def __init__(self,
                 embedding_dim:int=768,
                 mlp_size:int=3072,
                 dropout:float=0.3):
        super().__init__()

        # Normalisation applied to the tokens before the MLP
        self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)

        # NOTE: the positions inside this Sequential determine the parameter
        # names ("mlp.0.*", "mlp.3.*"); keep the layer order unchanged so
        # existing checkpoints keep loading.
        self.mlp = nn.Sequential(
            nn.Linear(in_features=embedding_dim,
                      out_features=mlp_size),
            nn.GELU(),
            nn.Dropout(p=dropout),
            nn.Linear(in_features=mlp_size, # must match out_features of the first linear layer
                      out_features=embedding_dim), # project back to embedding_dim
            nn.Dropout(p=dropout)
        )

    def forward(self, x):
        """Return the MLP output for the layer-normalised ``x``."""
        x = self.layer_norm(x)
        x = self.mlp(x)
        return x

In [9]:
# 1. Create a class that inherits from nn.Module
class TransformerEncoderBlock(nn.Module):
    """One Transformer encoder layer: an MSA block then an MLP block, each with a residual connection.

    Args:
        embedding_dim (int): Size of each token embedding. Defaults to 768.
        num_heads (int): Number of attention heads. Defaults to 12.
        mlp_size (int): Hidden size of the MLP block. Defaults to 3072.
        mlp_dropout (float): Dropout used inside the MLP block. Defaults to 0.3.
        attn_dropout (float): Dropout used inside the attention block. Defaults to 0.
    """
    def __init__(self,
                 embedding_dim:int=768,
                 num_heads:int=12,
                 mlp_size:int=3072,
                 mlp_dropout:float=0.3,
                 attn_dropout:float=0):
        super().__init__()

        # Self-attention sub-layer
        self.msa_block = MultiheadSelfAttentionBlock(
            embedding_dim=embedding_dim,
            num_heads=num_heads,
            attn_dropout=attn_dropout,
        )

        # Feed-forward sub-layer
        self.mlp_block = MLPBlock(
            embedding_dim=embedding_dim,
            mlp_size=mlp_size,
            dropout=mlp_dropout,
        )

    def forward(self, x):
        """Apply attention and MLP sub-layers, adding each sub-layer's input back to its output."""
        # Residual connection around the attention block
        after_attention = self.msa_block(x) + x

        # Residual connection around the MLP block
        return self.mlp_block(after_attention) + after_attention
    


In [10]:
# 1. Create a ViT class that inherits from nn.Module
class ViT(nn.Module):
    """Creates a Vision Transformer architecture with ViT-Base hyperparameters by default.

    Args:
        img_size (int): Side length of the (square) input images. Defaults to 224.
        in_channels (int): Number of channels in the input image. Defaults to 3.
        patch_size (int): Side length of each square patch. Defaults to 16.
        num_transformer_layers (int): Number of stacked encoder blocks. Defaults to 12.
        embedding_dim (int): Size of each token embedding. Defaults to 768.
        mlp_size (int): Hidden size of each MLP block. Defaults to 3072.
        num_heads (int): Number of attention heads per block. Defaults to 12.
        attn_dropout (float): Dropout inside the attention layers. Defaults to 0.2.
        mlp_dropout (float): Dropout inside the MLP blocks. Defaults to 0.2.
        embedding_dropout (float): Dropout applied after adding the position
            embedding. Defaults to 0.2.
        num_classes (int): Number of output logits. Defaults to 1000.

    Raises:
        AssertionError: If ``img_size`` is not divisible by ``patch_size``.
    """
    def __init__(self,
                 img_size:int=224, # Training resolution from Table 3 in ViT paper
                 in_channels:int=3, # Number of channels in input image
                 patch_size:int=16, # Patch size
                 num_transformer_layers:int=12, # Layers from Table 1 for ViT-Base
                 embedding_dim:int= 768, # Hidden size D from Table 1 for ViT-Base
                 mlp_size:int=3072, # MLP size from Table 1 for ViT-Base
                 num_heads:int=12, # Heads from Table 1 for ViT-Base
                 attn_dropout:float=0.2, # Dropout for attention projection
                 mlp_dropout:float=0.2, # Dropout for dense/MLP layers
                 embedding_dropout:float=0.2, # Dropout for patch and position embeddings
                 num_classes:int=1000): # Default for ImageNet but can customize this
        super().__init__()

        # Make sure the image size is divisible by the patch size
        assert img_size % patch_size == 0, f"Image size must be divisible by patch size, image size: {img_size}, patch size: {patch_size}."

        # Number of patches per image (height * width / patch^2)
        self.num_patches = (img_size * img_size) // patch_size**2

        # Learnable class token, prepended to the patch sequence in forward()
        self.class_embedding = nn.Parameter(data=torch.randn(1, 1, embedding_dim),
                                            requires_grad=True)

        # Learnable position embedding: one vector per patch plus one for the class token
        self.position_embedding = nn.Parameter(data=torch.randn(1, self.num_patches+1, embedding_dim),
                                               requires_grad=True)

        # Dropout applied to the embedded sequence
        self.embedding_dropout = nn.Dropout(p=embedding_dropout)

        # Patch embedding layer
        self.patch_embedding = PatchEmbedding(in_channels=in_channels,
                                              patch_size=patch_size,
                                              embedding_dim=embedding_dim)

        # Stack of encoder blocks. attn_dropout is forwarded explicitly: previously
        # it was accepted by this constructor but never passed on, so the blocks
        # always used their own default of 0. This only affects train mode;
        # parameters and eval-mode outputs are unchanged.
        self.transformer_encoder = nn.Sequential(*[TransformerEncoderBlock(embedding_dim=embedding_dim,
                                                                            num_heads=num_heads,
                                                                            mlp_size=mlp_size,
                                                                            mlp_dropout=mlp_dropout,
                                                                            attn_dropout=attn_dropout) for _ in range(num_transformer_layers)])

        # Classifier head
        self.classifier = nn.Sequential(
            nn.LayerNorm(normalized_shape=embedding_dim),
            nn.Linear(in_features=embedding_dim,
                      out_features=num_classes)
        )
        # Not used by forward() (which returns raw logits); kept for callers that
        # want probabilities. dim=1 is the class dimension of the [batch, classes]
        # logits and avoids the implicit-dim deprecation warning.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return one row of ``num_classes`` raw logits per input image (no softmax applied)."""
        # Batch size, used to replicate the class token
        batch_size = x.shape[0]

        # Expand the single class token to one per sample ("-1" keeps that dimension's size)
        class_token = self.class_embedding.expand(batch_size, -1, -1)

        # Patch embedding (equation 1)
        x = self.patch_embedding(x)

        # Prepend the class token to the patch sequence (equation 1)
        x = torch.cat((class_token, x), dim=1)

        # Add the position embedding (equation 1)
        x = self.position_embedding + x

        # Embedding dropout (Appendix B.1)
        x = self.embedding_dropout(x)

        # Transformer encoder layers (equations 2 & 3)
        x = self.transformer_encoder(x)

        # Classify from the token at sequence index 0, i.e. the class token (equation 4)
        x = self.classifier(x[:, 0])

        return x

In [11]:
# Load the trained model for inference

# Create an instance of ViT with one output logit per entry in class_names
vit = ViT(num_classes=len(class_names))
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; the model is moved to the target device later via model.to(device).
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# NOTE: strict=False tolerates missing/unexpected keys, so check the result
# displayed below to confirm that all keys matched.
vit.load_state_dict(torch.load('E:/Deep learning/Project/code_data/Image-Classification-Using-Vision-transformer-main/weights/last_2/8.pt', map_location="cpu"), strict=False)



<All keys matched successfully>

In [None]:
import torch
import torchvision
from torchvision import transforms
import matplotlib.pyplot as plt

from typing import List, Tuple

from PIL import Image
import cv2
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Predict on a target image with a target model
# Function created in: https://www.learnpytorch.io/06_pytorch_transfer_learning/#6-make-predictions-on-images-from-the-test-set
def pred_and_plot_image(
    model: torch.nn.Module,
    class_names: List[str],
    image_path: str,
    image_size: Tuple[int, int] = (224, 224),
    transform: torchvision.transforms = None,
    device: torch.device = device,
):
    """Predicts the class of a target image with a target model.

    Despite the name, this version does not plot anything; it only returns the
    predicted class name.

    Args:
        model (torch.nn.Module): A trained (or untrained) PyTorch model to predict on an image.
        class_names (List[str]): A list of target classes to map predictions to.
        image_path (str): Filepath to target image to predict on.
        image_size (Tuple[int, int], optional): Size to transform target image to. Defaults to (224, 224).
        transform (torchvision.transforms, optional): Transform to perform on image. Defaults to None,
            which resizes to image_size, converts to a tensor and normalizes with the
            mean/std values hard-coded below.
        device (torch.device, optional): Target device to perform prediction on. Defaults to device.

    Returns:
        str: The entry of class_names at the index of the highest predicted probability.
    """

    # Open the image inside a context manager so the file handle is released,
    # and force 3-channel RGB so grayscale/RGBA/CMYK files still match the
    # 3-value mean/std of the default Normalize below.
    with Image.open(image_path) as opened_image:
        img = opened_image.convert("RGB")

    # Create transformation for image (if one doesn't exist)
    if transform is not None:
        image_transform = transform
    else:
        image_transform = transforms.Compose(
            [
                transforms.Resize(image_size),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.556, 0.447, 0.335], std=[0.231, 0.242, 0.238]
                ),
            ]
        )

    ### Predict on image ###

    # Make sure the model is on the target device
    model.to(device)

    # Turn on model evaluation mode and inference mode
    model.eval()
    with torch.inference_mode():
        # Transform and add a leading batch dimension (model requires samples in [batch_size, color_channels, height, width])
        transformed_image = image_transform(img).unsqueeze(dim=0)

        # Make a prediction on the single-image batch on the target device
        target_image_pred = model(transformed_image.to(device))

    # Convert logits -> prediction probabilities (using torch.softmax() for multi-class classification)
    target_image_pred_probs = torch.softmax(target_image_pred, dim=1)

    # Convert prediction probabilities -> a plain Python int so the list is
    # indexed with an int rather than a tensor
    predicted_index = int(torch.argmax(target_image_pred_probs, dim=1).item())

    return class_names[predicted_index]


In [None]:
import requests


custom_image_path = "E:/Deep learning/Project/OTHER_DATA/2/archive/food-101/food-101/sub_v2/subset_test_v2/ice_cream/"

# Per-class tallies of what the model predicted for the images in this folder
garlic_bread = hot_dog = ice_cream = omelette = pizza = 0

paths=os.listdir(custom_image_path)
for i in paths:
    img_path= os.path.join(custom_image_path,i)

    # Predict on custom image
    classname = pred_and_plot_image(model=vit,
                        image_path=img_path,
                        class_names=class_names)

    if classname=='garlic_bread':
        garlic_bread += 1
    elif classname=='hot_dog':
        hot_dog += 1
    elif classname=='ice_cream':
        ice_cream += 1
    elif classname=='omelette':
        omelette += 1
    else:
        # Any other returned name is counted as pizza
        pizza += 1

# The folder evaluated above is ice_cream, so label the ground-truth line
# accordingly (it previously said "Pizza").
print("Total ice_cream GT=",40)
print("garlic_bread=",garlic_bread)
print("hot_dog=",hot_dog)
print("ice_cream=",ice_cream)
print("omelette=",omelette)
print("pizza=",pizza)
    
    
    

In [None]:
import requests


custom_image_path = "E:/Deep learning/Project/OTHER_DATA/2/archive/food-101/food-101/sub_v2/subset_test_v2/garlic_bread/"

# Per-class tallies of what the model predicted for the images in this folder
garlic_bread = hot_dog = ice_cream = omelette = pizza = 0

paths = os.listdir(custom_image_path)
for i in paths:
    img_path = os.path.join(custom_image_path, i)

    # Predicted class name for this image
    classname = pred_and_plot_image(model=vit, image_path=img_path, class_names=class_names)

    if classname == 'garlic_bread':
        garlic_bread += 1
    elif classname == 'hot_dog':
        hot_dog += 1
    elif classname == 'ice_cream':
        ice_cream += 1
    elif classname == 'omelette':
        omelette += 1
    else:
        # Any other returned name is counted as pizza
        pizza += 1

# Summary: hard-coded ground-truth count followed by the per-class prediction tallies
print("Total garlic_bread GT=", 40)
print("garlic_bread=", garlic_bread)
print("hot_dog=", hot_dog)
print("ice_cream=", ice_cream)
print("omelette=", omelette)
print("pizza=", pizza)
    
    
    

In [None]:
import requests


custom_image_path = "E:/Deep learning/Project/OTHER_DATA/2/archive/food-101/food-101/sub_v2/subset_test_v2/omelette/"

# Per-class tallies of what the model predicted for the images in this folder
garlic_bread = hot_dog = ice_cream = omelette = pizza = 0

paths = os.listdir(custom_image_path)
for i in paths:
    img_path = os.path.join(custom_image_path, i)

    # Predicted class name for this image
    classname = pred_and_plot_image(model=vit, image_path=img_path, class_names=class_names)

    if classname == 'garlic_bread':
        garlic_bread += 1
    elif classname == 'hot_dog':
        hot_dog += 1
    elif classname == 'ice_cream':
        ice_cream += 1
    elif classname == 'omelette':
        omelette += 1
    else:
        # Any other returned name is counted as pizza
        pizza += 1

# Summary: hard-coded ground-truth count followed by the per-class prediction tallies
print("Total omelette GT=", 40)
print("garlic_bread=", garlic_bread)
print("hot_dog=", hot_dog)
print("ice_cream=", ice_cream)
print("omelette=", omelette)
print("pizza=", pizza)
    
    
    

In [None]:
import requests


custom_image_path = "E:/Deep learning/Project/OTHER_DATA/2/archive/food-101/food-101/sub_v2/subset_test_v2/hot_dog/"

# Per-class tallies of what the model predicted for the images in this folder
garlic_bread = hot_dog = ice_cream = omelette = pizza = 0

paths = os.listdir(custom_image_path)
for i in paths:
    img_path = os.path.join(custom_image_path, i)

    # Predicted class name for this image
    classname = pred_and_plot_image(model=vit, image_path=img_path, class_names=class_names)

    if classname == 'garlic_bread':
        garlic_bread += 1
    elif classname == 'hot_dog':
        hot_dog += 1
    elif classname == 'ice_cream':
        ice_cream += 1
    elif classname == 'omelette':
        omelette += 1
    else:
        # Any other returned name is counted as pizza
        pizza += 1

# Summary: hard-coded ground-truth count followed by the per-class prediction tallies
print("Total hot_dog GT=", 40)
print("garlic_bread=", garlic_bread)
print("hot_dog=", hot_dog)
print("ice_cream=", ice_cream)
print("omelette=", omelette)
print("pizza=", pizza)
    
    
    

## Metrics calculations

In [20]:
import torch
import torchvision
from torchvision import transforms
import matplotlib.pyplot as plt

from typing import List, Tuple

from PIL import Image
import cv2
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"


def pred_and_plot_image(
    model: torch.nn.Module,
    class_names: List[str],
    image_path: str,
    GT:str,
    image_size: Tuple[int, int] = (224, 224),
    transform: torchvision.transforms = None,
    device: torch.device = device,
):
    """Predicts on a target image, plots it with the result, and returns the predicted label.

    Args:
        model (torch.nn.Module): Model used for the prediction.
        class_names (List[str]): Class names; the predicted index is looked up here for the plot title.
        image_path (str): Filepath of the image to predict on.
        GT (str): Ground-truth label, shown in the plot title only.
        image_size (Tuple[int, int], optional): Size used by the default transform. Defaults to (224, 224).
        transform (torchvision.transforms, optional): Transform to apply to the image. Defaults to None,
            which resizes to image_size, converts to a tensor and normalizes with the
            mean/std values hard-coded below.
        device (torch.device, optional): Device to run the prediction on. Defaults to device.

    Returns:
        torch.Tensor: The argmax over dim 1 of the predicted probabilities (one
        entry for the single-image batch); callers read it with ``.item()``.
    """

    # Open the image inside a context manager so the file handle is released,
    # and force 3-channel RGB so grayscale/RGBA/CMYK files still match the
    # 3-value mean/std of the default Normalize below.
    with Image.open(image_path) as opened_image:
        img = opened_image.convert("RGB")

    if transform is not None:
        image_transform = transform
    else:
        image_transform = transforms.Compose(
            [
                transforms.Resize(image_size),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.556, 0.447, 0.335], std=[0.231, 0.242, 0.238]
                ),
            ]
        )

    model.to(device)

    model.eval()
    with torch.inference_mode():
        # Transform and add a leading batch dimension (model requires samples in [batch_size, color_channels, height, width])
        transformed_image = image_transform(img).unsqueeze(dim=0)

        # Make a prediction on the single-image batch on the target device
        target_image_pred = model(transformed_image.to(device))

    # Convert logits -> prediction probabilities (using torch.softmax() for multi-class classification)
    target_image_pred_probs = torch.softmax(target_image_pred, dim=1)

    # Convert prediction probabilities -> prediction labels
    target_image_pred_label = torch.argmax(target_image_pred_probs, dim=1)
    # Plain int index for the class-name lookup in the title
    predicted_index = int(target_image_pred_label.item())

    # The per-image debug print of class_names was removed: it repeated the
    # same list for every image and flooded the cell output.
    fig = plt.figure()
    plt.imshow(img)
    plt.title(
        f" GT:{GT} | Pred : {class_names[predicted_index]} | Prob: {target_image_pred_probs.max():.3f} | {target_image_pred_probs}"
    )
    plt.axis(False)
    plt.show()
    # This function is called once per image in a long loop; close the figure
    # so open figures do not accumulate in memory.
    plt.close(fig)

    return target_image_pred_label


In [None]:
import torch
from torchvision import transforms
from PIL import Image
import os
import matplotlib.pyplot as plt

# Predicted class indices, appended by predict() in the order images are evaluated
predictions = []

class_names = ['garlic_bread', 'hot_dog', 'ice_cream', 'omelette', 'pizza']

# Run on the GPU when one is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"


def predict(vit, img_path, class_GT):
    """Predict one image and record the predicted class index in ``predictions``.

    Args:
        vit: Model passed through to pred_and_plot_image.
        img_path: Path of the image to predict on.
        class_GT: Ground-truth class name, forwarded as the ``GT`` argument.
    """
    label_tensor = pred_and_plot_image(
        model=vit,
        image_path=img_path,
        class_names=class_names,
        GT=class_GT,
    )
    # .item() turns the returned label into a plain Python number
    predictions.append(label_tensor.item())
    


class_names = ['garlic_bread', 'hot_dog', 'ice_cream', 'omelette', 'pizza']
# test_data_path = "E:/Deep learning/Project/OTHER_DATA/2/archive/food-101/food-101/sub_v2/subset_test_v2/"
# NOTE(review): despite its name, test_data_path currently points at the
# "subset_train_v2" folder, so the metrics below are computed on that data.
# Confirm this is intended, or switch back to the test path above.
test_data_path = "E:/Deep learning/Project/OTHER_DATA/2/archive/food-101/food-101/sub_v2/subset_train_v2/"

# Ground-truth class indices, appended in the same order as the predictions
ground_truth = []
for class_label, class_name in enumerate(class_names):
    # Each class has its own sub-folder named after the class
    class_folder_path = os.path.join(test_data_path, class_name)
    print("Class name=", class_name)
    for image_name in os.listdir(class_folder_path):
        image_path = os.path.join(class_folder_path, image_name)
        # Record the prediction, then the matching true label
        predict(vit, image_path, class_name)
        ground_truth.append(class_label)





In [None]:
# Show the raw predicted and true class indices collected by the evaluation loop
print("Predictions=", predictions)
print("Ground truth =", ground_truth)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
class_names=['garlic_bread', 'hot_dog', 'ice_cream', 'omelette', 'pizza']
# Pin the label set so every per-class array below has exactly one entry per
# class, in class_names order, even if a class never appears in the data.
class_labels = list(range(len(class_names)))

# Calculate confusion matrix (rows = ground truth, columns = predictions)
conf_matrix = confusion_matrix(ground_truth, predictions, labels=class_labels)

# Class-wise accuracy: for each class, the fraction of its ground-truth samples
# that were predicted correctly (diagonal / row total). This was previously the
# overall accuracy printed under a "class-wise" label. By this definition it
# coincides with per-class recall. clip(min=1) avoids dividing by zero for a
# class with no samples (reported as 0).
samples_per_class = conf_matrix.sum(axis=1)
classwise_accuracy = conf_matrix.diagonal() / samples_per_class.clip(min=1)

# Per-class precision / recall / F1 (average=None returns one value per label)
precision = precision_score(ground_truth, predictions, labels=class_labels, average=None)
recall = recall_score(ground_truth, predictions, labels=class_labels, average=None)
f1 = f1_score(ground_truth, predictions, labels=class_labels, average=None)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClass-wise Accuracy:", classwise_accuracy)
print("Precision for each class:", precision)
print("Recall for each class:", recall)
print("F1 Score for each class:", f1)

# Fraction of all samples predicted correctly
overall_accuracy = accuracy_score(ground_truth, predictions)
print("Overall Accuracy:", overall_accuracy)