In [1]:
import os
import torch
from PIL import Image
import numpy as np
from torchvision import models, transforms
from torch import nn
import onnxruntime as ort

In [None]:
test_loc = '../data/test'
test_samples = [os.path.join(test_loc, i) for i in os.listdir(test_loc)]

In [3]:
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

classes = ['aloo_gobi',
 'aloo_matar',
 'aloo_methi',
 'aloo_paratha',
 'aloo_shimla_mirch',
 'aloo_tikki',
 'amritsari_kulcha',
 'anda_curry',
 'ariselu',
 'balushahi',
 'banana_chips',
 'bandar_laddu',
 'basundi',
 'besan_laddu',
 'bhindi_masala',
 'biryani',
 'boondi',
 'boondi_laddu',
 'butter_chicken',
 'chaas',
 'chak_hao_kheer',
 'cham_cham',
 'chana_masala',
 'chapati',
 'chicken_pizza',
 'chicken_razala',
 'chicken_tikka',
 'chicken_tikka_masala',
 'chicken_wings',
 'chikki',
 'chivda',
 'chole_bhature',
 'daal_baati_churma',
 'daal_puri',
 'dabeli',
 'dal_khichdi',
 'dal_makhani',
 'dal_tadka',
 'dharwad_pedha',
 'dhokla',
 'double_ka_meetha',
 'dum_aloo',
 'falooda',
 'fish_curry',
 'gajar_ka_halwa',
 'garlic_bread',
 'gavvalu',
 'ghevar',
 'grilled_sandwich',
 'gujhia',
 'gulab_jamun',
 'hara_bhara_kabab',
 'idiyappam',
 'idli',
 'imarti',
 'jalebi',
 'kachori',
 'kadai_paneer',
 'kadhi_pakoda',
 'kaju_katli',
 'kakinada_khaja',
 'kalakand',
 'karela_bharta',
 'khakhra',
 'kheer',
 'kofta',
 'kulfi',
 'lassi',
 'ledikeni',
 'litti_chokha',
 'lyangcha',
 'maach_jhol',
 'makki_di_roti_sarson_da_saag',
 'malpua',
 'margherita_pizza',
 'masala_dosa',
 'masala_papad',
 'medu_vada',
 'misal_pav',
 'misi_roti',
 'misti_doi',
 'modak',
 'moong_dal_halwa',
 'murukku',
 'mysore_pak',
 'naan',
 'navratan_korma',
 'neer_dosa',
 'onion_pakoda',
 'palak_paneer',
 'paneer_masala',
 'paneer_pizza',
 'pani_puri',
 'paniyaram',
 'papdi_chaat',
 'patrode',
 'pav_bhaji',
 'pepperoni_pizza',
 'phirni',
 'pithe',
 'poha',
 'pongal',
 'poornalu',
 'pootharekulu',
 'puri_bhaji',
 'qubani_ka_meetha',
 'rabri',
 'rajma_chawal',
 'ras_malai',
 'rasgulla',
 'rava_dosa',
 'sabudana_khichdi',
 'sabudana_vada',
 'samosa',
 'sandesh',
 'seekh_kebab',
 'set_dosa',
 'sev_puri',
 'shankarpali',
 'sheer_korma',
 'sheera',
 'shrikhand',
 'soan_papdi',
 'solkadhi',
 'steamed_momo',
 'sutar_feni',
 'thali',
 'thukpa',
 'unni_appam',
 'uttapam',
 'vada_pav']

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


#### MobileNetV2

In [5]:
pre_process = transforms.Compose([
    transforms.Resize((232, 232)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

class FoodClassifierMobileNet(nn.Module):
    def __init__(self, size_inner=100, droprate=0.2, num_classes=131):
        super(FoodClassifierMobileNet, self).__init__()

        # load pre-trained mobilenet_v2
        self.base_model = models.mobilenet_v2(weights='IMAGENET1K_V2')

        # freeze base model parameters
        for param in self.base_model.parameters():
            param.requires_grad = False

        # remove original classifier
        self.base_model.classifier = nn.Identity()

        # add custom layers
        self.global_avg_pooling = nn.AdaptiveAvgPool2d((1,1))
        # add inner layers
        self.inner = nn.Linear(1280, size_inner)  # New inner layer
        self.relu = nn.ReLU()
        # add dropout
        self.dropout = nn.Dropout(droprate)
        self.output_layer = nn.Linear(size_inner, num_classes)

    def forward(self, x):
        x = self.base_model.features(x)
        x = self.global_avg_pooling(x)
        x = torch.flatten(x, 1)
        x = self.inner(x)
        x = self.relu(x)
        x = self.dropout(x)  # apply dropout
        x = self.output_layer(x)
        return x

mn_model = FoodClassifierMobileNet(size_inner=1000, droprate=0.4)
state_dict = torch.load('./model_checkpoints/food_mobilenet_v12_19_0.623.pth', map_location='cpu')
mn_model.load_state_dict(state_dict)
mn_model.eval() # set model mode

count = 0
for i in test_samples:
    img = Image.open(i)
    x = pre_process(img)
    batch_t = torch.unsqueeze(x, 0).to(device)
    with torch.no_grad():
        output = mn_model(batch_t.to('cpu'))
        output = dict(zip(classes, output[0]))
        output = max(output, key=output.get)
        # print(output, i, output in i)
        count += int(output in i)
test_acc = round(count*100.0/len(test_samples), 2)
print(f'Test Accuracy for MobileNetV2 : {test_acc}%')

Test Accuracy for MobileNetV2 : 37.93%


#### EfficientNet-V2-S

In [6]:
pre_process = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

class FoodClassifierEffNet(nn.Module):
    def __init__(self, size_inner=500, droprate=0.2, num_classes=131):
        super(FoodClassifierEffNet, self).__init__()

        # load pre-trained eff_net
        self.base_model = models.efficientnet_v2_s(weights='IMAGENET1K_V1')

        # freeze base model parameters
        for param in self.base_model.parameters():
            param.requires_grad = False

        # remove original classifier
        self.base_model.classifier = nn.Identity()

        # add custom layers
        self.global_avg_pooling = nn.AdaptiveAvgPool2d((1,1))
        # add inner layers
        self.inner = nn.Linear(1280, size_inner)  # New inner layer
        self.relu = nn.ReLU()
        # add dropout
        self.dropout = nn.Dropout(droprate)
        self.output_layer = nn.Linear(size_inner, num_classes)

    def forward(self, x):
        x = self.base_model.features(x)
        x = self.global_avg_pooling(x)
        x = torch.flatten(x, 1)
        x = self.inner(x)
        x = self.relu(x)
        x = self.dropout(x)  # apply dropout
        x = self.output_layer(x)
        return x

en_model = FoodClassifierEffNet(size_inner=500, droprate=0.3)
state_dict = torch.load('./model_checkpoints/food_effnet_v23_40_0.707.pth', map_location='cpu')
en_model.load_state_dict(state_dict)
en_model.eval()

count = 0
for i in test_samples:
    img = Image.open(i)
    x = pre_process(img)
    batch_t = torch.unsqueeze(x, 0).to(device)
    with torch.no_grad():
        output = en_model(batch_t.to('cpu'))
        output = dict(zip(classes, output[0]))
        output = max(output, key=output.get)
        # print(output, i, output in i)
        count += int(output in i)
test_acc = round(count*100.0/len(test_samples), 2)
print(f'Test Accuracy for EfficientNet-V2-S : {test_acc}%')

Test Accuracy for EfficientNet-V2-S : 55.17%


#### ConvNeXT-S

In [7]:
pre_process = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

class FoodClassifierConvNext(nn.Module):
    def __init__(self, size_inner=100, droprate=0.2, num_classes=131):
        super(FoodClassifierConvNext, self).__init__()

        # load pre-trained ConvNeXT-S
        self.base_model = models.convnext_small(weights='IMAGENET1K_V1')

        # freeze base model parameters
        for param in self.base_model.parameters():
            param.requires_grad = False

        # remove original classifier
        self.base_model.classifier = nn.Identity()

        # add custom layers
        self.global_avg_pooling = nn.AdaptiveAvgPool2d((1,1))
        # add inner layers
        self.inner = nn.Linear(768, size_inner)  # New inner layer
        self.relu = nn.ReLU()
        # add dropout
        self.dropout = nn.Dropout(droprate)
        self.output_layer = nn.Linear(size_inner, num_classes)

    def forward(self, x):
        x = self.base_model.features(x)
        x = self.global_avg_pooling(x)
        x = torch.flatten(x, 1)
        x = self.inner(x)
        x = self.relu(x)
        x = self.dropout(x)  # apply dropout
        x = self.output_layer(x)
        return x

cn_model = FoodClassifierConvNext(size_inner=500, droprate=0.3)
state_dict = torch.load('./model_checkpoints/food_cnext_v33_38_0.838.pth', map_location='cpu')
cn_model.load_state_dict(state_dict)
cn_model.eval()

count = 0
for i in test_samples:
    img = Image.open(i)
    x = pre_process(img)
    batch_t = torch.unsqueeze(x, 0).to(device)
    with torch.no_grad():
        output = cn_model(batch_t.to('cpu'))
        output = dict(zip(classes, output[0]))
        output = max(output, key=output.get)
        # print(output, i, output in i)
        count += int(output in i)
test_acc = round(count*100.0/len(test_samples), 2)
print(f'Test Accuracy for ConvNeXT-S : {test_acc}%')

Test Accuracy for ConvNeXT-S : 68.97%


In [8]:
## onnx file testing
session_cn = ort.InferenceSession('./onnx files/food_classifier_convnexts_v2.onnx')

input_name = session_cn.get_inputs()[0].name
output_name = session_cn.get_outputs()[0].name
print(input_name, output_name)

count = 0
for i in test_samples:
    img = Image.open(i)
    x = np.expand_dims(pre_process(img), axis=0)
    predictions = dict(zip(classes, session_cn.run([output_name], {input_name: x})[0][0]))
    output = max(predictions, key=predictions.get)
    count += int(output in i)
test_acc = round(count*100.0/len(test_samples), 2)
print(f'Test Accuracy for ConvNeXT-S : {test_acc}%')

input output
Test Accuracy for ConvNeXT-S : 65.52%


#### ResNet-152

In [9]:
pre_process = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

class FoodClassifierResNet(nn.Module):
    def __init__(self, num_classes=131, unfreeze_layers=0):
        super(FoodClassifierResNet, self).__init__()

        # load pre-trained ResNet-152
        self.base_model = models.resnet152(weights='IMAGENET1K_V2')

        # Freeze all base model parameters initially
        for param in self.base_model.parameters():
            param.requires_grad = False

        # Unfreeze specified number of 'layer' blocks from the end
        # For ResNet, layer4 is the last convolutional block, then layer3, etc.
        # unfreeze_layers=1 -> unfreeze layer4
        # unfreeze_layers=2 -> unfreeze layer4, layer3
        # unfreeze_layers=3 -> unfreeze layer4, layer3, layer2
        # unfreeze_layers=4 -> unfreeze layer4, layer3, layer2, layer1
        named_layer_blocks = ['layer4', 'layer3', 'layer2', 'layer1']

        for i in range(min(unfreeze_layers, len(named_layer_blocks))):
            layer_name = named_layer_blocks[i]
            if hasattr(self.base_model, layer_name):
                layer = getattr(self.base_model, layer_name)
                for param in layer.parameters():
                    param.requires_grad = True

        # Extract features (everything up to the adaptive average pooling, excluding the original FC layer)
        # The `base_model`'s `avgpool` is included in `features_extractor`.
        self.features_extractor = nn.Sequential(
            self.base_model.conv1,
            self.base_model.bn1,
            self.base_model.relu,
            self.base_model.maxpool,
            self.base_model.layer1,
            self.base_model.layer2,
            self.base_model.layer3,
            self.base_model.layer4,
            self.base_model.avgpool # Include the original avgpool
        )

        # add custom output layer
        # The input features to the linear layer will be from the base_model's final feature map size after avgpool
        self.output_layer = nn.Linear(self.base_model.fc.in_features, num_classes) # ResNet-152 has 2048 features before FC

    def forward(self, x):
        x = self.features_extractor(x)
        x = torch.flatten(x, 1) # Flatten the (batch_size, 2048, 1, 1) to (batch_size, 2048)
        x = self.output_layer(x)
        return x

rn_model = FoodClassifierResNet(unfreeze_layers=2)
state_dict = torch.load('./model_checkpoints/food_resnet_v42_12_0.887.pth', map_location='cpu')
rn_model.load_state_dict(state_dict)
rn_model.eval()

count = 0
for i in test_samples:
    img = Image.open(i)
    x = pre_process(img)
    batch_t = torch.unsqueeze(x, 0).to(device)
    with torch.no_grad():
        output = rn_model(batch_t.to('cpu'))
        output = dict(zip(classes, output[0]))
        # print()
        # print(output)
        output = max(output, key=output.get)
        # print(output, i, output in i)
        count += int(output in i)
test_acc = round(count*100.0/len(test_samples), 2)
print(f'Test Accuracy for ResNet-152 : {test_acc}%')

Test Accuracy for ResNet-152 : 79.31%
