In [1]:
from src.baseline.baseline_model import MultiModalModel
from src.model.model import FoodItemTagModel
from src.baseline.dataset import MultiModalDataset as baseline_dataset
from src.model.dataset import MultiModalDataset as model_dataset

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, transforms

In [3]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.feature_extraction.text import HashingVectorizer
from tqdm.notebook import tqdm
import numpy as np

### Baseline validation

In [4]:
base_path = "C:\\Users\\Mercedez\\Downloads\\santhosh\\food_item_tag"
image_model_path = "src\\baseline\\alexnet_model.pth"
model_path = "src\\baseline\\multimodal_model.pth"

image_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

img_model = models.alexnet(pretrained=True)
img_model.classifier[6] = nn.Linear(4096,48)
img_model.load_state_dict(torch.load(image_model_path))
img_model.eval()
model = MultiModalModel(48, 61, 48)
model.load_state_dict(torch.load(model_path))
model.eval()

hashing = HashingVectorizer(n_features=20)
df = pd.read_csv(f"{base_path}\\data\\training_data.csv")
hashing.fit(df['name'])

validation_dataset = baseline_dataset(f"{base_path}\\data\\validation_data.csv", f"{base_path}\\imgs", image_transforms["val"], hashing)

validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=4,
                                             shuffle=True, num_workers=0)

labels = []
output = []
for image, text, label in tqdm(validation_dataloader):    
    with torch.set_grad_enabled(False):
        image_features = img_model(image)
        out = model(image_features, text)
        labels.extend(label.tolist())
        out = torch.sigmoid(out)
        output.extend(out.tolist())

print(output)

  0%|          | 0/496 [00:00<?, ?it/s]

[[0.040131088346242905, 0.014056216925382614, 0.03109816089272499, 0.021634435281157494, 0.01997128687798977, 0.03902537748217583, 0.010341018438339233, 0.013972898945212364, 0.031692054122686386, 0.22983092069625854, 0.02019142173230648, 0.01890445686876774, 0.07394103705883026, 0.0325634591281414, 0.025860127061605453, 0.0157184861600399, 0.013997728936374187, 0.05778899043798447, 0.05402638018131256, 0.7975829839706421, 0.017480460926890373, 0.013769490644335747, 0.017501765862107277, 0.7059758901596069, 0.02353898249566555, 0.019244013354182243, 0.01926119439303875, 0.05897446349263191, 0.01640922762453556, 0.023949649184942245, 0.7124746441841125, 0.025098858401179314, 0.047177739441394806, 0.17465749382972717, 0.5597023963928223, 0.02125253900885582, 0.02730761095881462, 0.02692960388958454, 0.05121694877743721, 0.07800047844648361, 0.010414744727313519, 0.10009951889514923, 0.022993315011262894, 0.027041373774409294, 0.10546589642763138, 0.0107308030128479, 0.04543191194534302, 

In [5]:
output = np.asarray(output)
output[output>=0.5]=1
output[output<0.5]=0
labels = np.asarray(labels)
print("micro F1 score", f1_score(labels,output,average='micro'))
print("macro F1 score", f1_score(labels,output,average='macro'))

micro F1 score 0.6051925820256776
macro F1 score 0.06649770796729813


### Model Evaluation

In [17]:
model_path = "src\\model\\old_model.pth"

model = FoodItemTagModel(512,512,48)
model.load_state_dict(torch.load(model_path))
model.eval()
validation_dataset = model_dataset(f"{base_path}\\data\\validation_data.csv", f"{base_path}\\imgs", image_transforms["val"])
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=24,
                                                 shuffle=True, num_workers=0)#,collate_fn=lambda x: x)

labels = []
output = []
for batch in tqdm(validation_dataloader):  
    image = batch['image']
    price = batch['price']
    label = batch['label']
    text = {}
    text['input_ids'] = batch['input_ids']
    text['attention_mask'] = batch['attention_mask']
    text['token_type_ids'] = batch['token_type_ids']
    with torch.set_grad_enabled(False):
        out = model(image,text,price)
        labels.extend(label.tolist())
        out = torch.sigmoid(out)
        output.extend(out.tolist())

print(output)

Some weights of the model checkpoint at cahya/bert-base-indonesian-522M were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/83 [00:00<?, ?it/s]

[[0.04659756273031235, 0.0038701153825968504, 0.01799198053777218, 0.0046760221011936665, 0.026441212743520737, 0.03225855529308319, 0.004579642787575722, 0.004557294771075249, 0.01714720018208027, 0.18913181126117706, 0.016705675050616264, 0.0020898955408483744, 0.06581209599971771, 0.012774704955518246, 0.012088421732187271, 0.0018970302771776915, 0.0012817552778869867, 0.05264130234718323, 0.03814408555626869, 0.8340470194816589, 0.008974947035312653, 0.007503465283662081, 0.017910784110426903, 0.7067457437515259, 0.01911955140531063, 0.0026759770698845387, 0.0038133999332785606, 0.03909539431333542, 0.006768003571778536, 0.005807207897305489, 0.7256745100021362, 0.007452226709574461, 0.030911056324839592, 0.16045790910720825, 0.5779302716255188, 0.01574454829096794, 0.017350656911730766, 0.008077770471572876, 0.04205957055091858, 0.06562378257513046, 0.002944809850305319, 0.08739365637302399, 0.01732960343360901, 0.00584891252219677, 0.0828871876001358, 0.006109231151640415, 0.0439

In [18]:
output = np.asarray(output)
outs = output.copy()
output[output>=0.5]=1
output[output<0.5]=0
labels = np.asarray(labels)
print("micro F1 score", f1_score(labels,output,average='micro'))
print("macro F1 score", f1_score(labels,output,average='macro'))

micro F1 score 0.6051925820256776
macro F1 score 0.06649770796729813


### Eval based on different Thresholds.

In [16]:
thresh = [0.2,0.3,0.4,0.5,0.6,0.7]
for tr in thresh:
    print("threshold")
    new_out = outs.copy()
    new_out[new_out>=tr]=1
    new_out[new_out<tr]=0
    print("micro F1 score", f1_score(labels,output,average='micro'))
    print("macro F1 score", f1_score(labels,output,average='macro'))
    
    

threshold
micro F1 score 0.6051925820256776
macro F1 score 0.06649770796729813
threshold
micro F1 score 0.6051925820256776
macro F1 score 0.06649770796729813
threshold
micro F1 score 0.6051925820256776
macro F1 score 0.06649770796729813
threshold
micro F1 score 0.6051925820256776
macro F1 score 0.06649770796729813
threshold
micro F1 score 0.6051925820256776
macro F1 score 0.06649770796729813
threshold
micro F1 score 0.6051925820256776
macro F1 score 0.06649770796729813


In [20]:
print(labels[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]


In [22]:
print(output[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


### Model struggles in data with lesser training data even after applying class weights

very low logits for positve cases, needs more improvement on that part