# GastroML

## Mount Colab

Once the account is mounted, check the path to your Drive folder.

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount("/content/drive")
# Change into the project folder on Drive; adjust this path to your own Drive layout.
%cd '/content/drive/Othercomputers/Mi portátil/gastroml/src'

Mounted at /content/drive
/content/drive/Othercomputers/Mi portátil/gastroml/src


In [3]:
# List the current working directory to check that the project files are visible
!ls

 args.py				   my_checkpoint.pth.tar
 build_vocab.py				   __pycache__
 data_loader.py				   runs
 demo.ipynb				   sample.py
'gastroML - ImageCaptioning.ipynb'	   test_greedy_gencaps.pkl
 gastroML-train.ipynb			   train.py
'gastroML-train -Test conv models.ipynb'   utils
 model.py				   utils_imgTocap.py
 modules


In [4]:
# imports

import os
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.tensorboard import SummaryWriter

In [5]:
from utils_imgTocap import save_checkpoint, load_checkpoint, print_examples

In [6]:
!python -m spacy download en &> /dev/null

### Load dataset

1 - Load database and create vocab based on https://www.youtube.com/watch?v=9sHcLvVXsns

2 - Model based on https://www.youtube.com/watch?v=y2BaTt1fxJU

In [48]:
# begin code taken from https://www.youtube.com/watch?v=9sHcLvVXsns
# Load the English spaCy pipeline once at module level; Vocabulary.tokenize
# uses its tokenizer.
spacy_en = spacy.load("en_core_web_sm")
class Vocabulary:
    """Word-level vocabulary mapping tokens to integer ids and back.

    Ids 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and <UNK>.
    Any other token receives the next free id at the moment its running count
    reaches ``freq_threshold`` during ``build_vocab``.
    """

    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        # Inverse table, derived from itos so both stay in sync
        self.stoi = {token: token_id for token_id, token in self.itos.items()}
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Number of ids currently in the vocabulary (special tokens included)."""
        return len(self.itos)

    def tokenize(self, text):
        """Split ``text`` with the spaCy tokenizer and lower-case every token."""
        tokens = spacy_en.tokenizer(text)
        return [token.text.lower() for token in tokens]

    def build_vocab(self, sentence_list):
        """Count tokens over ``sentence_list`` and register the frequent ones.

        Progress is printed every 1000 sentences.
        """
        counts = {}
        next_id = 4

        for position, sentence in enumerate(sentence_list):
            if position % 1000 == 0:
                print("Tokenizing caption %d" % position)
            for token in self.tokenize(sentence):
                counts[token] = counts.get(token, 0) + 1
                # Register the token only once, exactly when it hits the threshold
                if counts[token] != self.freq_threshold:
                    continue
                self.stoi[token] = next_id
                self.itos[next_id] = token
                next_id += 1

    def numericalize(self, text):
        """Convert ``text`` to a list of ids, using the <UNK> id for unknown tokens."""
        unk_id = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_id) for token in self.tokenize(text)]

class FoodDataset(Dataset):
    """Image/caption dataset backed by a CSV file and a folder of JPEG images.

    Each sample pairs an image with its title encoded as vocabulary ids.

    Args:
        root_dir: Base directory that contains the CSV file.
        image_folder: Path fragment appended to ``root_dir`` by plain string
            concatenation to locate the images, e.g. ``"/Food Images/Food Images"``.
        csv_file: CSV file name inside ``root_dir``; it must provide the
            ``image_name`` and ``title`` columns.
        transform: Optional callable applied to the loaded PIL image.
        freq_threshold: Count a token must reach to enter the vocabulary.
    """

    def __init__(self, root_dir, image_folder, csv_file, transform=None, freq_threshold=2):
        self.root_dir = root_dir
        self.image_folder = image_folder
        csv_path = os.path.join(self.root_dir, csv_file)
        print(csv_path)
        frame = pd.read_csv(csv_path)
        # Drop rows whose image name is the placeholder '#NAME?', then renumber
        # rows 0..n-1 so the integer indices handed to __getitem__ match the
        # Series labels. drop=True avoids keeping the old index as a column.
        frame = frame[frame['image_name'] != '#NAME?']
        self.data = frame.reset_index(drop=True)
        self.transform = transform
        # Image file stems and their captions
        self.imgs = self.data['image_name']
        self.captions = self.data['title']
        print("image dataset size",len(self.imgs))
        # Build the vocabulary from every caption in the filtered table
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.captions.tolist())

    def __len__(self):
        """Number of usable rows in the filtered table."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(image, caption_ids)`` for the row at ``index``.

        The image is converted to RGB and passed through ``transform`` when one
        was given. The caption is a 1-D tensor of token ids wrapped in <SOS> and
        <EOS>.
        """
        caption = self.captions[index]
        img_id = self.imgs[index]
        img_path = os.path.join(self.root_dir+self.image_folder, img_id+".jpg")
        # Release the file handle as soon as the pixels are decoded; convert()
        # loads the data and returns an independent image object.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        # Wrap the numericalized caption in start and end of sentence tokens
        numeric_caption = [self.vocab.stoi["<SOS>"]]
        numeric_caption += self.vocab.numericalize(caption)
        numeric_caption.append(self.vocab.stoi["<EOS>"])

        return img, torch.tensor(numeric_caption)

class MyCollate:
    """Collate callable: batches the images and pads captions to a common length.

    Captions are padded with ``pad_idx`` and returned sequence-first, i.e. with
    shape (max_caption_length, batch_size).
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        images = []
        captions = []
        for sample in batch:
            # Add a leading batch axis so the images can be concatenated along it
            images.append(sample[0].unsqueeze(0))
            captions.append(sample[1])

        image_batch = torch.cat(images, dim=0)
        caption_batch = pad_sequence(captions, batch_first=False, padding_value=self.pad_idx)
        return image_batch, caption_batch

def get_loader(root_folder,image_folder, annotation_file, transform, batch_size=32, num_workers=2, shuffle=True, pin_memory=True):
    """Create the FoodDataset and a DataLoader that pads captions per batch.

    Prints the vocabulary size and returns ``(loader, dataset)``.
    """
    dataset = FoodDataset(root_folder, image_folder, annotation_file, transform=transform)
    print("Vocab size: ", len(dataset.vocab))

    # Captions in a batch are padded with the <PAD> id
    collate_fn = MyCollate(pad_idx=dataset.vocab.stoi["<PAD>"])
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=collate_fn,
    )
    return loader, dataset

# Preprocessing used outside training: resize to 224x224 and convert to a tensor
transform = transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor()])

# Build the notebook-level loader and dataset from the Kaggle data folder
dataloader, dataset = get_loader(
    root_folder="../Kaggle data",
    image_folder="/Food Images/Food Images",
    annotation_file="Food Ingredients and Recipe Dataset with Image Name Mapping_CLEANED.csv",
    transform=transform,
)


../Kaggle data/Food Ingredients and Recipe Dataset with Image Name Mapping_CLEANED.csv
image dataset size 13463
Tokenizing caption 0
Tokenizing caption 1000
Tokenizing caption 2000
Tokenizing caption 3000
Tokenizing caption 4000
Tokenizing caption 5000
Tokenizing caption 6000
Tokenizing caption 7000
Tokenizing caption 8000
Tokenizing caption 9000
Tokenizing caption 10000
Tokenizing caption 11000
Tokenizing caption 12000
Tokenizing caption 13000
Vocab size:  2599


In [49]:
# Sanity check: list "<position>-<name>" for every image name that still contains
# "#name" (case-insensitive) in the dataset; an empty list means none remain.
[f"{x}-{y}" for x,y in enumerate(dataset.imgs) if "#name" in y.lower()]

[]

In [50]:
# NOTE(review): presumably meant to collect image-loading errors; nothing in
# this cell appends to it.
imag_errors = []
# Fetch a single batch and print the image and caption tensor shapes
for idx, (imgs, captions) in enumerate(dataloader):
    print(imgs.shape)
    print(captions.shape)
    # One batch is enough for the check
    break

torch.Size([32, 3, 224, 224])
torch.Size([25, 32])


In [51]:
# Display the contents of the imag_errors list
imag_errors

[]

In [52]:
# Look up the token stored under id 40 in the vocabulary's index-to-string table
dataset.vocab.itos[40]

'sauce'

# models

In [53]:
# begin code taken from https://www.youtube.com/watch?v=9sHcLvVXsns
# Encoder: pretrained CNN
class EncoderCNN(nn.Module):
    """ResNet-50 backbone followed by a linear projection to ``embed_size``.

    Args:
        embed_size: Size of the feature vector returned by ``forward``.
        train_CNN: When False (default) the ResNet weights are frozen and only
            the projection head receives gradients; when True the backbone is
            fine-tuned as well.
        dropout: Dropout probability applied to the flattened ResNet features.
    """

    def __init__(self, embed_size, train_CNN=False, dropout=0.5):
        super(EncoderCNN, self).__init__()
        self.train_CNN = train_CNN
        resnet = models.resnet50(pretrained=True)
        # Keep every child module except the last one (the fc classifier)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        # Honour train_CNN: the flag used to be accepted but ignored, so the
        # whole backbone was always trained. Note that freezing parameters does
        # not stop BatchNorm layers from updating running statistics in train mode.
        for param in self.resnet.parameters():
            param.requires_grad_(bool(train_CNN))
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Encode a batch of images into a (batch, embed_size) tensor."""
        features = self.resnet(images)
        # Flatten everything after the batch dimension
        features = features.reshape(features.size(0), -1)
        features = self.dropout(features)
        features = self.bn(self.linear(features))
        return features

# Decoder: RNN
class DecoderRNN(nn.Module):
    """LSTM caption decoder primed with the image feature vector.

    All tensors are sequence-first. ``forward`` concatenates the image features
    and the caption embeddings along dimension 0, and the captions produced by
    the collate function are padded with ``batch_first=False``. The LSTM must
    therefore NOT be built with ``batch_first=True``; doing so made the
    recurrence run across the samples of a batch instead of across time.

    Args:
        embed_size: Size of token embeddings and of the image feature vector.
        hidden_size: LSTM hidden state size.
        vocab_size: Number of token ids (size of the output distribution).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the LSTM outputs.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # Sequence-first LSTM (default batch_first=False). Parameter shapes do
        # not depend on batch_first, so existing checkpoints still load.
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        """Return vocabulary scores of shape (seq_len + 1, batch, vocab_size).

        Args:
            features: Image features of shape (batch, embed_size).
            captions: Token ids of shape (seq_len, batch).
        """
        embeddings = self.embed(captions)
        # Prepend the image features as the first step of every sequence
        embeddings = torch.cat((features.unsqueeze(0), embeddings), 0)
        hiddens, _ = self.lstm(embeddings)
        hiddens = self.dropout(hiddens)
        outputs = self.linear(hiddens)
        return outputs

class CNNtoRNN(nn.Module):
    """Image captioning model: an EncoderCNN feeding a DecoderRNN.

    Args:
        embed_size: Size of the image feature vector and token embeddings.
        hidden_size: LSTM hidden state size.
        vocab_size: Number of token ids.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability passed to both encoder and decoder.
        train_CNN: Passed through to the encoder.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5, train_CNN=False):
        super(CNNtoRNN, self).__init__()
        self.encoderCNN = EncoderCNN(embed_size, train_CNN, dropout)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers, dropout)

    def forward(self, images, captions):
        """Return the decoder's vocabulary scores for the given images and captions."""
        features = self.encoderCNN(images)
        outputs = self.decoderRNN(features, captions)
        return outputs

    def caption_image(self, image, vocabulary, max_length=50):
        """Greedily decode a caption for a single image.

        Args:
            image: Batch containing exactly one image (``.item()`` is called on
                the prediction, so larger batches are not supported).
            vocabulary: Object exposing an ``itos`` id-to-token mapping.
            max_length: Maximum number of tokens to generate.

        Returns:
            List of predicted tokens; decoding stops after "<EOS>" is produced
            (the "<EOS>" token itself is included in the result).
        """
        result_caption = []
        with torch.no_grad():
            # The image features act as the first input step of the LSTM
            x = self.encoderCNN(image).unsqueeze(0)
            states = None
            for _ in range(max_length):
                hiddens, states = self.decoderRNN.lstm(x, states)
                output = self.decoderRNN.linear(hiddens.squeeze(0))
                # Greedy choice: highest-scoring token at this step
                predicted = output.argmax(1)
                token_id = predicted.item()
                result_caption.append(token_id)
                # Feed the prediction back in as the next input
                x = self.decoderRNN.embed(predicted).unsqueeze(0)
                if vocabulary.itos[token_id] == "<EOS>":
                    break
        return [vocabulary.itos[idx] for idx in result_caption]

# training

In [54]:
def train(embed_size = 256, hidden_size = 256, num_layers = 1, learning_rate = 3e-4,num_epochs = 100, load_model = False, save_model = True):
    """Train the CNN-to-RNN captioning model on the Kaggle food dataset.

    Args:
        embed_size: Size of image features and token embeddings.
        hidden_size: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
        learning_rate: Adam learning rate.
        num_epochs: Number of passes over the dataset.
        load_model: Resume model, optimizer and step from "my_checkpoint.pth.tar".
        save_model: Write a checkpoint every 5 epochs (skipping epoch 0).

    The training loss is logged to TensorBoard under "runs/kaggledataset", and
    example captions are printed every 5 epochs.
    """
    # NOTE(review): RandomCrop to 224x224 right after resizing to 224x224 leaves
    # nothing to crop; resize to a larger size first if random cropping is wanted.
    transform = transforms.Compose(
        [
            transforms.Resize((224,224)),
            transforms.RandomCrop((224,224)),
            transforms.ToTensor(),
        ]
    )

    dataloader, dataset = get_loader(
        root_folder="../Kaggle data",
        image_folder="/Food Images/Food Images",
        annotation_file="Food Ingredients and Recipe Dataset with Image Name Mapping_CLEANED.csv",
        transform=transform
        )

    torch.backends.cudnn.benchmark = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vocab_size = len(dataset.vocab)

    # for tensorboard
    writer = SummaryWriter("runs/kaggledataset")
    step = 0

    # initialize model, loss etc
    model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
    # Padding positions do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    if load_model:
        # map_location lets a checkpoint saved on GPU load on a CPU-only runtime
        step = load_checkpoint(torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer)

    try:
        for epoch in range(num_epochs):
            print("Epoch [%d/%d]" % (epoch, num_epochs))
            # print_examples may leave the model in eval mode; make sure every
            # epoch trains with dropout and batch-norm in training mode.
            model.train()
            for idx, (imgs, captions) in enumerate(dataloader):
                if idx % 1000 == 0:
                    print(f"img: {idx}")
                imgs = imgs.to(device)
                captions = captions.to(device)

                # The decoder prepends the image features, so dropping the last
                # caption token keeps outputs and targets the same length.
                outputs = model(imgs, captions[:-1])
                loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1))

                # for tensorboard
                writer.add_scalar("Training loss", loss.item(), global_step=step)
                step += 1

                optimizer.zero_grad()
                # backward() takes no argument for a scalar loss; passing the
                # loss itself multiplied every gradient by the loss value.
                loss.backward()
                optimizer.step()

            if epoch % 5 == 0 and epoch != 0:
                if save_model:
                    checkpoint = {
                        "state_dict": model.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "step": step,
                    }
                    save_checkpoint(checkpoint)
                    # Only report a save when one actually happened
                    print("Saved checkpoint")

                print_examples(model, device, dataset)
    finally:
        # Flush and release the TensorBoard event file even if training fails
        writer.close()

In [55]:
# Hyperparameters used both for the model built below and for the train() call.
# NOTE(review): embed_size/hidden_size differ from train()'s defaults (256); a
# checkpoint saved with other sizes will presumably fail to load into a model
# built with these values — confirm against my_checkpoint.pth.tar.
embed_size = 192
hidden_size = 192
num_layers = 1
learning_rate = 3e-4
num_epochs = 100



In [56]:
def load_checkpoint(checkpoint, model, optimizer):
    """Restore model and optimizer state from a checkpoint dict.

    Reads the 'state_dict' and 'optimizer' entries and returns the stored
    'step' value, or 0 when the checkpoint has none.
    """
    restore_plan = ((model, 'state_dict'), (optimizer, 'optimizer'))
    for target, key in restore_plan:
        target.load_state_dict(checkpoint[key])
    return checkpoint.get('step', 0)

# Build a model with the notebook-level hyperparameters and restore its weights
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(dataset.vocab)
model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# map_location remaps tensors saved on a GPU onto the current device, so the
# checkpoint also loads on a CPU-only runtime instead of raising a RuntimeError.
load_checkpoint(torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer)



RuntimeError: ignored

In [59]:
def _load_test_image(path):
    """Open ``path`` as RGB, apply the notebook ``transform`` and add a batch axis."""
    rgb_image = Image.open(path).convert("RGB")
    return transform(rgb_image).unsqueeze(0)


# Switch to inference mode before captioning
model.eval()
test_img1 = _load_test_image("../data/demo_imgs/1.jpg")
test_img2 = _load_test_image("../Kaggle data/Food Images/Food Images/3-ingredient-caramel-apple-hand-pies.jpg")
test_img10 = _load_test_image("../Kaggle data/Food Images/Food Images/berry-explosion-muffins.jpg")

In [60]:
# Caption one test image (moved to the model's device) and display the predicted tokens
model.caption_image(test_img10.to(device),dataset.vocab)

['santa',
 ' ',
 'crush',
 'thins',
 'fattoush',
 'roulades',
 'y',
 'manhattan',
 'y',
 'greens',
 'garlicky',
 'pomegranate',
 'piment',
 'agave',
 'bagnat',
 'wilted',
 'cob',
 'y',
 'greens',
 'garlicky',
 'pomegranate',
 'bagnat',
 'wilted',
 'cob',
 'y',
 'greens',
 'garlicky',
 'pomegranate',
 'bagnat',
 'wilted',
 'cob',
 'y',
 'greens',
 'garlicky',
 'pomegranate',
 'bagnat',
 'wilted',
 'cob',
 'y',
 'greens',
 'garlicky',
 'pomegranate',
 'piment',
 'agave',
 'bagnat',
 'wilted',
 'cob',
 'y',
 'greens',
 'garlicky']

In [None]:
# Start a fresh training run (no checkpoint resume) with the notebook-level
# hyperparameters; checkpoints are saved along the way.
train(
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    learning_rate=learning_rate,
    num_epochs=num_epochs,
    load_model=False,
    save_model=True,
)

../Kaggle data/Food Ingredients and Recipe Dataset with Image Name Mapping_CLEANED.csv
image dataset size 13463
Tokenizing caption 0
Tokenizing caption 1000
Tokenizing caption 2000
Tokenizing caption 3000
Tokenizing caption 4000
Tokenizing caption 5000
Tokenizing caption 6000
Tokenizing caption 7000
Tokenizing caption 8000
Tokenizing caption 9000
Tokenizing caption 10000
Tokenizing caption 11000
Tokenizing caption 12000
Tokenizing caption 13000
Vocab size:  2599




Epoch [0/100]
img: 0
Epoch [1/100]
img: 0
Epoch [2/100]
img: 0
Epoch [3/100]
img: 0
Epoch [4/100]
img: 0
Epoch [5/100]
img: 0
=> Saving checkpoint
Saved checkpoint
Example 1 CORRECT: Shrimp pasta with bread
Example 1 OUTPUT: <SOS> <SOS> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Example 2 CORRECT: Scrambled eggs, toast and avocado
Example 2 OUTPUT: <SOS> <SOS> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Example 3 CORRECT: Fried eggplant with parmesan cheese
Example 3 OUTPUT: <SOS> <UNK> <UNK> <UNK> <UNK> <

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-61-210e6f15e0f3>", line 1, in <cell line: 1>
    train(
  File "<ipython-input-54-9ae49b605c57>", line 46, in train
    for idx, (imgs, captions) in enumerate(dataloader):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 630, in __next__
    data = self._next_data()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1345, in _next_data
    return self._process_data(data)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1371, in _process_data
    data.reraise()
  File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 694, in reraise
    raise exception
OSError: Caught OSError in DataLoader worker process 1.
Original Traceback (most recent call last):
 