Hello All, welcome to my first public kernel :) 

I'm trying out a CNN-LSTM approach by combining @afajohn's insightful [data-loading notebook](https://www.kaggle.com/afajohn/data-loading-starter/comments) with the encoder-decoder model from https://github.com/sauravraghuvanshi/Udacity-Computer-Vision-Nanodegree-Program/blob/master/project_2_image_captioning_project/model.py

The goal is to get a model like the following up and running:

![](https://raw.githubusercontent.com/yunjey/pytorch-tutorial/master/tutorials/03-advanced/image_captioning/png/model.png)

It's not complete yet and I'll keep updating it. I'm excited to get feedback from the knowledgeable public here!

In [None]:
import math
import os
import sys

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torchvision.models as models
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader, sampler
from torchvision import transforms


In [None]:
# Load the two competition CSVs in order: the training labels first, then the
# sample submission file.
train_labels, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/bms-molecular-translation/train_labels.csv",
        "../input/bms-molecular-translation/sample_submission.csv",
    )
)

In [None]:
# Root folders of the train / test images.
TRAIN_BASE_PATH = "../input/bms-molecular-translation/train"
TEST_BASE_PATH = "../input/bms-molecular-translation/test"

BATCH_SIZE = 8

# The prediction target is the InChI string with its first 9 characters
# removed (presumably the constant "InChI=1S/" prefix -- confirm against data).
train_labels['molecule'] = train_labels['InChI'].map(lambda inchi: inchi[9:])

# Character-level, case-sensitive vocabulary fitted on the target strings;
# only cvec.vocabulary_ (char -> index) is used by the rest of the notebook.
cvec = CountVectorizer(analyzer='char', binary=True, lowercase=False)
cvec.fit(train_labels['molecule'])

# Number of distinct characters and length of the longest target string.
VOCAB_SIZE = len(cvec.vocabulary_)
MAX_LABEL_LEN = train_labels['molecule'].map(len).max()

In [None]:
class MoleculeDataset(Dataset):
    """Dataset of molecule images, with one-hot character labels for labelled splits.

    Args:
        df: DataFrame with an ``image_id`` column. For the ``'train'`` and
            ``'val'`` splits it must also provide a ``molecule`` column.
        dset: Split name. ``'train'`` and ``'val'`` read images from
            ``TRAIN_BASE_PATH`` and return labels; any other value reads from
            ``TEST_BASE_PATH`` and returns the image only.
    """

    # Splits whose images live under TRAIN_BASE_PATH and that carry labels.
    _LABELLED_SPLITS = ('train', 'val')

    def __init__(self, df, dset='train'):
        # The original wrote `super(...).__init__` without parentheses, which
        # only looks the method up and never runs it.
        super().__init__()
        self.df = df
        self.dset = dset

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            For labelled splits, a tuple ``(image, label, label_tensor)`` where
            ``image`` is a float32 array resized to 224x224, ``label`` is the
            raw molecule string and ``label_tensor`` is a
            ``(MAX_LABEL_LEN, VOCAB_SIZE)`` one-hot tensor (rows past the end
            of the label stay all-zero). For other splits, just ``image``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
            KeyError: If the label contains a character missing from the
                fitted vocabulary.
        """
        imname = self.df.image_id.iloc[index]
        labelled = self.dset in self._LABELLED_SPLITS
        basepath = TRAIN_BASE_PATH if labelled else TEST_BASE_PATH

        # Images are sharded into nested folders named after the first three
        # characters of the image id.
        impath = f"{basepath}/{imname[0]}/{imname[1]}/{imname[2]}/{imname}.png"

        raw = cv2.imread(impath)
        if raw is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {impath}")
        image = cv2.resize(raw.astype(np.float32), (224, 224))

        if not labelled:
            return image

        label = self.df["molecule"].iloc[index]
        vocab = cvec.vocabulary_

        label_tensor = torch.zeros((MAX_LABEL_LEN, VOCAB_SIZE))
        for char_ix, char in enumerate(label):
            # Index directly (not .get): a None index would be treated as a
            # new axis and silently set the whole row to 1.
            label_tensor[char_ix, vocab[char]] = 1
        return image, label, label_tensor

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return self.df.shape[0]

In [None]:
batch_size = 64            # batch size
# Taken from the fitted vocabulary rather than hard-coded: the decoder's output
# width has to match the last dimension of the dataset's one-hot label tensors.
vocab_size = VOCAB_SIZE
embed_size = 500           # dimensionality of image and word embeddings
hidden_size = 512          # number of features in hidden state of the RNN decoder
num_epochs = 3             # number of training epochs
save_every = 1             # determines frequency of saving model weights
print_every = 100          # determines window for printing average loss

In [None]:
def get_loader(labels,
               mode='train',
               batch_size=1,
               num_workers=0):
    """Build a DataLoader over a MoleculeDataset.

    Args:
        labels: DataFrame handed to ``MoleculeDataset``.
        mode: ``'train'`` or ``'test'``; forwarded to the dataset as its split
            so that test mode reads test images and yields no labels.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes.

    Returns:
        A ``DataLoader`` that shuffles in train mode and keeps the DataFrame
        order in test mode.
    """
    assert mode in ['train', 'test'], "mode must be one of 'train' or 'test'."

    # The original ignored `mode` here, so every loader behaved like 'train'.
    dataset = MoleculeDataset(labels, dset=mode)

    return DataLoader(dataset=dataset,
                      batch_size=batch_size,
                      shuffle=(mode == 'train'),
                      num_workers=num_workers)

In [None]:
class EncoderCNN(nn.Module):
    """Image encoder: a frozen pretrained ResNet-50 trunk followed by a linear projection."""

    def __init__(self, embed_size):
        """Build the encoder.

        Args:
            embed_size: Output size of the final linear projection.
        """
        super().__init__()
        backbone = models.resnet50(pretrained=True)

        # Freeze every backbone weight; only `self.embed` stays trainable.
        for weight in backbone.parameters():
            weight.requires_grad_(False)

        # Keep all child modules except the last one; the projection below is
        # sized from the backbone's `fc.in_features`.
        *trunk, _dropped = backbone.children()
        self.resnet = nn.Sequential(*trunk)
        self.embed = nn.Linear(backbone.fc.in_features, embed_size)

    def forward(self, images):
        """Encode a batch of images into ``(batch, embed_size)`` features."""
        pooled = self.resnet(images)
        flat = pooled.view(pooled.size(0), -1)
        return self.embed(flat)
    

class DecoderRNN(nn.Module):
    """Single-layer LSTM decoder that maps image embeddings to vocabulary logits."""

    def __init__(self, embed_size, hidden_size, vocab_size):
        """Initialize the layers of this model.

        Args:
            embed_size: Size of the vectors fed into the LSTM.
            hidden_size: Number of features in the LSTM hidden state.
            vocab_size: Number of output classes (vocabulary entries).
        """
        super().__init__()

        # Keep track of hidden_size for initialization of hidden state
        self.hidden_size = hidden_size
        # Kept for backward compatibility only. State tensors are allocated on
        # the device of the actual input (see init_hidden), because this value
        # is wrong whenever CUDA exists but the module was never moved to it.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.lstm = nn.LSTM(input_size=embed_size,
                            hidden_size=hidden_size,  # LSTM hidden units
                            num_layers=1,  # number of LSTM layers
                            bias=True,  # use bias weights b_ih and b_hh
                            batch_first=True,  # input & output have batch size as 1st dimension
                            dropout=0,  # not applying dropout
                            bidirectional=False,  # unidirectional LSTM
                            )

        self.linear = nn.Linear(hidden_size, vocab_size)

        # Maps a predicted token id back to an LSTM input vector; `sample`
        # needs it to feed its own predictions back in. Created last so the
        # random initialization of `lstm` and `linear` is unchanged.
        # NOTE: this adds a `word_embeddings.weight` entry to the state dict.
        self.word_embeddings = nn.Embedding(vocab_size, embed_size)

    def init_hidden(self, batch_size, device=None):
        """Return an all-zero ``(h0, c0)`` pair for the LSTM.

        Each tensor has shape ``(num_layers=1, batch_size, hidden_size)``.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device to allocate on. Defaults to the device the model's
                weights currently live on.
        """
        reference = self.linear.weight
        if device is None:
            device = reference.device
        shape = (1, batch_size, self.hidden_size)
        return (torch.zeros(shape, device=device, dtype=reference.dtype),
                torch.zeros(shape, device=device, dtype=reference.dtype))

    def forward(self, features):
        """Run one LSTM step on the image features.

        Args:
            features: Tensor of shape ``(batch_size, embed_size)``.

        Returns:
            Logits of shape ``(batch_size, 1, vocab_size)``: the features are
            fed as a length-1 sequence, so exactly one step is produced.
        """
        # Initialize the hidden state on the same device as the input
        self.batch_size = features.shape[0]
        self.hidden = self.init_hidden(self.batch_size, device=features.device)

        # lstm_out shape : (batch_size, 1, hidden_size)
        lstm_out, self.hidden = self.lstm(features.unsqueeze(1), self.hidden)
        # outputs shape : (batch_size, 1, vocab_size)
        outputs = self.linear(lstm_out)
        return outputs

    ## Greedy search
    def sample(self, inputs, max_len=512, end_index=1):
        """Greedily decode a token sequence for a single pre-processed input.

        Args:
            inputs: Tensor of shape ``(1, 1, embed_size)``. Only the first
                batch element is decoded.
            max_len: Upper bound on the number of generated tokens, so that
                decoding terminates even if the end token is never predicted.
            end_index: Token id that stops decoding (included in the output).
                NOTE(review): the character vocabulary built in this notebook
                has no dedicated end token -- confirm that 1 is the intended
                stop symbol.

        Returns:
            List of predicted token ids (Python ints), at most ``max_len`` long.

        Note:
            ``forward`` never uses ``word_embeddings``, so unless they are
            trained elsewhere the fed-back inputs come from randomly
            initialized embeddings.
        """
        output = []
        hidden = self.init_hidden(inputs.shape[0], device=inputs.device)

        # Inference only: no graph is needed since just the ids are returned.
        with torch.no_grad():
            for _ in range(max_len):
                lstm_out, hidden = self.lstm(inputs, hidden)  # (1, 1, hidden_size)
                logits = self.linear(lstm_out).squeeze(1)  # (1, vocab_size)
                max_indice = logits.argmax(dim=1)  # most likely next token, shape (1,)

                token = int(max_indice[0])
                output.append(token)

                if token == end_index:
                    # Predicted the end token, so there is nothing more to decode
                    break

                # Embed the prediction to form the next LSTM input: (1, 1, embed_size)
                inputs = self.word_embeddings(max_indice).unsqueeze(1)

        return output

In [None]:
# Instantiate both halves of the model from the hyperparameters defined above.
encoder = EncoderCNN(embed_size=embed_size)
decoder = DecoderRNN(embed_size=embed_size,
                     hidden_size=hidden_size,
                     vocab_size=vocab_size)

In [None]:
# Shape smoke test: push a random batch of two 3x288x288 images through the
# encoder, then the decoder, and show the resulting shapes.
dummy_images = torch.rand((2, 3, 288, 288))
x = encoder(dummy_images)
print(x.shape)
decoder(x).shape

In [None]:
# Planned augmentation pipeline (not used yet: the dataset currently resizes
# images itself and returns raw arrays).
# transform_train = transforms.Compose([ 
#     transforms.Resize(256),                          # smaller edge of image resized to 256
#     transforms.RandomCrop(224),                      # get 224x224 crop from random location
#     transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
#     transforms.ToTensor(),                           # convert the PIL Image to a tensor
#     transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
#                          (0.229, 0.224, 0.225))])

# Build data loader.
data_loader = get_loader(labels=train_labels,
                         mode='train',
                         batch_size=batch_size)

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function.
# Cross-entropy over the vocabulary: takes (N, vocab_size) logits and either
# (N,) class-index targets or class probabilities shaped like the logits.
# Index targets equal to -100 (the default ignore_index) are skipped.
criterion = nn.CrossEntropyLoss().to(device)

# Only the decoder and the encoder's projection layer are optimized; the
# encoder's ResNet weights are frozen.
params = list(decoder.parameters()) + list(encoder.embed.parameters())

optimizer = torch.optim.Adam(params=params, lr=0.001)

In [None]:

# Steps per epoch: at most 100, and never more than the loader can provide.
total_step = min(100, len(data_loader))

# Make sure the checkpoint directory exists before the first save.
os.makedirs('./models', exist_ok=True)

for epoch in range(1, num_epochs+1):

    # Iterate the loader once per epoch. zip() stops as soon as the step range
    # is exhausted, so no batch beyond `total_step` is loaded. (The original
    # rebuilt the iterator with next(iter(...)) on every single step.)
    for i_step, (images, _, captions) in zip(range(1, total_step + 1), data_loader):

        # Dataset images are channels-last; the CNN expects channels-first.
        images = images.permute(0, 3, 1, 2)

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = captions.to(device)

        # Zero the gradients.
        decoder.zero_grad()
        encoder.zero_grad()

        # Pass the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs = decoder(features)      # (batch, steps, classes)

        # Turn the one-hot captions into class indices for exactly the steps
        # the decoder produced. All-zero (padding) rows are mapped to -100,
        # the default ignore_index of nn.CrossEntropyLoss.
        # NOTE(review): assumes `criterion` takes (N, C) logits and (N,)
        # class-index targets, e.g. nn.CrossEntropyLoss -- confirm.
        num_steps = outputs.size(1)
        onehot = captions[:, :num_steps, :]
        targets = onehot.argmax(dim=-1)
        targets[onehot.sum(dim=-1) == 0] = -100

        # Calculate the batch loss.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))

        # Backward pass.
        loss.backward()

        # Update the parameters in the optimizer.
        optimizer.step()

        # Print training statistics (on different line). The string is only
        # built when printed, so loss.item() does not sync on every step.
        if i_step % print_every == 0:
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item()))
            print('\r' + stats)

    # Save the weights.
    if epoch % save_every == 0:
        torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' % epoch))
        torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' % epoch))