# PyTorch ResNet to extract image features, then an LSTM with attention to generate text
Feel free to leave any comments or questions.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

        
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import PIL
from typing import Any, Callable, cast, Dict, List, Optional, Tuple
from torchvision.transforms.transforms import Compose, Normalize, Resize, ToTensor, RandomHorizontalFlip, RandomCrop
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import torch.nn.functional as F


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
#import Levenshtein
#import cv2
from PIL import Image
from matplotlib import pyplot as plt
import seaborn as sns
import time
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Build the train and test tables as in
# https://www.kaggle.com/yasufuminakama/molecular-translation-naive-baseline
# `train` is the labelled table (its `image_id` and `InChI` columns are used below);
# `test` is the sample submission (its `image_id` column is used below and its
# `InChI` column is overwritten with predictions later in the notebook).
train = pd.read_csv('../input/bms-molecular-translation/train_labels.csv')
test = pd.read_csv('../input/bms-molecular-translation/sample_submission.csv')

def get_train_file_path(image_id):
    """Return the PNG path of a training image.

    Images are sharded into three nested directories named after the first
    three characters of ``image_id``.
    """
    shard_a, shard_b, shard_c = image_id[0], image_id[1], image_id[2]
    template = "../input/bms-molecular-translation/train/{}/{}/{}/{}.png"
    return template.format(shard_a, shard_b, shard_c, image_id)

def get_test_file_path(image_id):
    """Return the PNG path of a test image.

    Images are sharded into three nested directories named after the first
    three characters of ``image_id``.
    """
    shard_a, shard_b, shard_c = image_id[0], image_id[1], image_id[2]
    template = "../input/bms-molecular-translation/test/{}/{}/{}/{}.png"
    return template.format(shard_a, shard_b, shard_c, image_id)

# Add a `file_path` column holding the PNG location of every image id.
# `progress_apply` is the progress-bar variant of `apply` enabled by `tqdm.pandas()` above.
train['file_path'] = train['image_id'].progress_apply(get_train_file_path)
test['file_path'] = test['image_id'].progress_apply(get_test_file_path)


# Quick look at the resulting tables (`display` is supplied by the notebook environment).
print(f'train.shape: {train.shape}  test.shape: {test.shape}')
display(train.head())
display(test.head())

In [None]:
# Build the character-level vocabulary.

# Every distinct character that occurs in any training InChI string.
words = set()
for inchi in train['InChI']:
    words.update(inchi)

# Characters plus the three special tokens; in this notebook `vocab` is only
# used for its length (the model's vocab_size).
vocab = [*words, '<sos>', '<eos>', '<pad>']

# Fixed token -> index table. It is hard-coded rather than derived from `words`,
# so indices do not depend on set iteration order.
# NOTE(review): presumably this is the mapping the checkpoint was trained with — confirm.
stoi={'C': 0,')': 1,'P': 2,'l': 3,'=': 4,'3': 5,'N': 6,'I': 7,'2': 8,'6': 9,'H': 10,'4': 11,'F': 12,'0': 13,'1': 14,'-': 15,'O': 16,'8': 17,
 ',': 18,'B': 19,'(': 20,'7': 21,'r': 22,'/': 23,'m': 24,'c': 25,'s': 26,'h': 27,'i': 28,'t': 29,'T': 30,'n': 31,'5': 32,'+': 33,'b': 34,'9': 35,
 'D': 36,'S': 37,'<sos>': 38,'<eos>': 39,'<pad>': 40}
# Inverse table: index -> token.
itos = {index: token for token, index in stoi.items()}


def string_to_ints(string):
    """Encode ``string`` character by character as token ids.

    The result starts with the ``<sos>`` id and ends with the ``<eos>`` id.
    Raises ``KeyError`` for any character missing from ``stoi``.
    """
    encoded = [stoi['<sos>']]
    encoded.extend(stoi[ch] for ch in string)
    encoded.append(stoi['<eos>'])
    return encoded
def ints_to_string(l):
    """Decode a sequence of token ids into text via ``itos``.

    Special tokens are not filtered: their literal names appear in the output.
    """
    return ''.join(itos[token_id] for token_id in l)

In [None]:
def pil_loader(path: str) -> Image.Image: #copied from torchvision
    """Read the image file at ``path`` and return it as an RGB PIL image."""
    # Go through an explicit file handle to avoid a ResourceWarning
    # (https://github.com/python-pillow/Pillow/issues/835).
    with open(path, 'rb') as handle:
        # convert() runs while the handle is still open.
        rgb_image = Image.open(handle).convert('RGB')
    return rgb_image
    
def default_loader(path: str) -> Any:
    """Load the image at ``path`` with torchvision's currently selected backend.

    Uses torchvision's accimage loader when the backend is ``'accimage'`` and
    the PIL-based ``pil_loader`` otherwise.
    """
    from torchvision import get_image_backend
    if get_image_backend() == 'accimage':
        # `accimage_loader` was never defined or imported in this file, so this
        # branch used to raise NameError; use torchvision's implementation.
        from torchvision.datasets.folder import accimage_loader
        return accimage_loader(path)
    return pil_loader(path)
    

    
class InputDatasetTest(Dataset):
    """Inference dataset that yields ``(transformed_image, index)`` pairs.

    The index is returned with each sample so the caller can match a
    prediction back to the entry of ``paths`` it came from.
    """

    def __init__(self, paths, transform):
        # paths: indexable collection of image file paths
        # transform: callable applied to every loaded image
        self.transform = transform
        self.loader = default_loader
        self.paths = paths

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Load and transform image number ``idx``; return it with ``idx``."""
        image = self.loader(self.paths[idx])
        return self.transform(image), idx
    


In [None]:
#model adapted from https://www.kaggle.com/mdteach/image-captioning-with-attention-pytorch/data
class Attention(nn.Module):
    """Additive (tanh) attention over a set of encoder feature vectors.

    Scores each feature location against the current decoder hidden state and
    returns both the softmax weights and the weighted sum of the features.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        super().__init__()
        self.attention_dim = attention_dim

        # The attribute names W, U and A become the state_dict keys.
        self.W = nn.Linear(decoder_dim, attention_dim)   # projects the hidden state
        self.U = nn.Linear(encoder_dim, attention_dim)   # projects the encoder features
        self.A = nn.Linear(attention_dim, 1)             # one score per location

    def forward(self, features, hidden_state):
        """Return ``(alpha, context)``.

        Assumes ``features`` is (batch, locations, encoder_dim) and
        ``hidden_state`` is (batch, decoder_dim).
        ``alpha`` is (batch, locations) and sums to 1 over locations;
        ``context`` is the alpha-weighted sum of features, (batch, encoder_dim).
        """
        projected_features = self.U(features)                 # (batch, locations, attention_dim)
        projected_hidden = self.W(hidden_state).unsqueeze(1)  # (batch, 1, attention_dim)

        # The hidden-state projection is broadcast across all locations.
        energies = self.A(torch.tanh(projected_features + projected_hidden))  # (batch, locations, 1)
        alpha = F.softmax(energies.squeeze(2), dim=1)                         # (batch, locations)

        context = (features * alpha.unsqueeze(2)).sum(dim=1)                  # (batch, encoder_dim)
        return alpha, context

class DecoderRNN(nn.Module):
    """LSTM-cell decoder that attends over encoder features at every step.

    ``forward`` runs teacher-forced decoding for training; ``generate_caption``
    runs greedy decoding for inference.
    """

    def __init__(self,embed_size, vocab_size, attention_dim,encoder_dim,decoder_dim,drop_prob=0.3):
        super().__init__()

        # Keep the sizes other methods need.
        self.vocab_size = vocab_size
        self.attention_dim = attention_dim
        self.decoder_dim = decoder_dim

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(encoder_dim, decoder_dim, attention_dim)

        # Initial LSTM state is predicted from the mean encoder feature.
        self.init_h = nn.Linear(encoder_dim, decoder_dim)
        self.init_c = nn.Linear(encoder_dim, decoder_dim)
        # Each step consumes the token embedding concatenated with the context vector.
        self.lstm_cell = nn.LSTMCell(embed_size + encoder_dim, decoder_dim, bias=True)
        # Not used by forward()/generate_caption() in this class. It is kept because
        # removing it would change the module's state_dict keys.
        self.f_beta = nn.Linear(decoder_dim, encoder_dim)

        self.fcn = nn.Linear(decoder_dim, vocab_size)
        self.drop = nn.Dropout(drop_prob)

    def forward(self, features, captions):
        """Teacher-forced decoding.

        Args:
            features: encoder output; indexed here as (batch, locations, encoder_dim).
            captions: integer token ids, (batch, caption_len).

        Returns:
            ``(preds, alphas)``: vocabulary scores (batch, caption_len - 1, vocab_size)
            and attention weights (batch, caption_len - 1, locations).
        """
        embeds = self.embedding(captions)

        # Initialize LSTM state
        h, c = self.init_hidden_state(features)  # (batch_size, decoder_dim)

        # The last caption token is never fed as input, hence the -1.
        batch_size = captions.size(0)
        seq_length = captions.size(1) - 1
        num_features = features.size(1)

        # Allocate on the inputs' device instead of relying on a module-level
        # `device` global, so the module works wherever its inputs live.
        preds = torch.zeros(batch_size, seq_length, self.vocab_size, device=features.device)
        alphas = torch.zeros(batch_size, seq_length, num_features, device=features.device)

        for s in range(seq_length):
            alpha, context = self.attention(features, h)
            # Input at step s is the ground-truth token s plus the attended context.
            lstm_input = torch.cat((embeds[:, s], context), dim=1)
            h, c = self.lstm_cell(lstm_input, (h, c))

            preds[:, s] = self.fcn(self.drop(h))
            alphas[:, s] = alpha

        return preds, alphas

    def generate_caption(self,features,max_len=200,itos=None,stoi=None):
        """Greedy decoding for a batch of encoded images.

        Args:
            features: encoder output; indexed here as (batch, locations, encoder_dim).
            max_len: caption budget; ``max_len + 2`` tokens are generated, so the
                default reproduces the previously hard-coded 202 steps.
            itos: unused; accepted for backward compatibility.
            stoi: token -> index mapping; must contain ``'<sos>'``.

        Returns:
            LongTensor (batch, max_len + 2) of predicted token ids. Column 0 is
            the first *predicted* token (``<sos>`` is not stored). Decoding does
            not stop at ``<eos>``; callers are expected to truncate there.

        Raises:
            ValueError: if ``stoi`` is not provided.

        Dropout is applied through ``self.drop``, so put the module in eval mode
        for deterministic output.
        """
        if stoi is None:
            raise ValueError("generate_caption requires `stoi` with a '<sos>' entry")

        batch_size = features.size(0)
        total_steps = max_len + 2
        out_device = features.device

        # Pure inference: the output is an integer tensor, so there is nothing
        # to differentiate and no reason to record an autograd graph.
        with torch.no_grad():
            h, c = self.init_hidden_state(features)  # (batch_size, decoder_dim)

            # Every sequence starts from <sos>; dtype is explicit because the
            # embedding lookup needs integer indices.
            tokens = torch.full((batch_size,), stoi['<sos>'], dtype=torch.long, device=out_device)
            captions = torch.zeros((batch_size, total_steps), dtype=torch.long, device=out_device)

            for step in range(total_steps):
                _, context = self.attention(features, h)
                lstm_input = torch.cat((self.embedding(tokens), context), dim=1)
                h, c = self.lstm_cell(lstm_input, (h, c))
                output = self.fcn(self.drop(h))  # (batch_size, vocab_size)

                # Greedy choice; it is also the next step's input token.
                tokens = output.argmax(dim=1)
                captions[:, step] = tokens

        return captions

    def init_hidden_state(self, encoder_out):
        """Derive the initial ``(h, c)`` LSTM state from the mean of ``encoder_out`` over dim 1."""
        mean_encoder_out = encoder_out.mean(dim=1)
        h = self.init_h(mean_encoder_out)  # (batch_size, decoder_dim)
        c = self.init_c(mean_encoder_out)
        return h, c
    


class EncoderCNNtrain18(nn.Module):
    """ResNet-18 trunk that turns images into a sequence of location features."""

    def __init__(self):
        super().__init__()
        # Built without pretrained weights; all parameters stay trainable.
        backbone = torchvision.models.resnet18()

        # Drop the last two child modules so the output is still a spatial
        # feature map. The name `resnet` is part of the state_dict keys.
        trunk_layers = list(backbone.children())[:-2]
        self.resnet = nn.Sequential(*trunk_layers)

    def forward(self, images):
        """Return features shaped (batch, height * width, channels).

        The original comments give (batch, 64, 512), i.e. an 8x8 map with 512
        channels — presumably for the 256x256 inputs used in this notebook.
        """
        feature_map = self.resnet(images)                # (batch, channels, height, width)
        channels_last = feature_map.permute(0, 2, 3, 1)  # (batch, height, width, channels)
        # Merge the two spatial axes into a single "location" axis.
        return channels_last.view(channels_last.size(0), -1, channels_last.size(-1))
    
class EncoderDecodertrain18(nn.Module):
    """Image-to-sequence model: ResNet-18 encoder followed by the attention LSTM decoder."""

    def __init__(self,embed_size, vocab_size, attention_dim,encoder_dim,decoder_dim,drop_prob=0.3):
        super().__init__()
        # The attribute names `encoder` and `decoder` are part of the state_dict keys.
        self.encoder = EncoderCNNtrain18()
        self.decoder = DecoderRNN(
            embed_size=embed_size,
            vocab_size=vocab_size,
            attention_dim=attention_dim,
            encoder_dim=encoder_dim,
            decoder_dim=decoder_dim,
            # Previously dropped on the floor, so a caller-supplied value never
            # reached the decoder. The default (0.3) equals the decoder's own,
            # so existing callers see no change.
            drop_prob=drop_prob,
        )

    def forward(self, images, captions):
        """Encode ``images`` and run the decoder on ``captions``.

        Returns whatever ``DecoderRNN.forward`` returns (predictions and
        attention weights).
        """
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs

In [None]:
# Model hyper-parameters. load_state_dict below is strict, so these must
# describe the same architecture the checkpoint was saved from.
embed_size = 200
vocab_size = len(vocab)
attention_dim = 300
encoder_dim = 512
decoder_dim = 300

model = EncoderDecodertrain18(
    embed_size=embed_size,
    vocab_size=vocab_size,
    attention_dim=attention_dim,
    encoder_dim=encoder_dim,
    decoder_dim=decoder_dim,
)

MODEL_PATH = '../input/model18train/modeltrain18_2'
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model = model.to(device)

In [None]:
# Deterministic inference preprocessing: fixed-size bicubic resize, tensor
# conversion, then per-channel normalization (no random augmentation).
transform = Compose([
    #RandomHorizontalFlip(),
    Resize(size=(256, 256), interpolation=PIL.Image.BICUBIC),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# shuffle=False keeps batches in dataset order; each sample also carries its index.
dataset_test = InputDatasetTest(paths=test['file_path'].to_numpy(), transform=transform)
dataloader_test = DataLoader(
    dataset_test,
    batch_size=300,
    shuffle=False,
    num_workers=6,
)

def tensor_to_captions(ten):
    """Decode a 2-D tensor of token ids into a list of strings, one per row.

    Each row is read from its first element and cut at the first ``<eos>`` or
    ``<pad>`` id; the terminator itself is not included.
    """
    stop_ids = (stoi['<eos>'], stoi['<pad>'])
    decoded = []
    for row in ten.tolist():
        pieces = []
        for token_id in row:
            if token_id in stop_ids:
                break
            pieces.append(itos[token_id])
        decoded.append(''.join(pieces))
    return decoded

# Sanity check: decode the first five test images and print the captions to
# confirm the loaded model produces plausible output.
model.eval()
itr = iter(dataloader_test)
img, idx = next(itr)
print(img.shape)
print(img[0:5].shape)
# Inference only: without no_grad an autograd graph would be recorded across
# every decoding step for no benefit.
with torch.no_grad():
    features = model.encoder(img[0:5].to(device))
    caps = model.decoder.generate_caption(features, stoi=stoi, itos=itos)
captions = tensor_to_captions(caps)
# NOTE: `img` is the normalized tensor, so the displayed colours are not the
# original pixel values.
plt.imshow(img[0].numpy().transpose((1, 2, 0)))
print(captions)

In [None]:
# Run the model over the whole test set and store the decoded strings in test['InChI'].
model.eval()
with torch.no_grad():
    for i, batch in enumerate(dataloader_test):
        img, idx = batch[0].to(device), batch[1]
        features = model.encoder(img)
        caps = model.decoder.generate_caption(features, stoi=stoi, itos=itos)
        captions = tensor_to_captions(caps)
        # Assign with a single .loc call on the frame. The previous chained form
        # (test['InChI'].loc[idx] = ...) can write to a temporary object and
        # does not update `test` under pandas Copy-on-Write.
        # `idx` holds the dataset indices of this batch; as before they are used
        # as row labels (assumes `test` still has its default 0..n-1 index).
        test.loc[idx.tolist(), 'InChI'] = captions
        if i % 1000 == 0:
            print(i)

In [None]:
# Write the submission file: only the id and predicted-string columns, no index column.
output_cols = ['image_id', 'InChI']
submission = test[output_cols]
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the columns that were written to the submission file.
test[output_cols].head()