I had a few minutes today to get into the competition and take a peek at the data.

### Data Loader

I used the torch data-loading utilities to iterate over the images and their corresponding labels. Right now I am **NOT** using `torchvision.transforms` to resize the images in a batch to a standardized size; the resizing is embedded in the dataset's `__getitem__` instead.

Choose the batch you want to visualize; you can also change the batch size.

### Label Representation

Each label is a sequence of characters. I use a `sklearn` `CountVectorizer` to build a vocabulary of the unique characters that make up our labels. I strip the `InChI=1S/` prefix from every label, since it is the same for all instances; the labels only start to differ after that prefix. Each label can then be represented as a binary 2D `torch.Tensor` with shape `(MAX_LABEL_LEN, VOCAB_SIZE)`: if the `c`'th character of the label has index `char_ix` in the vocabulary, then the `char_ix` column of the `c`'th row is set to 1.


[Label Tensor](#intLink)


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import cv2
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

from tqdm import tqdm

# Show up to 500 columns/rows when displaying DataFrames in the notebook.
# Full option names are used: pandas resolves partial names by pattern
# matching, which can become ambiguous if similarly named options are added.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [None]:
# Competition tables: the sample submission (used below as the test-set index)
# and the training labels.
sample_sub = pd.read_csv("../input/bms-molecular-translation/sample_submission.csv")
train_labels = pd.read_csv("../input/bms-molecular-translation/train_labels.csv")

In [None]:
# Drop the first 9 characters of every InChI string, i.e. the constant
# `InChI=1S/` prefix, and keep the remainder as the `molecule` column.
train_labels["molecule"] = train_labels["InChI"].str[9:]

In [None]:
# Number of rows vs. number of distinct molecule strings.
len(train_labels), train_labels["molecule"].nunique()

In [None]:
# Character-level, case-sensitive vectorizer; fitting it on the labels yields
# the character -> index vocabulary in `cvec.vocabulary_`.
cvec = CountVectorizer(lowercase=False, binary=True, analyzer="char")
cvec.fit(train_labels["molecule"])

In [None]:
# Root folders of the test and train images.
TEST_BASE_PATH = "../input/bms-molecular-translation/test"
TRAIN_BASE_PATH = "../input/bms-molecular-translation/train"

# Label tensor dimensions: longest stripped label x number of distinct characters.
MAX_LABEL_LEN = train_labels["molecule"].map(len).max()
VOCAB_SIZE = len(cvec.vocabulary_)
BATCH_SIZE = 64

In [None]:
class MoleculeDataset(Dataset):
    """Dataset yielding resized molecule images and, when available, labels.

    Images are read from ``<base>/<c0>/<c1>/<c2>/<image_id>.png`` where
    ``c0..c2`` are the first three characters of the image id and ``<base>``
    is ``TRAIN_BASE_PATH`` for the ``'train'``/``'val'`` splits and
    ``TEST_BASE_PATH`` otherwise.

    Args:
        df: DataFrame with an ``image_id`` column and, for labelled splits,
            a ``molecule`` column.
        dset: Split name; ``'train'`` and ``'val'`` are treated as labelled.
    """

    # Splits that carry labels and live under TRAIN_BASE_PATH.
    _LABELLED_SPLITS = ('train', 'val')

    def __init__(self, df, dset='train'):
        super().__init__()
        self.df = df
        self.dset = dset

    def __getitem__(self, index):
        """Return the sample at ``index``.

        Returns:
            For labelled splits, a tuple ``(image, label, label_tensor)`` where
            ``image`` is the array from ``cv2.imread`` resized to 288x288,
            ``label`` is the molecule string and ``label_tensor`` is a
            ``(MAX_LABEL_LEN, VOCAB_SIZE)`` one-hot tensor (rows past the end
            of the label stay all-zero). For other splits, only ``image``.

        Raises:
            FileNotFoundError: If the image is missing or cannot be decoded.
            KeyError: If the label contains a character that is not in the
                fitted vocabulary.
        """
        imname = self.df.image_id.iloc[index]
        labelled = self.dset in self._LABELLED_SPLITS
        basepath = TRAIN_BASE_PATH if labelled else TEST_BASE_PATH

        impath = f"{basepath}/{imname[0]}/{imname[1]}/{imname[2]}/{imname}.png"

        # cv2.imread signals failure by returning None instead of raising.
        image = cv2.imread(impath)
        if image is None:
            raise FileNotFoundError(f"Image is missing or unreadable: {impath}")
        image = cv2.resize(image, (288, 288))

        if not labelled:
            return image

        label = self.df["molecule"].iloc[index]

        label_tensor = torch.zeros((MAX_LABEL_LEN, VOCAB_SIZE))
        for char_ix, char in enumerate(label):
            # Index the vocabulary directly: a `.get()` miss would return None,
            # and indexing the tensor with None silently fills the whole row.
            label_tensor[char_ix, cvec.vocabulary_[char]] = 1
        return image, label, label_tensor

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return self.df.shape[0]

In [None]:
# Labelled training dataset and unlabelled test dataset (indexed by the
# sample submission), plus an in-order loader over the training data.
mol_test = MoleculeDataset(df=sample_sub, dset="test")
mol_train = MoleculeDataset(df=train_labels)

trainloader = DataLoader(
    dataset=mol_train,
    batch_size=BATCH_SIZE,
    num_workers=4,
    shuffle=False,
)

In [None]:
# Zero-based index (in loader order) of the batch to pull out and plot.
VISUALIZE_BATCH = 10

In [None]:
# Step through the loader until the batch with index VISUALIZE_BATCH is reached.
# After the `break`, `batch` still holds that batch and is plotted further down.
# If the loader yields fewer batches, the loop simply ends and `batch` is the
# last batch instead.
for b_ix, batch in enumerate(trainloader):
    if b_ix == VISUALIZE_BATCH:
        break

In [None]:
# Plot every image of the selected batch in its own row, titled with its label.
label_tensors = []
n_images = len(batch[0])
# squeeze=False keeps `ax` an array even when the batch holds a single image
# (otherwise `ax[i]` fails for a batch size of 1); sizing the grid by the
# actual batch length avoids empty axes for a short final batch.
fig, ax = plt.subplots(nrows=n_images, figsize=(60, 30), squeeze=False)
ax = ax.ravel()
# NOTE: `label_tensor` stays bound to the last sample after the loop and is
# displayed in the "A Label Tensor" section.
for i, (img, label, label_tensor) in enumerate(zip(batch[0], batch[1], batch[2])):
    ax[i].imshow(img)
    ax[i].set_title(label)
    label_tensors.append(label_tensor)


### Train Label Stats

In [None]:
# Distribution of label lengths, measured on the full InChI string
# (i.e. including the constant prefix, not the stripped `molecule` column).
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# consider `sns.histplot(..., kde=True)` once the installed version is confirmed.
label_lengths = train_labels["InChI"].map(len)
fig, ax = plt.subplots(figsize=(12, 6))
sns.distplot(label_lengths, ax=ax, axlabel='Label Length')

### A Label Tensor

<div id="intLink">
</div>

In [None]:
# Show the label tensor as a table: one row per character position and one
# column per vocabulary character, with columns ordered by vocabulary index.
pd.DataFrame(
    label_tensor.numpy(),
    columns=sorted(cvec.vocabulary_, key=cvec.vocabulary_.get),
)

### Image Stats

In [None]:
class ImageDataset(Dataset):
    """Dataset yielding per-channel pixel statistics of each image.

    Images are read from ``<base>/<c0>/<c1>/<c2>/<image_id>.png`` where
    ``c0..c2`` are the first three characters of the image id and ``<base>``
    is ``TRAIN_BASE_PATH`` for the ``'train'``/``'val'`` splits and
    ``TEST_BASE_PATH`` otherwise.

    Args:
        df: DataFrame with an ``image_id`` column.
        dset: Split name; ``'train'`` and ``'val'`` read from the train folder.
    """

    def __init__(self, df, dset='train'):
        super().__init__()
        self.df = df
        self.dset = dset

    def __getitem__(self, index):
        """Return ``(stds, means)`` for the image at ``index``.

        Both are length-3 arrays (one entry per channel of the array returned
        by ``cv2.imread``) divided by 255. Note the order: standard deviations
        first, means second.

        Raises:
            FileNotFoundError: If the image is missing or cannot be decoded.
        """
        imname = self.df.image_id.iloc[index]
        in_train_dir = self.dset in ('train', 'val')
        basepath = TRAIN_BASE_PATH if in_train_dir else TEST_BASE_PATH

        impath = f"{basepath}/{imname[0]}/{imname[1]}/{imname[2]}/{imname}.png"

        # cv2.imread signals failure by returning None instead of raising.
        image = cv2.imread(impath)
        if image is None:
            raise FileNotFoundError(f"Image is missing or unreadable: {impath}")

        channel_stds = np.array([image[:, :, c].std() for c in range(3)]) / 255
        channel_means = np.array([image[:, :, c].mean() for c in range(3)]) / 255
        return channel_stds, channel_means

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return self.df.shape[0]

In [None]:
# Rebind `mol_train` / `trainloader` to the statistics dataset: each item is
# now a (stds, means) pair instead of an image with its label.
mol_train = ImageDataset(df=train_labels)
trainloader = DataLoader(
    dataset=mol_train,
    batch_size=64,
    num_workers=4,
    shuffle=False,
)

In [None]:
# Average the per-image channel means / stds over the whole training set.
# Per-image rows are buffered and collapsed into one chunk average every
# FLUSH_EVERY batches, so the full (n_images, 3) table is never held in memory.
FLUSH_EVERY = 100

batch_means = []   # per-image channel means of the current chunk
batch_stds = []    # per-image channel stds of the current chunk

means = []         # one averaged row per flushed chunk
stds = []
chunk_sizes = []   # number of images behind each chunk row (used as weights)

n_batches = len(trainloader)

# Each batch is (stds, means): the dataset returns the stds first.
for ix, (img_stds, img_means) in tqdm(enumerate(trainloader), total=n_batches):
    batch_means.append(img_means.numpy())
    batch_stds.append(img_stds.numpy())

    # Flush after every FLUSH_EVERY batches and on the final batch, so that
    # no trailing images are left out of the result.
    if (ix + 1) % FLUSH_EVERY == 0 or ix + 1 == n_batches:
        chunk_means = np.vstack(batch_means)
        chunk_stds = np.vstack(batch_stds)

        means.append(chunk_means.mean(axis=0))
        stds.append(chunk_stds.mean(axis=0))
        chunk_sizes.append(chunk_means.shape[0])

        batch_means = []
        batch_stds = []

# Weight each chunk by its image count so a short final chunk is not over-weighted.
print(np.average(np.vstack(means), axis=0, weights=chunk_sizes))
print(np.average(np.vstack(stds), axis=0, weights=chunk_sizes))