In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import seaborn as sns
import os, cv2
import dill
import tensorflow_datasets.public_api as tfds
import albumentations as A
import tensorflow as tf
import json
from mt_utils import *
import dill

In [None]:
# Get the API key for the dataset
! mkdir -p /root/.kaggle/
! cp ../input/api-token/kaggle.json /root/.kaggle/kaggle.json
! mkdir -p /kaggle/tmp/mt_train
! kaggle datasets init -p /kaggle/tmp/mt_train

In [None]:
%%bash
# Write the dataset metadata consumed by the `kaggle datasets` commands.
# A quoted heredoc keeps the JSON literal, so no quote escaping is needed.
cat > /kaggle/tmp/mt_train/dataset-metadata.json <<'EOF'
{
  "title": "MTCustomVocabImg",
  "id": "tchaye59/MTCUSTOMVOCABIMG",
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ]
}
EOF

In [None]:
# Load the training labels and the sample submission (the latter supplies the test image ids).
df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/bms-molecular-translation/train_labels.csv',
        '../input/bms-molecular-translation/sample_submission.csv',
    )
)

## Get the tokenizer from https://www.kaggle.com/tchaye59/mt-utils

In [None]:
# CstTokenizer comes from the mt_utils utility script (star-imported above).
tokenizer = CstTokenizer()
# Vocabulary size is one more than the number of entries in word_index
# (presumably so that index 0 stays free for padding -- confirm in mt_utils).
N_VOCAB = 1 + len(tokenizer.word_index)
# Display the token -> index mapping.
tokenizer.word_index

In [None]:
%%time
start = '<start>'
end = '<end>'
# Load tokenized labels
labels = dill.load(open('/kaggle/usr/lib/mt_utils/labels.dill','rb'))
count_elements = dill.load(open('/kaggle/usr/lib/mt_utils/count_elements.dill','rb'))

iids = df.image_id.values
max_seq = max([len(l) for l in labels])
max_seq,N_VOCAB

# Dataset

In [None]:
class Dataset:
    """Index-based access to the competition images and, optionally, their labels.

    When ``targets`` is given, items are ``(image, padded_label, count)`` and images
    are read from the ``train`` folder; otherwise items are ``(image, image_id)`` and
    images are read from the ``test`` folder.

    Args:
        iids: Sequence of image ids; the first three characters of an id select the
            nested sub-folders the PNG is stored in.
        targets: Optional sequence of tokenized labels, aligned with ``iids``.
        counts: Optional per-item values returned as the third element of a training
            item, aligned with ``iids``.
        max_seq: Length every label is padded to. The default is bound to the
            module-level ``max_seq`` at class-definition time.
    """

    def __init__(self, iids,targets=None,counts=None,max_seq=max_seq):
        self.iids,self.targets,self.counts = iids,targets,counts
        self.max_seq = max_seq

    def __len__(self):
        """Return the number of image ids."""
        return len(self.iids)

    def __getitem__(self, index):
        """Return the item at ``index`` (see the class docstring for its layout)."""
        iid = self.iids[index]
        if self.targets is None:
            return self.get_image(iid),iid
        label = self.targets[index]
        # Post-pad the label to a fixed length.
        # NOTE(review): the int8 cast can only represent token ids up to 127 --
        # confirm the vocabulary size stays within that range.
        label = pad_sequences([label], maxlen=self.max_seq, padding='post')[0].astype(np.int8)
        return self.get_image(iid),label,self.counts[index]

    def get_image(self, iid):
        """Read the PNG for ``iid`` and return it with a trailing channel axis.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be decoded.
        """
        name = 'train' if self.targets is not None else 'test'
        path = "../input/bms-molecular-translation/"+name+"/{}/{}/{}/{}.png"
        path = path.format(iid[0], iid[1], iid[2], iid)
        image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        # cv2.imread signals failure by returning None instead of raising; fail loudly
        # with the offending path rather than with an opaque TypeError on indexing.
        if image is None:
            raise FileNotFoundError(f"Could not read image for id {iid!r}: {path}")
        # Adds a channel axis; assumes the file decodes to a 2-D (single-channel) array.
        return image[:,:,np.newaxis]

In [None]:
# Training items carry labels and counts; test items only carry the image id.
dataset = Dataset(iids, targets=labels, counts=count_elements)
test_dataset = Dataset(sub_df['image_id'].values)

In [None]:
# Preview one training image; element 0 of a dataset item is the image array.
train_sample = dataset[100]
plt.imshow(train_sample[0])

In [None]:
# Preview one test image; element 0 of a dataset item is the image array.
test_sample = test_dataset[3]
plt.imshow(test_sample[0])

# Convert the datasets to TFRecords

In [None]:
class TrainDataset(tfds.core.GeneratorBasedBuilder):
    """TFDS builder that serializes the module-level ``dataset`` as one ``train`` split."""

    VERSION = tfds.core.Version('0.1.0')

    def _info(self):
        """Declare the example schema: image, fixed-length target and scalar count."""
        feature_spec = {
            "image": tfds.features.Image(shape=(None,None,1)),
            "target": tfds.features.Tensor(shape=(max_seq,),dtype=tf.int8),
            "count": tfds.features.Tensor(dtype=tf.int32,shape=()),
        }
        return tfds.core.DatasetInfo(
            builder=self,
            description=(""),
            features=tfds.features.FeaturesDict(feature_spec),
        )

    def _split_generators(self, dl_manager):
        """Return a single split named ``train``; ``dl_manager`` is not used."""
        train_split = tfds.core.SplitGenerator(name='train', gen_kwargs={})
        return [train_split]

    def _generate_examples(self,**args):
        """Yield ``(index, example)`` for every item of the global ``dataset``."""
        n_examples = len(dataset)
        print(f"Data size: {n_examples}")
        for idx in range(n_examples):
            image,target,count = dataset[idx]
            example = {
                'image':image,
                'target':target,
                'count':count,
            }
            yield idx, example

In [None]:
class TestDataset(tfds.core.GeneratorBasedBuilder):
    """TFDS builder that serializes the module-level ``test_dataset`` as one ``test`` split."""

    VERSION = tfds.core.Version('0.1.0')

    def _info(self):
        """Declare the example schema: image plus its image id as text."""
        feature_spec = {
            "image": tfds.features.Image(shape=(None,None,1),),
            "image_id": tfds.features.Text(),
        }
        return tfds.core.DatasetInfo(
            builder=self,
            description=(""),
            features=tfds.features.FeaturesDict(feature_spec),
        )

    def _split_generators(self, dl_manager):
        """Return a single split named ``test``; ``dl_manager`` is not used."""
        test_split = tfds.core.SplitGenerator(name='test', gen_kwargs={})
        return [test_split]

    def _generate_examples(self,**args):
        """Yield ``(index, example)`` for every item of the global ``test_dataset``."""
        n_examples = len(test_dataset)
        print(f"Data size: {n_examples}")
        for idx in range(n_examples):
            image,image_id = test_dataset[idx]
            example = {
                'image':image,
                'image_id':image_id,
            }
            yield idx, example

# Training dataset

In [None]:
#! cp -rv ../input/mtcustomvocabimg/* /kaggle/tmp/mt_train

In [None]:
%%time
data_dir='/kaggle/tmp/mt_train' 
builder = TrainDataset(data_dir=data_dir)
# The following line creates the train dataset folder containing the tf records files in /kaggle/input
builder.download_and_prepare() 

# Test data

In [None]:
! cp -rv ../input/mtcustomvocabimg/test_dataset/ /kaggle/tmp/mt_train

In [None]:
%%time
data_dir='/kaggle/tmp/mt_train' 
builder = TestDataset(data_dir=data_dir)
# The following line creates the test dataset folder containing the tf records files in /kaggle/input
builder.download_and_prepare() 

## Upload or Update dataset

In [None]:
# Publish the staging folder as a new version of the existing Kaggle dataset,
# with the version message "update".
! kaggle datasets version -p /kaggle/tmp/mt_train -m "update"  --dir-mode tar
# Alternative kept for reference: create the dataset on the very first upload.
#! kaggle datasets create -p /kaggle/tmp/mt_train/ -u --dir-mode tar

In [None]:
# Delete Kaggle API key
! rm -rf /root/.kaggle/kaggle.json

#### Dataset link: https://www.kaggle.com/tchaye59/mtcustomvocabimg
#### Pretraining: https://www.kaggle.com/tchaye59/mt-pretraining
#### Training: https://www.kaggle.com/tchaye59/mt-fast-distributed-training-tpu