In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import os
import joblib
import tensorflow as tf
import keras
import cv2
from tqdm import tqdm
from PIL import Image
import gc
from math import ceil

from keras.layers import Input
from keras import backend as K 
from keras.preprocessing.image import ImageDataGenerator 
from keras import layers, models, optimizers 
from keras.models import Sequential, Model
from keras.utils.vis_utils import plot_model
from albumentations.augmentations import functional as F
from keras.models import load_model

import os
# Walk the Kaggle input mount and echo the full path of every file found.
input_paths = (
    os.path.join(folder, leaf)
    for folder, _subdirs, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.
# !mkdir '../working/test_data'

In [None]:
# Pixel dimensions used to reshape each flattened parquet row into an image.
HEIGHT, WIDTH = 137, 236

# Competition data folder and the four test parquet shards inside it.
data_dir = '../input/bengaliai-cv19/'
files_test = ['test_image_data_{}.parquet'.format(shard) for shard in range(4)]
files_test

In [None]:
# Load the saved Keras model from the attached dataset and print its layers.
# (Alternative kept for reference: model.load_weights on
#  '../input/bengal-model/baseline_v5_weights.hdf5'.)
MODEL_PATH = '../input/bengal-model/baseline_v5.h5'
model = load_model(MODEL_PATH)
model.summary()

In [None]:
def bbox(img):
    """Return the inclusive bounding box of the truthy entries of a 2-D array.

    The result is ``(rmin, rmax, cmin, cmax)``: the first and last row index
    and the first and last column index that contain at least one truthy value.
    Raises ``IndexError`` when the array has no truthy entry at all.
    """
    row_hits = np.flatnonzero(np.any(img, axis=1))
    col_hits = np.flatnonzero(np.any(img, axis=0))
    return row_hits[0], row_hits[-1], col_hits[0], col_hits[-1]

# Image Prep
def crop_image(img, WIDTH_NEW, HEIGHT_NEW):
    """Crop a 2-D image to its content, pad it to a square and rescale it.

    Parameters
    ----------
    img : 2-D numpy array
        Source image with bright strokes on a dark background (callers in
        this notebook pass the already inverted ``255 - pixels`` array).
        The array is NOT modified.
    WIDTH_NEW, HEIGHT_NEW : int
        Size of the returned image.

    Returns
    -------
    numpy array of shape ``(HEIGHT_NEW, WIDTH_NEW)`` holding the pixel values
    divided by 255. An all-zero image is returned when no pixel exceeds the
    content threshold.
    """
    pad = 16  # extra canvas added on top of the longer crop side

    # Look for content while ignoring a 5-pixel frame around the image.
    content = img[5:-5, 5:-5] > 80
    if not content.any():
        # bbox() raises IndexError on an empty mask; a blank frame should not
        # abort a whole inference run, so hand back a blank image instead.
        return np.zeros((HEIGHT_NEW, WIDTH_NEW))

    ymin, ymax, xmin, xmax = bbox(content)

    # Clamp against the actual image size instead of notebook-level globals.
    src_height, src_width = img.shape[:2]

    # Cropping may cut too much, so widen the box again (clamped to the image).
    # NOTE(review): the box is measured on the trimmed window but applied to
    # the full image; kept as-is because the model was presumably trained with
    # exactly this preprocessing - confirm before changing.
    xmin = xmin - 13 if (xmin > 13) else 0
    ymin = ymin - 10 if (ymin > 10) else 0
    xmax = xmax + 13 if (xmax < src_width - 13) else src_width
    ymax = ymax + 10 if (ymax < src_height - 10) else src_height

    # Copy so the noise clean-up below does not write into the caller's array
    # (a plain slice is a view onto it).
    img = img[ymin:ymax, xmin:xmax].copy()

    # Remove low-intensity pixels as noise.
    img[img < 28] = 0

    lx, ly = xmax - xmin, ymax - ymin
    l = max(lx, ly) + pad
    # Pad to a square canvas so the aspect ratio is kept when rescaling.
    img = np.pad(img, [((l - ly) // 2,), ((l - lx) // 2,)], mode='constant')
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(img, (WIDTH_NEW, HEIGHT_NEW))
    img = img / 255
    return img

def resize(df, size=64):
    """Preprocess every row of ``df`` into a flattened 128x128 image.

    Each row is reshaped to ``(HEIGHT, WIDTH)``, inverted (``255 - pixels``)
    and passed through ``crop_image`` - the same steps that
    ``test_batch_generator`` applies - so both code paths feed the model
    identically prepared inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        One image per row. Assumes the frame holds only the HEIGHT*WIDTH pixel
        columns (e.g. after ``set_index('image_id')``) - TODO confirm at the
        call site.
    size : int
        Ignored; kept only so existing calls keep working. The output side
        length is fixed at 128.

    Returns
    -------
    numpy array of shape ``(len(df), 128 * 128)``, rows in the order of ``df``.
    """
    side = 128
    n_images = df.shape[0]
    resized = np.zeros((n_images, side * side))

    for i in tqdm(range(n_images)):
        # Positional access: label-based lookup misbehaves on duplicate labels.
        pixels = df.iloc[i].values.reshape(HEIGHT, WIDTH).astype(np.uint8)
        resized[i] = crop_image(255 - pixels, side, side).reshape(side * side)

    return resized

In [None]:
def test_batch_generator(df, batch_size):
    """Yield ``(X_batch, names_batch)`` pairs covering ``df`` in order.

    Column 0 of ``df`` supplies the names; the remaining columns are reshaped
    to ``(HEIGHT, WIDTH)`` images, inverted (``255 - pixels``) and run through
    ``crop_image``. ``X_batch`` has shape ``(n, 128, 128, 1)``, where ``n`` is
    ``batch_size`` except possibly for the final, shorter batch.
    """
    total = len(df)

    for lo in tqdm(range(0, total, batch_size)):
        hi = min(total, lo + batch_size)
        count = hi - lo
        rows = np.arange(lo, hi)

        names_batch = df.iloc[rows, 0].values
        raw = df.iloc[rows, 1:].values.reshape(-1, HEIGHT, WIDTH).astype(np.uint8)
        inverted = 255 - raw  # shape (count, 137, 236)

        X_batch = np.zeros((count, 128, 128, 1))
        for slot in range(count):
            cropped = crop_image(inverted[slot], 128, 128)  # shape (128, 128)
            X_batch[slot] = cropped[:, :, np.newaxis]       # shape (128, 128, 1)

        yield X_batch, names_batch

In [None]:
# test_df = pd.read_csv('../input/bengaliai-cv19/test.csv')
# test_df['name'] = test_df['image_id'] +'_' + test_df['component']
# test_df['name'].values

In [None]:
# Read the test row listing and add the "<image_id>_<component>" key column.
test_df = pd.read_csv('../input/bengaliai-cv19/test.csv').assign(
    name=lambda frame: frame['image_id'] + '_' + frame['component']
)

component = ['consonant_diacritic', 'grapheme_root', 'vowel_diacritic']
test_df.head()

In [None]:
# Show the component of the first test.csv row that belongs to image 'Test_1'.
test_df.loc[test_df['image_id'] == 'Test_1', 'component'].iloc[0]

In [None]:
# load the parquet files 
# TEST = [
#     "../input/bengaliai-cv19/test_image_data_0.parquet",
#     "../input/bengaliai-cv19/test_image_data_1.parquet",
#     "../input/bengaliai-cv19/test_image_data_2.parquet",
#     "../input/bengaliai-cv19/test_image_data_3.parquet",
# ]

# Position of each component in the list returned by model.predict, taken
# from the original per-row branching (0 -> grapheme_root,
# 1 -> vowel_diacritic, 2 -> consonant_diacritic).
output_index = {
    'grapheme_root': 0,
    'vowel_diacritic': 1,
    'consonant_diacritic': 2,
}

test_df = pd.read_csv(data_dir + 'test.csv')
test_df['name'] = test_df['image_id'] + '_' + test_df['component']

component = ['consonant_diacritic', 'grapheme_root', 'vowel_diacritic']

# "<image_id>_<component>" -> predicted class index.
# Collecting by name (instead of scanning test_df once per image) keeps the
# loop linear and makes the final alignment independent of file ordering.
predictions = {}

# Iterate over the test shards.
for file_name in files_test:
    test_ = pd.read_parquet(data_dir + file_name)
    test_gen = test_batch_generator(test_, batch_size=64)

    for batch_x, batch_name in test_gen:
        batch_predict0 = model.predict(batch_x, batch_size=64)

        # One argmax per model output per batch, not one per image.
        batch_classes = {
            comp: np.argmax(batch_predict0[position], axis=1)
            for comp, position in output_index.items()
        }

        for idx, name in enumerate(batch_name):
            for comp, classes in batch_classes.items():
                predictions[name + '_' + comp] = int(classes[idx])

    del test_
    gc.collect()

# Align predictions with the rows of test.csv by name.
row_id = test_df['name'].values
predicted = test_df['name'].map(predictions)
if predicted.isna().any():
    missing = test_df.loc[predicted.isna(), 'name'].tolist()
    raise ValueError(
        'No prediction for %d test rows, e.g. %s' % (len(missing), missing[:5])
    )
target = predicted.astype(int).tolist()

In [None]:
# Assemble the two-column submission table and write it to the working dir.
submission_columns = ['row_id', 'target']
df_sample = pd.DataFrame(
    dict(row_id=row_id, target=target),
    columns=submission_columns,
)
df_sample.to_csv('submission.csv', index=False)
# print(type(df_sample['target'].values[0]))
# print(type(df_sample['row_id'].values[0]))

In [None]:
# class TestDataGenerator(keras.utils.Sequence):
#     def __init__(self, X, batch_size = 16, img_size = (128, 128, 1), *args, **kwargs):
#         self.X = X
#         self.indices = np.arange(len(self.X))
#         self.batch_size = batch_size
#         self.img_size = img_size
                    
#     def __len__(self):
#         return int(ceil(len(self.X) / self.batch_size))

#     def __getitem__(self, index):
#         indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]
#         X = self.__data_generation(indices)
#         return X
    
#     def __data_generation(self, indices):
#         X = np.empty((self.batch_size, *self.img_size))
        
#         for i, index in enumerate(indices):
#             image = self.X[index]
#             crop_X = resize_image(image, 128, 128)
# #             plt.imshow(crop_X)
# #             plt.show()
#             crop_X = np.expand_dims(crop_X, axis=2)
#             X[i,] = crop_X
#         return X

In [None]:
# pred_dict = {
#     'grapheme_root': [],
#     'vowel_diacritic': [],
#     'consonant_diacritic': []
# }

# components = ['consonant_diacritic', 'grapheme_root', 'vowel_diacritic']
# target=[] # model predictions placeholder
# row_id=[] # row_id place holder
# for i in range(4):
#     df_test_img = pd.read_parquet('../input/bengaliai-cv19/'+files_test[i])
#     df_test_img.set_index('image_id',inplace=True)
#     df_test_img_index = df_test_img.index
    
#     df_test_img = resize(df_test_img)
# #     resized = pd.DataFrame(index=resized.keys(), data=resized.values())
#     gc.collect()
#     print('reshape')
#     df_test_img = df_test_img.reshape(-1, 128, 128, 1)
#     print('preds start')
#     preds = model.predict(df_test_img, batch_size=16)
#     del df_test_img
#     gc.collect()
    
#     print('gc collect finished')
    
#     for i, p in enumerate(pred_dict):
#         pred_dict[p] = np.argmax(preds[i], axis=1)

#     for k,id in enumerate(df_test_img_index):  
#         for i,comp in enumerate(components):
#             id_sample=id+'_'+comp
#             row_id.append(id_sample)
#             target.append(pred_dict[comp][k])
            


# df_sample = pd.DataFrame(
#     {
#         'row_id': row_id,
#         'target':target
#     },
#     columns = ['row_id','target'] 
# )

# df_sample.to_csv('submission.csv',index=False)

In [None]:
# tgt_cols = ['grapheme_root','vowel_diacritic','consonant_diacritic']
# row_ids, targets = [], []


# df = pd.read_parquet('../input/bengaliai-cv19/'+files_test[0])
# test_files = list(df['image_id'].values)
# df = df.drop(['image_id'], axis=1)
# df = df.values
# data_generator_test = TestDataGenerator(df, batch_size = 1, img_size = (128, 128, 1))

# # Predict with all 3 models
# preds1 = model.predict_generator(data_generator_test, verbose = 1)
# print(preds1[0].shape)
# print(preds1[1].shape)
# print(preds1[2].shape)


# for i, image_id in zip(range(len(test_files)), test_files):
#     print('i', i)
#     print('image_id',image_id)
#     for subi, col in zip(range(len(preds1)), tgt_cols):
# #         print('subi', subi)
# #         print('col', col)
#         sub_preds1 = preds1[subi]
        
#         row_ids.append(str(image_id)+'_'+col)
#         sub_pred_value = np.argmax(sub_preds1[i])
#         targets.append(sub_pred_value)
# del df 
# gc.collect()

In [None]:
# tgt_cols = ['grapheme_root','vowel_diacritic','consonant_diacritic']
# row_ids, targets = [], []

# for i in range(0, 4):
#     df = pd.read_parquet('../input/bengaliai-cv19/'+files_test[i])
#     test_files = list(df['image_id'].values)
#     df = df.drop(['image_id'], axis=1)
#     df = df.values
#     data_generator_test = TestDataGenerator(df, batch_size = 16, img_size = (128, 128, 1))

#     # Predict with all 3 models
#     preds1 = model.predict_generator(data_generator_test, verbose = 1)
# #     print(len(preds1))
#     for i, image_id in zip(range(len(test_files)), test_files):
#         for subi, col in zip(range(len(preds1)), tgt_cols):
#             print('subi', subi)
#             sub_preds1 = preds1[subi]
#             row_ids.append(str(image_id)+'_'+col)
#             print(sub_preds1[i][np.argmax(sub_preds1[i])])
#             sub_pred_value = np.argmax(sub_preds1[i])
#             targets.append(sub_pred_value)
#     del df 
#     gc.collect()

In [None]:
# sample_sub = pd.read_csv('../input/bengaliai-cv19/sample_submission.csv')
# print(type(sample_sub['target'].values[0]))
# print(type(sample_sub['row_id'].values[0]))
# print(sample_sub.info())
# sample_sub.head()

In [None]:
# submit_df = pd.DataFrame({'row_id':row_ids,'target':targets}, columns = ['row_id','target'])
# submit_df.to_csv('submission.csv', index = False)
# print(type(submit_df['target'].values[0]))
# print(type(submit_df['row_id'].values[0]))
# print(submit_df.info())
# submit_df.tail()

In [None]:
# # sort by test
# test_csv = pd.read_csv('../input/bengaliai-cv19/test.csv')
# sorter = list(test_csv['row_id'].values)
# submit_df_sort = pd.DataFrame({'row_id':row_ids,'target':targets}, columns = ['row_id','target'])
# submit_df_sort.row_id = submit_df_sort.row_id.astype('category')
# submit_df_sort.row_id.cat.set_categories(sorter,inplace=True)
# submit_df_sort = submit_df_sort.sort_values(['row_id'])
# submit_df_sort.row_id = submit_df_sort.row_id.astype('str')
# submit_df_sort = submit_df_sort.reset_index(drop=True)
# # submit_df.to_csv('submission.csv', index = False)
# print(type(submit_df['target'].values[0]))
# print(type(submit_df['row_id'].values[0]))
# print(submit_df.info())
# submit_df_sort.head()

In [None]:
# Create and Save Submission File
# submit_df = pd.DataFrame({'row_id':row_ids,'target':targets}, columns = ['row_id','target'])
# submit_df.row_id = submit_df.row_id.astype('category')
# submit_df.row_id.cat.set_categories(sorter,inplace=True)
# submit_df = submit_df.sort_values(['row_id'])
# submit_df.row_id = submit_df.row_id.astype('str')
# submit_df = submit_df.reset_index(drop=True)
# submit_df.to_csv('submission.csv', index = False)
# print(submit_df.head())
# print(type(submit_df['target'].values[0]))
# print(type(submit_df['row_id'].values[0]))
# print(submit_df.info())

In [None]:
# sample_sub = pd.read_csv('../input/bengaliai-cv19/sample_submission.csv')
# print(type(sample_sub['target'].values[0]))
# print(type(sample_sub['row_id'].values[0]))
# print(sample_sub.head())
# print(sample_sub.info())