In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path
import cv2
from tqdm import tqdm
from PIL import Image

In [2]:
datadir_parquet = Path('/home/sayan/Documents/Bengali_Grapheme/data/image_parquets/')
#datadir_feather = Path('/home/sayan/Documents/Bengali_Grapheme/data/image_feathers/')

IMG_HEIGHT = 137;
IMG_WIDTH = 236;

In [3]:
def resize_threshold(image,resize_size=64):
    
    #image_bw = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    ret, image = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(image,cv2.RETR_TREE,cv2.CHAIN_APPROX_NONE)[-2:] # finding the dark countour
    
    idx = 0 
    ls_xmin = []
    ls_ymin = []
    ls_xmax = []
    ls_ymax = []
    for cnt in contours:
        idx += 1
        x,y,w,h = cv2.boundingRect(cnt)
        ls_xmin.append(x)
        ls_ymin.append(y)
        ls_xmax.append(x + w)
        ls_ymax.append(y + h)
    xmin = min(ls_xmin)
    ymin = min(ls_ymin)
    xmax = max(ls_xmax)
    ymax = max(ls_ymax)
    
    roi = image[ymin:ymax,xmin:xmax]
    resized_roi = cv2.resize(roi, (resize_size, resize_size),interpolation=cv2.INTER_AREA)
    
    return resized_roi

In [4]:
def combine_image_labels(train_df,image_df):
    
    reduced_df = pd.DataFrame()
    cols = np.array(range(0,4096),dtype=str)
    
    for i in tqdm(range(50210)):
        
        img_name = image_df.iloc[i][0]
        
        img_OG = np.array(image_df.iloc[i][1:].values.reshape(137,236,1))
        img_resized = resize_threshold(img_OG.astype(np.uint8),resize_size=64)
        labels = train_df[(train_df['image_id'] == img_name)][['grapheme_root','vowel_diacritic','consonant_diacritic']].values
        img_pd = pd.DataFrame(data=img_resized.reshape(1,4096), columns=cols)
        label_pd = pd.DataFrame(data=labels, columns=['grapheme_root','vowel_diacritic','consonant_diacritic'])
        
        name = pd.DataFrame(data=[img_name], columns=['image_id'])
        new_row = pd.concat([name, img_pd, label_pd],axis=1,sort=False)
        reduced_df = reduced_df.append(new_row, ignore_index = True) 
    
    return reduced_df

In [5]:
class_map = pd.read_csv('/home/sayan/Documents/Bengali_Grapheme/data/class_map.csv')

In [6]:
train_df = pd.read_csv('/home/sayan/Documents/Bengali_Grapheme/data/train.csv')

In [7]:
train_image_0 = pd.read_parquet(datadir_parquet/'train_image_data_0.parquet',engine='pyarrow')

In [8]:
combined_train_df_0 = combine_image_labels(train_df,train_image_0)

100%|██████████| 50210/50210 [59:19<00:00, 14.10it/s]  


In [9]:
combined_train_df_0.to_csv('/home/sayan/Documents/Bengali_Grapheme/data/combined_train_0.csv',index=False)

In [10]:
combined_train_df_0.head()

Unnamed: 0,image_id,0,1,2,3,4,5,6,7,8,...,4089,4090,4091,4092,4093,4094,4095,grapheme_root,vowel_diacritic,consonant_diacritic
0,Train_0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,15,9,5
1,Train_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,159,0,0
2,Train_2,0,0,0,0,0,0,0,0,0,...,60,210,255,255,255,102,4,22,3,5
3,Train_3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,53,2,2
4,Train_4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,71,9,5


In [20]:
train_image_1 = pd.read_parquet(datadir_parquet/'train_image_data_1.parquet',engine='pyarrow')

In [21]:
combined_train_df_1 = combine_image_labels(train_df,train_image_1)

100%|██████████| 50210/50210 [58:15<00:00, 14.36it/s]  


In [22]:
combined_train_df_1.to_csv('/home/sayan/Documents/Bengali_Grapheme/data/combined_train_1.csv',index=False)

In [23]:
train_image_2 = pd.read_parquet(datadir_parquet/'train_image_data_2.parquet',engine='pyarrow')

In [24]:
combined_train_df_2 = combine_image_labels(train_df,train_image_2)

100%|██████████| 50210/50210 [58:53<00:00, 14.21it/s]  


In [25]:
combined_train_df_2.to_csv('/home/sayan/Documents/Bengali_Grapheme/data/combined_train_2.csv',index=False)

In [26]:
train_image_3 = pd.read_parquet(datadir_parquet/'train_image_data_3.parquet',engine='pyarrow')

In [27]:
combined_train_df_3 = combine_image_labels(train_df,train_image_3)

100%|██████████| 50210/50210 [57:34<00:00, 14.53it/s]  


In [28]:
combined_train_df_3.to_csv('/home/sayan/Documents/Bengali_Grapheme/data/combined_train_3.csv',index=False)