In [1]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import albumentations as A
from pathlib import Path
import os
from tqdm import tqdm

In [2]:
# Directory the test_image_data_*.parquet shards are read from.
datadir_parquet = Path('/home/sayan/Documents/Bengali_Grapheme/data/image_parquets/')

# Height and width of the raw images; these match the (137, 236, 1) reshape
# applied to each row in combine_image_labels.
IMG_HEIGHT, IMG_WIDTH = 137, 236

In [3]:
def resize_threshold(image, resize_size=64):
    """Binarize an image, crop it to its foreground, and resize it to a square.

    The image is inverse-thresholded with Otsu's method, the bounding box
    enclosing every detected contour is cropped out of the binary image, and
    the crop is resized to ``resize_size`` x ``resize_size`` using area
    interpolation.

    Parameters
    ----------
    image : numpy.ndarray
        Single-channel 8-bit image. The caller in this notebook passes a
        ``uint8`` array reshaped to ``(137, 236, 1)``.
    resize_size : int, default 64
        Side length, in pixels, of the square output.

    Returns
    -------
    numpy.ndarray
        The thresholded, cropped and resized image. When thresholding leaves
        no contour at all (no pixel ends up as foreground), an all-zero
        ``(resize_size, resize_size)`` array is returned instead of raising
        ``ValueError`` from ``min()`` on an empty list.
    """
    # With THRESH_OTSU set, OpenCV chooses the threshold itself; the 127
    # passed here is only a placeholder.
    _, binary = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Taking the last two return values keeps this working whether
    # findContours returns (image, contours, hierarchy) or (contours, hierarchy).
    contours, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)[-2:]

    if len(contours) == 0:
        # Nothing to crop to: hand back a blank image of the requested size.
        return np.zeros((resize_size, resize_size), dtype=binary.dtype)

    # Union of all per-contour bounding boxes.
    boxes = [cv2.boundingRect(cnt) for cnt in contours]
    xmin = min(x for x, _y, _w, _h in boxes)
    ymin = min(y for _x, y, _w, _h in boxes)
    xmax = max(x + w for x, _y, w, _h in boxes)
    ymax = max(y + h for _x, y, _w, h in boxes)

    roi = binary[ymin:ymax, xmin:xmax]
    return cv2.resize(roi, (resize_size, resize_size), interpolation=cv2.INTER_AREA)

In [4]:
def combine_image_labels(image_df, resize_size=64, img_height=137, img_width=236):
    """Shrink every image in ``image_df`` and return one flat row per image.

    Each row of ``image_df`` holds an image identifier in its first column
    followed by ``img_height * img_width`` pixel values. Every image is passed
    through ``resize_threshold`` and flattened, producing a frame with an
    ``image_id`` column followed by pixel columns named ``'0'`` ..
    ``str(resize_size**2 - 1)``.

    The output is assembled in a preallocated array and turned into a
    DataFrame once. The previous implementation grew the frame with
    ``DataFrame.append`` inside the loop, which is quadratic in the number of
    rows and no longer exists in pandas 2.x.

    Parameters
    ----------
    image_df : pandas.DataFrame
        First column: image identifier. Remaining columns: raw pixel values.
    resize_size : int, default 64
        Side length of the square image produced by ``resize_threshold``.
    img_height, img_width : int, default 137 and 236
        Dimensions used to reshape each row of pixel values into an image.

    Returns
    -------
    pandas.DataFrame
        ``len(image_df)`` rows; columns ``['image_id', '0', '1', ...]`` with
        ``uint8`` pixel columns. An empty input yields an empty frame that
        still carries these columns.
    """
    n_pixels = resize_size * resize_size
    pixel_cols = [str(i) for i in range(n_pixels)]

    # Pull ids and pixels out positionally once, instead of materialising a
    # mixed-dtype row Series on every iteration.
    image_ids = image_df.iloc[:, 0].to_numpy()
    raw_pixels = image_df.iloc[:, 1:].to_numpy()

    flat_images = np.empty((len(image_df), n_pixels), dtype=np.uint8)
    for i in tqdm(range(len(image_df))):
        img = raw_pixels[i].reshape(img_height, img_width, 1).astype(np.uint8)
        img_resized = resize_threshold(img, resize_size=resize_size)
        flat_images[i] = img_resized.reshape(n_pixels)

    reduced_df = pd.DataFrame(flat_images, columns=pixel_cols)
    reduced_df.insert(0, 'image_id', image_ids)
    return reduced_df

In [5]:
# Load test.csv from the data directory that contains the parquet folder.
# The path is derived from datadir_parquet so the data location is configured
# in one place only.
test_df = pd.read_csv(datadir_parquet.parent / 'test.csv')

In [6]:
# Read shard 0 of the test images from the parquet directory (pyarrow engine).
test_image_0 = pd.read_parquet(datadir_parquet/'test_image_data_0.parquet',engine='pyarrow')

In [7]:
# Threshold, crop and resize every image of shard 0; the result has one row per
# image: image_id followed by the flattened 64x64 pixels.
combined_test_df_0 = combine_image_labels(test_image_0)

100%|██████████| 3/3 [00:00<00:00, 23.93it/s]


In [8]:
# Save the resized shard 0 as CSV next to the parquet folder. The path is
# derived from datadir_parquet instead of repeating the absolute location.
combined_test_df_0.to_csv(datadir_parquet.parent / 'combined_test_0.csv', index=False)

In [9]:
# Shard 1: read the raw images, resize them, and save the result as CSV.
# NOTE(review): this cell repeats the shard-0 steps with only the index
# changed; a loop over the shard numbers would remove the duplication.
test_image_1 = pd.read_parquet(datadir_parquet/'test_image_data_1.parquet',engine='pyarrow')
combined_test_df_1 = combine_image_labels(test_image_1)
# Output path derived from datadir_parquet rather than hard-coded.
combined_test_df_1.to_csv(datadir_parquet.parent / 'combined_test_1.csv', index=False)

100%|██████████| 3/3 [00:00<00:00, 25.59it/s]


In [10]:
# Shard 2: read the raw images, resize them, and save the result as CSV.
# NOTE(review): this cell repeats the shard-0 steps with only the index
# changed; a loop over the shard numbers would remove the duplication.
test_image_2 = pd.read_parquet(datadir_parquet/'test_image_data_2.parquet',engine='pyarrow')
combined_test_df_2 = combine_image_labels(test_image_2)
# Output path derived from datadir_parquet rather than hard-coded.
combined_test_df_2.to_csv(datadir_parquet.parent / 'combined_test_2.csv', index=False)

100%|██████████| 3/3 [00:00<00:00, 26.38it/s]


In [11]:
# Shard 3: read the raw images, resize them, and save the result as CSV.
# NOTE(review): this cell repeats the shard-0 steps with only the index
# changed; a loop over the shard numbers would remove the duplication.
test_image_3 = pd.read_parquet(datadir_parquet/'test_image_data_3.parquet',engine='pyarrow')
combined_test_df_3 = combine_image_labels(test_image_3)
# Output path derived from datadir_parquet rather than hard-coded.
combined_test_df_3.to_csv(datadir_parquet.parent / 'combined_test_3.csv', index=False)

100%|██████████| 3/3 [00:00<00:00, 27.35it/s]


In [12]:
# Preview only the first rows so the cell output stays small however many
# images the shard contains.
combined_test_df_0.head()

Unnamed: 0,image_id,0,1,2,3,4,5,6,7,8,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,Test_0,0,0,0,0,0,0,0,255,0,...,0,0,0,0,0,0,0,0,0,0
1,Test_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Test_2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Stack the four per-shard frames into one table with a fresh 0..n-1 index.
combined_test = pd.concat(
    [combined_test_df_0, combined_test_df_1, combined_test_df_2, combined_test_df_3],
    ignore_index=True,
)

In [14]:
# Save all resized test images as a single parquet file next to the parquet
# folder. The path is derived from datadir_parquet instead of repeating the
# absolute location.
combined_test.to_parquet(datadir_parquet.parent / 'test_resized.parquet', index=False)