In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm_notebook as tqdm
import zipfile
import io
import warnings
warnings.filterwarnings("ignore")
import unicodedata
from sys import getsizeof

# Used to render Bengali grapheme text as images
import text_to_image
import PIL.Image as Image, PIL.ImageDraw as ImageDraw, PIL.ImageFont as ImageFont
%matplotlib inline


# 1.0 Load Datasets

In [3]:
# The three CSV tables are read in one pass and unpacked in file order:
# class_map.csv -> df_class_map, train.csv -> df_train, test.csv -> df_test.
df_class_map, df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('class_map.csv', 'train.csv', 'test.csv')
)

# The first training image shard is stored as parquet rather than CSV.
df_images0 = pd.read_parquet('train_image_data_0.parquet')

In [23]:
# Print the shape tuple of every loaded frame, one labelled line per frame.
for frame_label, frame in (
    ("df_class_map.shape:", df_class_map),
    ("df_train:", df_train),
    ("df_test:", df_test),
    ("df_images0:", df_images0),
):
    print(frame_label, frame.shape)

# The row count of df_train is reported as the total number of images.
print(f"Total of {len(df_train)} number of images")

df_class_map.shape: (186, 4)
df_train: (200840, 5)
df_test: (36, 3)
df_images0: (50210, 32333)
Total of 200840 number of images


In [6]:
# Shape goes to stdout; the frame preview is the cell's displayed value.
print(df_class_map.shape)
df_class_map.head(n=5)

(186, 3)


Unnamed: 0,component_type,label,component
0,grapheme_root,0,ং
1,grapheme_root,1,ঃ
2,grapheme_root,2,অ
3,grapheme_root,3,আ
4,grapheme_root,4,ই


In [50]:
# First five class-map rows whose component_type is 'grapheme_root'.
(
    df_class_map
    .query("component_type== 'grapheme_root'")
    .head(n=5)
)

Unnamed: 0,component_type,label,component,num_comps
0,grapheme_root,0,ং,1.0
1,grapheme_root,1,ঃ,1.0
2,grapheme_root,2,অ,1.0
3,grapheme_root,3,আ,1.0
4,grapheme_root,4,ই,1.0


In [51]:
# First five class-map rows whose component_type is 'vowel_diacritic'.
(
    df_class_map
    .query("component_type== 'vowel_diacritic'")
    .head(n=5)
)

Unnamed: 0,component_type,label,component,num_comps
168,vowel_diacritic,0,0,1.0
169,vowel_diacritic,1,া,1.0
170,vowel_diacritic,2,ি,1.0
171,vowel_diacritic,3,ী,1.0
172,vowel_diacritic,4,ু,1.0


In [54]:
# First five class-map rows whose component_type is 'consonant_diacritic'.
(
    df_class_map
    .query("component_type== 'consonant_diacritic'")
    .head(n=5)
)

Unnamed: 0,component_type,label,component,num_comps
179,consonant_diacritic,0,0,1.0
180,consonant_diacritic,1,ঁ,1.0
181,consonant_diacritic,2,র্,1.5
182,consonant_diacritic,3,র্য,2.0
183,consonant_diacritic,4,্য,1.5


In [13]:
# num_comps is a size score derived from the component string's length:
# (number of code points + 1) / 2, so a 1-character component scores 1.0,
# a 2-character one 1.5, a 3-character one 2.0, and so on.
# Computed with the vectorised .str accessor; no placeholder column is needed
# because the assignment creates (or overwrites) the column in one step.
# NOTE: this adds a column to df_class_map in place, so any preview of the
# frame rendered before this cell ran will not show num_comps.
df_class_map['num_comps'] = (df_class_map['component'].str.len() + 1) / 2
df_class_map.tail()

Unnamed: 0,component_type,label,component,num_comps
181,consonant_diacritic,2,র্,1.5
182,consonant_diacritic,3,র্য,2.0
183,consonant_diacritic,4,্য,1.5
184,consonant_diacritic,5,্র,1.5
185,consonant_diacritic,6,্র্য,2.5


In [19]:
# Keep the first 250 training rows, then show the last five of that slice.
(
    df_train
    .head(n=250)
    .tail(n=5)
)

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme
245,Train_245,133,0,4,শ্য
246,Train_246,22,7,0,খে
247,Train_247,115,0,0,ম
248,Train_248,96,10,0,পৌ
249,Train_249,124,10,0,লৌ


# Grapheme Root Distribution in Training Data

In [41]:
# Label -> glyph lookup restricted to grapheme roots. Indexing the whole class
# map by its row index only gives the right glyph while root rows happen to
# sit at index == label; keying on the `label` column of the grapheme_root
# rows makes the lookup independent of row order.
groot_components = (
    df_class_map
    .loc[df_class_map.component_type == 'grapheme_root']
    .set_index('label')
    .component
)

# Look up the glyph for every grapheme_root value in df_train. The result has
# one row per df_train row, in the same order, in a single 'component' column
# with a fresh 0..n-1 index.
groot_dist_df = (
    groot_components
    .loc[df_train.grapheme_root]
    .reset_index(drop=True)
    .to_frame()
)
groot_dist_df.head()

Unnamed: 0,component
0,ক্ট
1,হ
2,খ
3,ট
4,থ


In [46]:
# `df` is a second name for groot_dist_df (no copy is made).
df = groot_dist_df

# Occurrence count of each distinct component value, least frequent first.
df['component'].value_counts(sort=True, ascending=True)

দ্ঘ       130
ঙ্ক্ত     136
প্স       141
স্স       143
জ্জ্ব     144
         ... 
গ        5149
ব        5321
ক        5420
ত        5596
দ        5736
Name: component, Length: 168, dtype: int64