In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm_notebook as tqdm
import zipfile
import io
import warnings
warnings.filterwarnings("ignore")
import unicodedata
from sys import getsizeof

# To convert text to image for getting images of Bengali graphemes
import PIL.Image as Image, PIL.ImageDraw as ImageDraw, PIL.ImageFont as ImageFont
%matplotlib inline

from google.colab import drive
drive.mount('/content/drive/')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
main_path = '/content/drive/My Drive/0.0_Colab/bengaliai-cv19/'

In [0]:
df_class_map = pd.read_csv(main_path + 'class_map.csv')
df_train = pd.read_csv(main_path + 'train.csv')
df_test = pd.read_csv(main_path + 'test.csv')
df_images0 = pd.read_parquet(main_path + 'train_image_data_0.parquet')

# Class Map
The class map lists the three components of each grapheme:
* Grapheme root
* Vowel diacritic
* Consonant diacritic

The grapheme roots may be comprised of one or more even basic graphemes. 

## Grapheme Roots


In [4]:
df_class_map.head()

Unnamed: 0,component_type,label,component
0,grapheme_root,0,ং
1,grapheme_root,1,ঃ
2,grapheme_root,2,অ
3,grapheme_root,3,আ
4,grapheme_root,4,ই


In [14]:
print(len(df_class_map.query("component_type == 'grapheme_root'").component), "grapheme roots")
print(set(df_class_map.query("component_type == 'grapheme_root'").component))

168 grapheme roots
{'ল্ড', 'এ', 'ও', 'শ্ব', 'ম্ম', 'হ্ম', 'ব্ব', 'জ', 'ন্ঠ', 'ক্ষ্ণ', 'ত্ম', 'ন্ত', 'য়', 'ব্ধ', 'ষ্ঠ', 'স্ট', 'ন্দ', 'ল', 'ঘ্ন', 'ক্ত', 'গ্ম', 'চ', 'জ্জ্ব', 'ড', 'ঙ্ক্ষ', 'ত্থ', 'ল্ট', 'ষ্প', 'স্থ', 'ভ', 'ষ', 'ল্ক', 'শ্ন', 'স্ন', 'ঈ', 'হ্ব', 'ঙ্গ', 'ষ্ক', 'ব্জ', 'ঋ', 'ম্ভ', 'ফ্ফ', 'ঔ', 'উ', 'ল্প', 'দ', 'ত্ত্ব', 'স্ত', 'জ্ব', 'ঞ্জ', 'ন্জ', 'ফ্ল', 'শ্ম', 'খ', 'শ', 'চ্ছ্ব', 'ফ্ট', 'ল্ম', 'র', 'ঝ', 'ম্ল', 'ষ্ফ', 'থ', 'দ্ধ', 'ক্ষ', 'ঃ', 'দ্ভ', 'দ্ঘ', 'ষ্ণ', 'ষ্ম', 'ণ্ণ', 'দ্ম', 'চ্ছ', 'ন্ট', 'জ্জ', 'ঞ্ছ', 'ণ', 'প', 'প্স', 'ই', 'ব', 'দ্দ', 'ঢ', 'ড়', 'গ্ন', 'হ্ল', 'ন্ড', 'গ্ল', 'স', 'স্ম', 'ম্ন', 'ঐ', 'ক্ল', 'ল্গ', 'স্ক', 'ব্ল', 'গ্ধ', 'ল্ল', 'স্ল', 'স্প', 'ঠ', 'শ্ল', 'ক্স', 'ত', 'প্ট', 'ঙ', 'প্ত', 'শ্চ', 'ব্দ', 'ম', 'ঘ', 'ন্স', 'ক', 'প্ল', 'ত্ত', 'ম্প', 'ভ্ল', 'ক্ষ্ম', 'ণ্ঠ', 'স্স', 'স্ব', 'ঞ', 'ল্ব', 'ন্দ্ব', 'ণ্ড', 'ক্ক', 'ন', 'ধ্ব', 'ঞ্চ', 'ঊ', 'ৎ', 'ঙ্খ', 'ছ', 'ং', 'ত্ব', 'ন্ম', 'ঙ্ক', 'ট্ট', 'ম্ব', 'ক্ট', 'ধ', 'ফ', 'হ্ন', 'ঙ্ঘ', 'য', 'স্ফ', 'ন্থ', 'প্প', 'দ্ব', 'ন্ব', 'ড

## Consonant Diacritics

In [15]:
print(len(df_class_map.query("component_type == 'consonant_diacritic'").component), "consonant diacritics")
print(set(df_class_map.query("component_type == 'consonant_diacritic'").component))

7 consonant diacritics
{'র্', 'ঁ', '0', '্র', '্র্য', 'র্য', '্য'}


## Vowel Diacritics

In [16]:
print(len(df_class_map.query("component_type == 'vowel_diacritic'").component), "vowel diacritics")
print(set(df_class_map.query("component_type == 'vowel_diacritic'").component))

11 vowel diacritics
{'া', 'ি', 'ে', 'ৌ', 'ু', 'ৃ', 'ূ', 'ো', '0', 'ৈ', 'ী'}


# Grapheme Root Length
Each grapheme root contains of one consonant element, such as জ (ja) or 2 elements, such as ত্ম, or 3 elements, such as 'জ্জ্ব'. In the dataset for this project, no grapheme roots are more than three elements long. 

In [0]:
df_class_map['num_comps'] = 0
df_class_map['num_comps'] = df_class_map.component.apply(lambda x: (len(x.replace('্',''))))
df_class_map.tail()

Unnamed: 0,component_type,label,component,num_comps
181,consonant_diacritic,2,র্,1
182,consonant_diacritic,3,র্য,2
183,consonant_diacritic,4,্য,1
184,consonant_diacritic,5,্র,1
185,consonant_diacritic,6,্র্য,2


In [0]:
df_class_map.head()

Unnamed: 0,component_type,label,component,num_comps
0,grapheme_root,0,ং,1
1,grapheme_root,1,ঃ,1
2,grapheme_root,2,অ,1
3,grapheme_root,3,আ,1
4,grapheme_root,4,ই,1


In [0]:
grapheme_roots = set(df_class_map.query("component_type=='grapheme_root'").component)
consonant_diacritics = set(df_class_map.query("component_type=='consonant_diacritic'").component)
vowel_diacritics = set(df_class_map.query("component_type=='vowel_diacritic'").component)

In [0]:
df_train.head()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme
0,Train_0,15,9,5,ক্ট্রো
1,Train_1,159,0,0,হ
2,Train_2,22,3,5,খ্রী
3,Train_3,53,2,2,র্টি
4,Train_4,71,9,5,থ্রো


In [0]:
df_train_expanded = df_train[['image_id', 'grapheme']]
df_train_expanded['length'] = 0
df_train_expanded['length'] = df_train_expanded.grapheme.apply(lambda x: (len(x.replace('্',''))))


In [0]:
df_train_expanded.head()

Unnamed: 0,image_id,grapheme,length
0,Train_0,ক্ট্রো,4
1,Train_1,হ,1
2,Train_2,খ্রী,3
3,Train_3,র্টি,3
4,Train_4,থ্রো,3


# Compound Graphemes

Some graphemes, such as 'ক্ট্রো' are comprised of multiple graphemes along with the respective vowel or consonant diacritic. 

In [0]:
strs = ['ক্ট্রো', 'থ্রো', 'খ্রী']
for str in strs:
  print(str + " = ")
  for s in str:
    print(s + ' + ')
  print('\n')


ক্ট্রো = 
ক + 
্ + 
ট + 
্ + 
র + 
ো + 


থ্রো = 
থ + 
্ + 
র + 
ো + 


খ্রী = 
খ + 
্ + 
র + 
ী + 




Since many of the basic grapheme units in the class map comprise of multiple graphemes, we will not expand them to include all

In [0]:
df_class_map.sort_values(by = "num_comps", ascending=False)

Unnamed: 0,component_type,label,component,num_comps
41,grapheme_root,41,চ্ছ্ব,3
90,grapheme_root,90,ন্দ্ব,3
20,grapheme_root,20,ক্ষ্ম,3
19,grapheme_root,19,ক্ষ্ণ,3
66,grapheme_root,66,ত্ত্ব,3
...,...,...,...,...
139,grapheme_root,139,ষ,1
23,grapheme_root,23,গ,1
22,grapheme_root,22,খ,1
147,grapheme_root,147,স,1


Thus, we see that there are a maximum of 3 basic grapheme that make up the graphemes in the class map. 

In [0]:
str = 'চ্ছ্ব'
list(str.replace('্', ''))

['চ', 'ছ', 'ব']

In [0]:
df_roots = pd.DataFrame()
for row in df_class_map.iterrows():
  comp = row[1].component
  elements = list(str.replace('্', ''))
  


SyntaxError: ignored