# Bengali.AI
-----
### Quick data exploration
I'll continue exploring the Bengali.AI dataset over the next couple of days, so stay tuned.

In [None]:
import os
import pandas as pd
import numpy as np
import PIL.Image as Image, PIL.ImageDraw as ImageDraw, PIL.ImageFont as ImageFont

import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [None]:
# Image dimensions in pixels; used to reshape flattened rows and to size rendered glyphs.
HEIGHT, WIDTH = 137, 236

In [None]:
def load_as_npa(file):
    """Read a parquet file and split it into its first column and an image array.

    Parameters
    ----------
    file
        Location of the parquet file; handed unchanged to ``pd.read_parquet``.

    Returns
    -------
    tuple
        ``(first_column, images)``: the first column of the frame as a Series,
        and all remaining columns reshaped to ``(-1, HEIGHT, WIDTH)``.
    """
    frame = pd.read_parquet(file)
    first_column = frame.iloc[:, 0]
    flat_pixels = frame.iloc[:, 1:].values
    return first_column, flat_pixels.reshape(-1, HEIGHT, WIDTH)

def image_from_char(char):
    """Render ``char`` on a new ``WIDTH`` x ``HEIGHT`` RGB image.

    The text is drawn with the TrueType font at the hard-coded Kaggle path
    (size 120) and positioned so that its measured size is centred on the
    canvas.

    Parameters
    ----------
    char : str
        Text to draw.

    Returns
    -------
    PIL.Image.Image
        The image with the text drawn on it.
    """
    image = Image.new('RGB', (WIDTH, HEIGHT))
    draw = ImageDraw.Draw(image)
    myfont = ImageFont.truetype('/kaggle/input/bengaliai/hind_siliguri_normal_500.ttf', 120)

    if hasattr(draw, 'textbbox'):
        # `ImageDraw.textsize` was deprecated in Pillow 9.2 and removed in 10.0.
        # The right/bottom edges of the box anchored at (0, 0) equal the size
        # `textsize` used to report, so the placement stays the same.
        _, _, w, h = draw.textbbox((0, 0), char, font=myfont)
    else:
        # Older Pillow releases that do not provide `textbbox` yet.
        w, h = draw.textsize(char, font=myfont)
    draw.text(((WIDTH - w) / 2, (HEIGHT - h) / 2), char, font=myfont)

    return image

In [None]:
# Read train_image_data_0.parquet: first column -> image_ids0, reshaped pixels -> images0.
train_images0_path = '/kaggle/input/bengaliai-cv19/train_image_data_0.parquet'
image_ids0, images0 = load_as_npa(train_images0_path)

In [None]:
# Show the first 25 entries of `images0` on a 5x5 grid.
preview_fig, preview_axes = plt.subplots(5, 5, figsize=(16, 8))

for position, axis in enumerate(preview_axes.flatten()):
    axis.imshow(images0[position], cmap='Greys')

### train.csv
- **image_id**: the foreign key for the parquet files
- **grapheme_root**: the first of the three target classes
- **vowel_diacritic**: the second target class
- **consonant_diacritic**: the third target class
- **grapheme**: the complete character. Provided for informational purposes only; you should not need to use this.

In [None]:
# Load the training labels table and preview its first five rows.
train_csv_path = '/kaggle/input/bengaliai-cv19/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head(n=5)

In [None]:
# Display the (rows, columns) shape of the training labels frame.
train_df.shape

In [None]:
# Load the class map (component_type / label / component) and preview its first five rows.
class_map_csv_path = '/kaggle/input/bengaliai-cv19/class_map.csv'
class_map_df = pd.read_csv(class_map_csv_path)
class_map_df.head(n=5)

# Grapheme root

In [None]:
# Report how many distinct `grapheme_root` labels occur in the training labels.
n_grapheme_roots = train_df['grapheme_root'].nunique()
print("Number of unique grapheme_root: {}".format(n_grapheme_roots))

In [None]:
# Histogram of the `grapheme_root` label column.
root_histogram = go.Histogram(x=train_df['grapheme_root'])
fig = go.Figure(data=[root_histogram])
fig.update_layout(title_text='`grapheme_root` values')
fig.show()

It seems that `grapheme_root` is highly imbalanced.

## Most common `grapheme_root` values

In [None]:
# Count each `grapheme_root`, sort ascending by count and keep the last 20
# entries, i.e. the 20 most frequent labels. `x` is reused by the next cell.
top_root_counts = train_df['grapheme_root'].value_counts().sort_values()[-20:]
x = top_root_counts.index
y = top_root_counts.values
top_root_bars = go.Bar(x=x, y=y)
fig = go.Figure(data=[top_root_bars])
fig.update_layout(title_text='Most common `grapheme_root` values')
fig.show()

In [None]:
# Components of the class-map rows that are grapheme roots and whose label is in `x`.
is_root_row = class_map_df['component_type'] == 'grapheme_root'
is_common_label = class_map_df['label'].isin(x)
common_gr = class_map_df.loc[is_root_row & is_common_label, 'component']

In [None]:
# Render the first 20 entries of `common_gr` with the font on a 4x5 grid.
common_fig, common_axes = plt.subplots(4, 5, figsize=(16, 8))

for position, axis in enumerate(common_axes.flatten()):
    axis.imshow(image_from_char(common_gr.values[position]), cmap='Greys')

## Least common `grapheme_root` values

In [None]:
# Count each `grapheme_root`, sort ascending by count and keep the first 20
# entries, i.e. the 20 least frequent labels. `x` is reused by the next cell.
rare_root_counts = train_df['grapheme_root'].value_counts().sort_values()[:20]
x = rare_root_counts.index
y = rare_root_counts.values
rare_root_bars = go.Bar(x=x, y=y)
fig = go.Figure(data=[rare_root_bars])
fig.update_layout(title_text='Least common `grapheme_root` values')
fig.show()

In [None]:
# Components of the class-map rows that are grapheme roots and whose label is in `x`.
is_root_entry = class_map_df['component_type'] == 'grapheme_root'
is_rare_label = class_map_df['label'].isin(x)
notcommon_gr = class_map_df.loc[is_root_entry & is_rare_label, 'component']

In [None]:
# Render the first 20 entries of `notcommon_gr` with the font on a 4x5 grid.
rare_fig, rare_axes = plt.subplots(4, 5, figsize=(16, 8))

for position, axis in enumerate(rare_axes.flatten()):
    axis.imshow(image_from_char(notcommon_gr.values[position]), cmap='Greys')

# Vowel diacritic

In [None]:
# Number of distinct `vowel_diacritic` labels in the training labels.
vowel_labels = train_df['vowel_diacritic']
vowel_labels.nunique()

In [None]:
# Frequency of every `vowel_diacritic` label, sorted ascending by count.
# `x` is reused by the next cell.
vowel_counts = train_df['vowel_diacritic'].value_counts().sort_values()
x = vowel_counts.index
y = vowel_counts.values
vowel_bars = go.Bar(x=x, y=y)
fig = go.Figure(data=[vowel_bars])
fig.update_layout(title_text='`vowel_diacritic` values')
fig.show()

In [None]:
# Components of the class-map rows that are vowel diacritics and whose label is in `x`.
is_vowel_row = class_map_df['component_type'] == 'vowel_diacritic'
is_seen_vowel = class_map_df['label'].isin(x)
vowels = class_map_df.loc[is_vowel_row & is_seen_vowel, 'component']

In [None]:
# Render the vowel diacritics on a 3x5 grid; axes beyond the number of
# available vowels are left empty.
vowel_fig, vowel_axes = plt.subplots(3, 5, figsize=(16, 8))

for axis, vowel_char in zip(vowel_axes.flatten(), vowels.values):
    axis.imshow(image_from_char(vowel_char), cmap='Greys')

# Consonant diacritic

In [None]:
# Number of distinct `consonant_diacritic` labels in the training labels.
consonant_labels = train_df['consonant_diacritic']
consonant_labels.nunique()

In [None]:
# Frequency of every `consonant_diacritic` label, sorted ascending by count.
# `x` is reused by the next cell.
consonant_counts = train_df['consonant_diacritic'].value_counts().sort_values()
x = consonant_counts.index
y = consonant_counts.values
consonant_bars = go.Bar(x=x, y=y)
fig = go.Figure(data=[consonant_bars])
fig.update_layout(title_text='`consonant_diacritic` values')
fig.show()

In [None]:
# Components of the class-map rows that are consonant diacritics and whose label is in `x`.
is_consonant_row = class_map_df['component_type'] == 'consonant_diacritic'
is_seen_consonant = class_map_df['label'].isin(x)
consonants = class_map_df.loc[is_consonant_row & is_seen_consonant, 'component']

In [None]:
# Render the first 7 entries of `consonants` side by side in a single row.
consonant_fig, consonant_axes = plt.subplots(1, 7, figsize=(16, 8))

for position, axis in enumerate(consonant_axes.flatten()):
    axis.imshow(image_from_char(consonants.values[position]), cmap='Greys')

# Similar Graphemes
The most common `grapheme_root` is `দ`. Let's check some variants.

In [None]:
# Keep only the first 50,000 label rows; the cells below index `images0` with
# the row labels of this frame.
# NOTE(review): presumably this keeps every row label inside `images0`, which
# was loaded from a single parquet file — confirm the row order matches.
train_df = train_df[0:50000]

# Most common grapheme_root
gr_root_component = class_map_df[(class_map_df['component_type'] == 'grapheme_root') & (class_map_df['label'] == 72)]['component']
# Take the match by position: the label lookup `gr_root_component[72]` only
# works while the row's index label happens to equal the class label.
plt.imshow(image_from_char(gr_root_component.iloc[0]), cmap='Greys')

### Digital variants of the most common `grapheme_root`

In [None]:
# Draw 25 rows whose `grapheme_root` is 72. A fixed seed keeps the selection
# (and every figure built from it) the same on each run.
# The original row labels are kept on purpose: the handwritten-variants cell
# further down uses them to index `images0`, so the index must not be reset.
samples = train_df[train_df['grapheme_root'] == 72].sample(n=25, random_state=42)

f, ax = plt.subplots(5, 5, figsize=(16, 8))
ax = ax.flatten()
# Render the complete grapheme of every sampled row with the font.
for axis, grapheme in zip(ax, samples['grapheme']):
    axis.imshow(image_from_char(grapheme), cmap='Greys')

### Handwritten variants of the most common `grapheme_root`

The samples below are the handwritten counterparts of the digital renderings above.

In [None]:
# Show the handwritten image for every row of `samples` on a 5x5 grid.
# NOTE(review): `images0` is indexed with the row labels of `samples`; this
# assumes the label rows and the image rows share the same order — confirm.
handwritten_fig, handwritten_axes = plt.subplots(5, 5, figsize=(16, 8))
handwritten_axes = handwritten_axes.flatten()
for slot, row_label in enumerate(samples.index):
    handwritten_axes[slot].imshow(images0[row_label], cmap='Greys')

### Examples of grapheme root `দ` without vowel_diacritic and consonant_diacritic components.

In [None]:
# Rows with root label 72 whose vowel and consonant diacritic labels are both 0.
# A fixed seed keeps the 25 sampled rows the same on each run.
samples = train_df[
    (train_df['grapheme_root'] == 72) &
    (train_df['vowel_diacritic'] == 0) &
    (train_df['consonant_diacritic'] == 0)
].sample(n=25, random_state=42)

f, ax = plt.subplots(5, 5, figsize=(16, 8))
ax = ax.flatten()
# `images0` is indexed with the row labels of the sampled rows.
for k, row_label in enumerate(samples.index):
    ax[k].imshow(images0[row_label], cmap='Greys')

----------------
**Thanks for reading. Please vote if you find this notebook useful.**