# Imports and Constants

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Directory holding the competition input files (trailing slash included,
# so file names can be appended directly).
DATA_DIR = '/kaggle/input/bengaliai-cv19/'

# Location of the training CSV inside DATA_DIR.
path_train_data = ''.join((DATA_DIR, 'train.csv'))

In [None]:
# Read the training CSV and discard the 'grapheme' column in one chained step.
df_train = pd.read_csv(path_train_data).drop(columns='grapheme')

# Last expression of the cell: five randomly sampled rows.
df_train.sample(5)

# EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set()
# Make a dark-to-purple sequential palette the default colour cycle.
sns.set_palette(palette=sns.dark_palette(color='purple'))

def bar_palette(n):
    """Return an ``n``-colour HLS palette with lightness 0.4 for bar charts."""
    return sns.hls_palette(n, l=.4)

In [None]:
def plot_frequency(data):
    """Plot per-class and cumulative frequencies of labels in ``df_train``.

    Two charts are drawn: a bar chart of each class's share of the training
    rows (most frequent class first) and a step chart of the running total
    of those shares. With at most 100 classes both charts sit side by side
    in one figure; with more, each chart gets its own full-width figure.

    Parameters
    ----------
    data : str or list of str
        A single column name of ``df_train``, or a list of column names
        whose value combinations are treated as the classes.

    Notes
    -----
    Reads the module-level ``df_train`` and ``bar_palette``; shows the
    figures with ``plt.show()`` and returns nothing.
    """
    if isinstance(data, str):
        counts = df_train[data].value_counts()
        name = data
    else:
        counts = df_train.groupby(data).size().sort_values(ascending=False)
        name = '-'.join(data)

    # Turn absolute counts into fractions of all training rows.
    counts = counts / len(df_train)
    n = len(counts)

    # Up to 100 classes: both charts share one figure.
    # More than that: each chart gets a figure of its own.
    side_by_side = n <= 100

    plt.figure(figsize=(20, 5))
    if side_by_side:
        plt.subplot(1, 2, 1)
    sns.barplot(x=counts.index, y=counts, order=counts.index, palette=bar_palette(n))
    plt.title('Frequency of each %s class [%d]' % (name, n))
    plt.ylabel('Frequency')

    # Tick labels become unreadable with many classes, so hide them.
    if n > 40:
        plt.xticks([])
    else:
        plt.xlabel('Class label')

    if side_by_side:
        plt.subplot(1, 2, 2)
    else:
        plt.show()
        plt.figure(figsize=(20, 5))

    # A 'post' step holds y[i] over [x[i], x[i + 1]), so n plateaus need
    # n + 1 edges. Repeating the final value gives the last class its own
    # plateau and lets the x-axis run up to n, the total number of classes.
    cumulative = counts.cumsum().values
    x = np.arange(n + 1)
    y = np.append(cumulative, cumulative[-1:])

    plt.fill_between(x, y, step='post', alpha=0.4)
    plt.step(x, y, where='post')
    plt.ylim(0, 1.05)
    plt.title('Cumulative frequency of each %s class' % name)
    plt.xlabel('Number of classes')
    plt.ylabel('Cumulative frequency')
    plt.show()

## Single Class Frequencies

### Consonant Diacritic

7 consonant diacritics. About 60% of all data has class 1. Classes 3 and 6 are extremely rare.

In [None]:
# Frequency charts for the 'consonant_diacritic' column alone.
plot_frequency(data='consonant_diacritic')

### Vowel Diacritic

11 vowel diacritics. Classes 0 and 1 are common. Classes 5, 6, 8 and 10 are rare.

In [None]:
# Frequency charts for the 'vowel_diacritic' column alone.
plot_frequency(data='vowel_diacritic')

### Grapheme Root

168 grapheme roots. Classes are quite unbalanced: the 10 most frequent classes account for 25% of all data, and the 30 most frequent classes account for 50%.

In [None]:
# Frequency charts for the 'grapheme_root' column alone.
plot_frequency(data='grapheme_root')

## Double Class Frequencies

### Consonant Diacritic - Vowel Diacritic

About 62% of all possible combinations appear in the training set. Classes are quite unbalanced: the 5 most frequent combinations account for about half of all data.

In [None]:
# Each distinct (consonant_diacritic, vowel_diacritic) pair is one class.
plot_frequency(data=['consonant_diacritic', 'vowel_diacritic'])

### Consonant Diacritic - Grapheme Root

About 28% of all possible combinations appear in the training set. Classes are relatively more balanced than "consonant diacritic - vowel diacritic".

In [None]:
# Each distinct (consonant_diacritic, grapheme_root) pair is one class.
plot_frequency(data=['consonant_diacritic', 'grapheme_root'])

### Vowel Diacritic - Grapheme Root

About 44% of all possible combinations appear in the training set.

In [None]:
# Each distinct (vowel_diacritic, grapheme_root) pair is one class.
plot_frequency(data=['vowel_diacritic', 'grapheme_root'])

## Total Frequencies

About 10% of all possible combinations appear in the training set. Surprisingly, all classes have very similar frequencies.

In [None]:
# Each distinct combination of all three label columns is one class.
plot_frequency(
    data=['consonant_diacritic', 'vowel_diacritic', 'grapheme_root'],
)