In [13]:
import numpy as np
import pandas as pd
import os

"""
This will load a csv file and return a pandas dataframe
@input: filepath:: string :: filepath of csv file
@input: names:: list :: column names list 
"""
def load_csv(filepath, names=None):
    """Read a comma-separated UTF-8 file into a pandas DataFrame.

    Args:
        filepath: Path of the CSV file; it is converted to an absolute path
            before use.
        names: Optional list of column names, forwarded to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when nothing exists at ``filepath``.
    """
    resolved_path = os.path.abspath(filepath)

    # Guard clause: a missing path yields None rather than an exception.
    if not os.path.exists(resolved_path):
        return None

    return pd.read_csv(resolved_path, sep=",", names=names, encoding='utf-8')

In [21]:
# Input CSV locations; load_csv resolves them against the current working directory.
class_map_path = "data/class_map.csv"
train_data_path = "data/train.csv"
test_data_path = "data/test.csv"

# load_csv returns None when the file is missing. Fail here with a clear
# message instead of an opaque AttributeError on the .head() call below.
class_df = load_csv(class_map_path)
if class_df is None:
    raise FileNotFoundError(f"Class map CSV not found: {class_map_path}")
class_df.head()

Unnamed: 0,component_type,label,component
0,grapheme_root,0,ং
1,grapheme_root,1,ঃ
2,grapheme_root,2,অ
3,grapheme_root,3,আ
4,grapheme_root,4,ই


In [30]:
# Keep only the class-map rows whose component_type is "grapheme_root".
grapheme_roots = class_df.loc[class_df["component_type"].eq("grapheme_root")]
grapheme_roots.head()

Unnamed: 0,component_type,label,component
0,grapheme_root,0,ং
1,grapheme_root,1,ঃ
2,grapheme_root,2,অ
3,grapheme_root,3,আ
4,grapheme_root,4,ই


In [33]:
# Keep only the class-map rows whose component_type is "vowel_diacritic".
vowel_diacritic = class_df.loc[class_df["component_type"].eq("vowel_diacritic")]
vowel_diacritic.head()

Unnamed: 0,component_type,label,component
168,vowel_diacritic,0,0
169,vowel_diacritic,1,া
170,vowel_diacritic,2,ি
171,vowel_diacritic,3,ী
172,vowel_diacritic,4,ু


In [34]:
# Keep only the class-map rows whose component_type is "consonant_diacritic".
consonant_diacritic = class_df.loc[class_df["component_type"].eq("consonant_diacritic")]
consonant_diacritic.head()

Unnamed: 0,component_type,label,component
179,consonant_diacritic,0,0
180,consonant_diacritic,1,ঁ
181,consonant_diacritic,2,র্
182,consonant_diacritic,3,র্য
183,consonant_diacritic,4,্য


In [45]:
# load_csv returns None when the file is missing. Raise a clear error here
# rather than letting later cells fail on a None value.
train_df = load_csv(train_data_path)
if train_df is None:
    raise FileNotFoundError(f"Training CSV not found: {train_data_path}")
train_df

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme
0,Train_0,15,9,5,ক্ট্রো
1,Train_1,159,0,0,হ
2,Train_2,22,3,5,খ্রী
3,Train_3,53,2,2,র্টি
4,Train_4,71,9,5,থ্রো
...,...,...,...,...,...
200835,Train_200835,22,7,2,র্খে
200836,Train_200836,65,9,0,ত্তো
200837,Train_200837,2,1,4,অ্যা
200838,Train_200838,152,9,0,স্নো


In [50]:
# Spot-check a single training row by its image_id.
train_df.loc[train_df["image_id"].eq("Train_200835")]

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme
200835,Train_200835,22,7,2,র্খে


In [61]:
# Attach the grapheme-root character to every training row by joining the
# numeric grapheme_root code against the lookup table's label column.
# NOTE(review): how="outer" also keeps lookup labels that never occur in
# train_df (as NaN-filled rows) -- confirm this is intended rather than "left".
merged_df = (
    train_df
    .merge(grapheme_roots, how="outer", left_on="grapheme_root", right_on="label")
    # The join helper columns are redundant once the component is attached.
    .drop(columns=["component_type", "label"])
    .rename(columns={"component": "grapheme_component"})
)
merged_df

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,grapheme_component
0,Train_0,15,9,5,ক্ট্রো,ক্ট
1,Train_111,15,0,0,ক্ট,ক্ট
2,Train_137,15,9,5,ক্ট্রো,ক্ট
3,Train_431,15,9,5,ক্ট্রো,ক্ট
4,Train_503,15,9,5,ক্ট্রো,ক্ট
...,...,...,...,...,...,...
200835,Train_191632,12,0,0,ঔ,ঔ
200836,Train_195613,12,0,0,ঔ,ঔ
200837,Train_196430,12,0,0,ঔ,ঔ
200838,Train_197200,12,0,0,ঔ,ঔ


In [62]:
# Attach the vowel-diacritic character by joining the numeric vowel_diacritic
# code against the lookup table's label column.
# NOTE(review): this cell rebinds merged_df from itself, so running it twice
# on the same kernel will not give the same result as running it once.
merged_df = (
    merged_df
    .merge(vowel_diacritic, how="outer", left_on="vowel_diacritic", right_on="label")
    # The join helper columns are redundant once the component is attached.
    .drop(columns=["component_type", "label"])
    .rename(columns={"component": "vowel_diacritic_component"})
)
merged_df

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,grapheme_component,vowel_diacritic_component
0,Train_0,15,9,5,ক্ট্রো,ক্ট,ো
1,Train_137,15,9,5,ক্ট্রো,ক্ট,ো
2,Train_431,15,9,5,ক্ট্রো,ক্ট,ো
3,Train_503,15,9,5,ক্ট্রো,ক্ট,ো
4,Train_1094,15,9,5,ক্ট্রো,ক্ট,ো
...,...,...,...,...,...,...,...
200835,Train_192215,113,10,0,ভৌ,ভ,ৌ
200836,Train_194690,113,10,0,ভৌ,ভ,ৌ
200837,Train_197446,113,10,0,ভৌ,ভ,ৌ
200838,Train_198791,113,10,0,ভৌ,ভ,ৌ


In [63]:
# Attach the consonant-diacritic character by joining the numeric
# consonant_diacritic code against the lookup table's label column.
# NOTE(review): this cell rebinds merged_df from itself, so running it twice
# on the same kernel will not give the same result as running it once.
merged_df = (
    merged_df
    .merge(consonant_diacritic, how="outer", left_on="consonant_diacritic", right_on="label")
    # The join helper columns are redundant once the component is attached.
    .drop(columns=["component_type", "label"])
    .rename(columns={"component": "consonant_diacritic_component"})
)
merged_df

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,grapheme_component,vowel_diacritic_component,consonant_diacritic_component
0,Train_0,15,9,5,ক্ট্রো,ক্ট,ো,্র
1,Train_137,15,9,5,ক্ট্রো,ক্ট,ো,্র
2,Train_431,15,9,5,ক্ট্রো,ক্ট,ো,্র
3,Train_503,15,9,5,ক্ট্রো,ক্ট,ো,্র
4,Train_1094,15,9,5,ক্ট্রো,ক্ট,ো,্র
...,...,...,...,...,...,...,...,...
200835,Train_196244,149,1,6,স্ট্র্যা,স্ট,া,্র্য
200836,Train_196753,149,1,6,স্ট্র্যা,স্ট,া,্র্য
200837,Train_197765,149,1,6,স্ট্র্যা,স্ট,া,্র্য
200838,Train_199265,149,1,6,স্ট্র্যা,স্ট,া,্র্য


In [None]:
# Display the merged frame as the cell's output.
merged_df