In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from collections import Counter


In [2]:
DATA_DIR = '/kaggle/input/bangla-ner/'

In [3]:
train_data = pd.read_csv(DATA_DIR+'cleaned_train.csv')
valid_data = pd.read_csv(DATA_DIR+'cleaned_valid.csv')

In [4]:
train_data.head()

Unnamed: 0,entity,tag,pos
0,তার,O,pronoun
1,মৃত্যুর,O,UNK
2,দশ,O,adverb
3,দিন,O,verb
4,পর,O,adjective


In [5]:
len(train_data.loc[4,'entity'])

2

In [6]:
train_data['entity'] = train_data['entity'].astype(str)
train_data['tag'] = train_data['tag'].astype(str)
train_data['pos'] = train_data['pos'].astype(str)

# Feature-engineering
- len might be a possible feature as the difference between the lengths of the types of the entities may help
- word frequency 
- pos as the vector of UNK, pronoun, noun, verb, adverb, adjective, article, interjection, conjunction, preposition, prefix, article, abbreviation,phraseadjective/feminine

In [7]:
# the vocabulary in the training set present
word_list = train_data['entity'].values
word_count = Counter(word_list)
pos_vector = ['UNK', 'pronoun', 'noun', 'verb', 'adverb', 'adjective', 'article', 'interjection', 'conjunction', 'preposition', 'prefix', 'article', 'abbreviation','phraseadjective/feminine']

In [8]:
def get_posvector(text):
    pos_vec = np.zeros(len(pos_vector))
    idx = np.where(pos_vector == 15)
    pos_vec[idx] = 1
    return pos_vec

In [9]:
train_data['len'] = train_data['entity'].apply(lambda x: len(x))
train_data['frequency'] = train_data['entity'].apply(lambda x: word_count.get(x, 0))
train_data['pos_vector'] = train_data['entity'].apply(get_posvector)

In [10]:
train_data.head()

Unnamed: 0,entity,tag,pos,len,frequency,pos_vector
0,তার,O,pronoun,3,1602,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,মৃত্যুর,O,UNK,7,48,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,দশ,O,adverb,2,21,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,দিন,O,verb,3,55,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,পর,O,adjective,2,303,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [11]:
valid_data['entity'] = valid_data['entity'].astype(str)
valid_data['tag'] = valid_data['tag'].astype(str)
valid_data['pos'] = valid_data['pos'].astype(str)

In [12]:
valid_data['len'] = valid_data['entity'].apply(lambda x: len(x))
valid_data['frequency'] = valid_data['entity'].apply(lambda x: word_count.get(x, 0))
valid_data['pos_vector'] = valid_data['entity'].apply(get_posvector)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import f1_score, make_scorer

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
def encode_labels(data,subset, enc=None):
    
    if subset == 'train':
        enc = LabelEncoder()
        transformed_labels = enc.fit_transform(data.to_list())
    else: 
        transformed_labels = enc.transform(data.to_list())

    transformed_labels  = transformed_labels.reshape(-1,1)
    return transformed_labels, enc

In [16]:
train_data['tag'], label_encoder = encode_labels(train_data['tag'], 'train')

In [17]:
valid_data['tag'], _ = encode_labels(valid_data['tag'], 'valid', label_encoder)

In [18]:
words , tags = train_data[['len','frequency']].values.tolist(), train_data['tag'].values

In [19]:
cross_val_score(RandomForestClassifier(n_estimators=20),scoring=make_scorer(f1_score, average='macro'),X=words, y=tags, cv=5)

array([0.21459775, 0.21481537, 0.21296821, 0.21212955, 0.21437238])