In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from collections import Counter


In [2]:
DATA_DIR = '/kaggle/input/bangla-ner/'

In [3]:
train_data = pd.read_csv(DATA_DIR+'cleaned_train.csv')
valid_data = pd.read_csv(DATA_DIR+'cleaned_valid.csv')

In [4]:
stop_words = pd.read_excel('/kaggle/input/bangla-stopwords/bangla_stopwords.xlsx')

In [5]:
stop_words.head()

Unnamed: 0.1,Unnamed: 0,word_list
0,0,অই
1,1,অগত্যা
2,2,অত: পর
3,3,অতএব
4,4,অথচ


In [6]:
train_data.head()

Unnamed: 0,entity,tag,pos
0,তার,O,pronoun
1,মৃত্যুর,O,UNK
2,দশ,O,adverb
3,দিন,O,verb
4,পর,O,adjective


In [7]:
len(train_data.loc[4,'entity'])

2

In [8]:
train_data['entity'] = train_data['entity'].astype(str)
train_data['tag'] = train_data['tag'].astype(str)
train_data['pos'] = train_data['pos'].astype(str)

# Feature-engineering
- len might be a possible feature as the difference between the lengths of the types of the entities may help
- word frequency 
- pos as the vector of UNK, pronoun, noun, verb, adverb, adjective, article, interjection, conjunction, preposition, prefix, article, abbreviation,phraseadjective/feminine

In [9]:
# the vocabulary in the training set present
word_list = train_data['entity'].values
word_count = Counter(word_list)
pos_vector = ['UNK', 'pronoun', 'noun', 'verb', 'adverb', 'adjective', 'article', 'interjection', 'conjunction', 'preposition', 'prefix', 'article', 'abbreviation','phraseadjective/feminine']

In [10]:
def get_posvector(text):
    pos_vec = np.zeros(len(pos_vector))
    idx = np.where(pos_vector == 15)
    pos_vec[idx] = 1
    return pos_vec

In [11]:
train_data['len'] = train_data['entity'].apply(lambda x: len(x))
train_data['frequency'] = train_data['entity'].apply(lambda x: word_count.get(x, 0))
train_data['pos_vector'] = train_data['entity'].apply(get_posvector)

In [12]:
train_data.head()

Unnamed: 0,entity,tag,pos,len,frequency,pos_vector
0,তার,O,pronoun,3,1602,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,মৃত্যুর,O,UNK,7,48,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,দশ,O,adverb,2,21,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,দিন,O,verb,3,55,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,পর,O,adjective,2,303,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
valid_data['entity'] = valid_data['entity'].astype(str)
valid_data['tag'] = valid_data['tag'].astype(str)
valid_data['pos'] = valid_data['pos'].astype(str)

In [14]:
valid_data['len'] = valid_data['entity'].apply(lambda x: len(x))
valid_data['frequency'] = valid_data['entity'].apply(lambda x: word_count.get(x, 0))
valid_data['pos_vector'] = valid_data['entity'].apply(get_posvector)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import f1_score, make_scorer

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
def encode_labels(data,subset, enc=None):
    
    if subset == 'train':
        enc = LabelEncoder()
        transformed_labels = enc.fit_transform(data.to_list())
    else: 
        transformed_labels = enc.transform(data.to_list())

    transformed_labels  = transformed_labels.reshape(-1,1)
    return transformed_labels, enc

In [18]:
train_data['tag'], label_encoder = encode_labels(train_data['tag'], 'train')

In [19]:
valid_data['tag'], _ = encode_labels(valid_data['tag'], 'valid', label_encoder)

# Identify if entities are stopwords

In [20]:
stop_words = set(stop_words['word_list'].values)

In [21]:
train_data['is_stop'] = train_data['entity'].apply(lambda x: 1 if x in stop_words else 0)

In [22]:
words , tags = train_data[['len', 'frequency', 'is_stop']].values.tolist(), train_data['tag'].values

In [23]:
cross_val_score(RandomForestClassifier(n_estimators=20),scoring=make_scorer(f1_score, average='macro'),X=words, y=tags, cv=5)

array([0.22464074, 0.2232364 , 0.2198092 , 0.2154388 , 0.22816955])

In [24]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [25]:
cross_val_score(MultinomialNB(),scoring=make_scorer(f1_score, average='macro'),X=words, y=tags, cv=5)

array([0.10975334, 0.10823476, 0.11183666, 0.11197324, 0.12689889])