In [114]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.model_selection import train_test_split

import nltk
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import word_tokenize   # module for tokenizing strings
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import pickle
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [115]:
train_dataset = pd.read_csv('/kaggle/input/bangla-ner/cleaned_train.csv')
train_dataset.head()

Unnamed: 0,entity,tag,pos
0,তার,O,pronoun
1,মৃত্যুর,O,UNK
2,দশ,O,adverb
3,দিন,O,verb
4,পর,O,adjective


In [116]:
valid_dataset = pd.read_csv('/kaggle/input/bangla-ner/cleaned_valid.csv')
valid_dataset.head()

Unnamed: 0,entity,tag,pos
0,তিনি,O,pronoun
1,যুবক,O,adverb
2,হিসেবে,O,adverb
3,শেফিল্ড,B-GRP,UNK
4,বুধবার,I-GRP,adverb


# Preprocessing

Preprocess training dataset

In [117]:
# def preprocess_dataset(dataframe):
#     dataframe = dataframe.apply(remove_punctuation).apply(tokenize_text).apply(clean_words)
#     return dataframe

In [118]:
#train_dataset['content'] = preprocess_dataset(train_dataset['content'])

In [119]:
#train_dataset.head()

# Encode the labels


In [120]:
def encode_labels(data,subset, enc=None):
    
    if subset == 'train':
        enc = LabelEncoder()
        transformed_labels = enc.fit_transform(data.to_list())
    else: 
        transformed_labels = enc.transform(data.to_list())

    transformed_labels  = transformed_labels.reshape(-1,1)
    return transformed_labels, enc


In [121]:
train_dataset['tag'], label_encoder = encode_labels(train_dataset['tag'], 'train')

In [122]:
train_dataset.head()

Unnamed: 0,entity,tag,pos
0,তার,12,pronoun
1,মৃত্যুর,12,UNK
2,দশ,12,adverb
3,দিন,12,verb
4,পর,12,adjective


## Preprocess valid dataset

In [123]:
#valid_dataset['content'] = preprocess_dataset(valid_dataset['content'])

In [124]:
valid_dataset['tag'], _ = encode_labels(valid_dataset['tag'], 'valid', label_encoder)

In [125]:
valid_dataset.head()

Unnamed: 0,entity,tag,pos
0,তিনি,12,pronoun
1,যুবক,12,adverb
2,হিসেবে,12,adverb
3,শেফিল্ড,2,UNK
4,বুধবার,8,adverb


In [126]:
with open('/kaggle/input/bangla-ner/word_index.pkl', 'rb') as handle:
    word_index = pickle.load(handle)

In [127]:
vocab_size = len(set(word_index.keys())) + 1

In [128]:
#vocab_index = {word:i+1 for i, word in enumerate(vocab_list)}

In [129]:
def encode_text(token):
    if token in word_index:
        return word_index[token]
    else:
        return 1

In [130]:
train_dataset['entity'] = train_dataset['entity'].apply(encode_text)
valid_dataset['entity'] = valid_dataset['entity'].apply(encode_text)

In [131]:
train_dataset.fillna(0, inplace=True)
valid_dataset.fillna(0, inplace=True)

In [132]:
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [133]:
embedding_dim = 100

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=1))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=[f1])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1, 100)            2635600   
_________________________________________________________________
flatten_3 (Flatten)          (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 10)                1010      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 11        
Total params: 2,636,621
Trainable params: 2,636,621
Non-trainable params: 0
_________________________________________________________________


In [134]:
# history = model.fit(train_dataset['entity'].values, train_dataset['tag'].values,
#                     epochs=10,
#                     verbose=True,
#                     validation_data=(valid_dataset['entity'].values, valid_dataset['tag'].values),
#                     batch_size=10)


In [135]:
from tensorflow import keras

In [136]:
model = keras.models.load_model('/kaggle/input/bangla-ner/sequential_model.h5')

ValueError: Unknown metric function: f1. Please ensure this object is passed to the `custom_objects` argument. See https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object for details.

In [None]:
#model.load('/kaggle/input/bangla-ner/sequential_model.h5')

In [None]:
non_alpha = set('''````£|¢|Ñ+-*/=EROero৳•।!()-[]{};:'"“\’,<>./?@#$%^&*_~‘—॥”‰🤣⚽️✌�￰৷￰''')

In [None]:
def clean_entity(text):
    found_nonalphas = set()
    if text:
        ans = ''
        for ent in text:
            if ent not in non_alpha:
                ans += ent
        return ans
    else:
        return np.nan

In [None]:
test_data[0] = test_data[0].astype(str)

In [None]:
test_data[0] = test_data[0].apply(clean_entity)

In [None]:
# loss, accuracy = model.evaluate(train_dataset['entity'].values, train_dataset['tag'].values, verbose=False)
# print("Training Accuracy: {:.4f}".format(accuracy))
# loss, accuracy = model.evaluate(train_dataset['entity'].values, train_dataset['tag'].values, verbose=False)
# print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
#model.save('/kaggle/working/sequential_model.h5')

In [None]:
test_data = pd.read_csv('/kaggle/input/bangla-ner/test.txt', sep=" ", header=None, dtype='str', skip_blank_lines=False)

In [None]:
pred = model.predict(test_data[0].values)