https://www.kaggle.com/williamroe/bi-lstm-with-crf-for-ner

https://www.depends-on-the-definition.com/sequence-tagging-lstm-crf/

https://towardsdatascience.com/named-entity-recognition-ner-meeting-industrys-requirement-by-applying-state-of-the-art-deep-698d2b3b4ede

https://confusedcoders.com/data-science/deep-learning/how-to-build-deep-neural-network-for-custom-ner-with-keras

https://www.kaggle.com/nikkisharma536/ner-with-bilstm-and-crf

In [None]:
!python --version

import warnings
warnings.filterwarnings('ignore')

import csv 

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
import string
from wordcloud import WordCloud

sns.set(font_scale = 1)
%matplotlib inline
plt.style.use('ggplot')

from IPython.core.pylabtools import figsize

import nltk, re, string, collections
from collections import Counter

from nltk import word_tokenize 
from nltk.corpus import stopwords
nltk.download('stopwords')

from math import nan

from future.utils import iteritems

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

!pip install git+https://www.github.com/keras-team/keras-contrib.git
from keras_contrib.layers import CRF
    
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k

from keras.callbacks import ModelCheckpoint

!pip install seqeval

from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

! pip install sklearn_crfsuite

from  sklearn_crfsuite.metrics import flat_classification_report  

from keras.models import load_model
from keras_contrib.utils import save_load_utils

In [None]:
# # reading dump file
# ner_rows = []
# sentence_id = 0

# with open("../input/nercorpus/NER-corpus.DUMP", "r") as ner_file:
    
#     for line in ner_file:
#         label = line.split('\t')[0]
#         tags = line.split('\t')[1].split()
#         words = line.split('\t')[2].split('\n')[0].split()
#         for tag, word in zip(tags, words):
#             ner_rows.append([sentence_id, label, word.lower(), tag])
        
#         sentence_id+=1

# # dump to csv for appropriate format
# fields = ['sentence_id', 'label', 'word', 'tag']
# filename = "tbmm_ner.csv"

# with open(filename, 'w') as csvfile:
#     csvwriter = csv.writer(csvfile)
#     csvwriter.writerow(fields)
#     csvwriter.writerows(ner_rows)

In [None]:
# Load the token-level NER corpus and forward-fill missing cells so that every
# row carries the most recent non-missing value of each column.
csv_path = "../input/tbmm-ner/tbmm_ner.csv"
ner_df = (
    pd.read_csv(csv_path, encoding="utf-8", error_bad_lines=False)
    .fillna(method="ffill")
)

# Quick look at the column types and the first rows.
ner_df.info()
ner_df.head(10)

# > **Data Observation**

In [None]:
# Keep only tokens longer than one character that consist solely of the
# characters listed in the pattern, lowercased.
# The pattern is compiled once instead of being looked up for every token.
token_pattern = re.compile("^[a-zA-Z0-9ğüşöçİĞÜŞÖÇ]+$")
ner_words = [w.lower() for w in ner_df['word'] if token_pattern.search(w) and len(w) > 1]

# Drop Turkish stopwords. The stopword list is loaded once into a set; the
# original re-evaluated stopwords.words('turkish') for every single token.
turkish_stopwords = set(stopwords.words('turkish'))
ner_words = [w for w in ner_words if w not in turkish_stopwords]

In [None]:
# `c` holds the 500 most frequent (word, count) pairs; `most_common` keeps
# just the words, in the same frequency order.
c = Counter(ner_words).most_common(500)
most_common = [word for word, _count in c]

In [None]:
# Render the most frequent words as a word cloud.
cloud_text = ' '.join(most_common)
wordcloud = WordCloud(
    background_color="white",
    width=1000,
    height=600,
    max_font_size=80,
).generate(cloud_text)

plt.figure(figsize=(40, 10), facecolor='k')
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
ner_df['tag'].value_counts()

In [None]:
# Bar chart of how often each tag occurs in the corpus.
figsize(20, 10)
# Pass the series explicitly as `x`: a positional argument is not reliably
# interpreted as the x variable across seaborn versions.
sns.countplot(x=ner_df['tag'], palette="colorblind");
plt.xlabel('Tags');

# > **Data Formatting**

In [None]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    The frame must provide the columns ``sentence_id``, ``label``, ``word``
    and ``tag``. Rows sharing a ``sentence_id`` form one sentence, stored as a
    list of ``(label, word, tag)`` tuples.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, dataset):
        # 1-based counter used by get_next() to build the next lookup key.
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        # Series indexed by sentence_id; each value is a list of triples.
        self.grouped = self.dataset.groupby("sentence_id").apply(self._to_triples)
        # The same sentences as a plain list, in group order.
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _to_triples(group):
        """Return the rows of one sentence as (label, word, tag) tuples."""
        return list(zip(group["label"].values.tolist(),
                        group["word"].values.tolist(),
                        group["tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence, or None when its key does not exist.

        Sentences are looked up under the key ``"Sentence: <n>"``, so this
        only finds something when ``sentence_id`` values use that format;
        with any other id scheme every call returns None.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except (KeyError, IndexError):
            # Only a failed lookup means "no more sentences"; any other error
            # is a real problem and must not be silently swallowed.
            return None
        self.n_sent += 1
        return s
           
# Group the corpus once; `sentences` is a list with one entry per sentence_id,
# each entry being a list of (label, word, tag) tuples.
getter = SentenceGetter(ner_df)
sentences = getter.sentences

In [None]:
# Build the label / word / tag vocabularies.
#
# The entries are sorted so that the index assigned to every item is the same
# in every Python session. Iterating a plain set of strings yields a different
# order from run to run, which makes indices incompatible with weights saved
# by an earlier run. The number of entries is unchanged.

# Float values (pandas represents a missing cell as a float NaN) become 'unk'.
labels = sorted(
    ('unk' if isinstance(label, float) else label
     for label in set(ner_df["label"].values)),
    key=str,
)
n_labels = len(labels)

# "unk" is appended last on purpose: its index (n_words - 1) is the value
# used to pad word sequences.
words = sorted(set(ner_df["word"].values), key=str)
words.append("unk")
n_words = len(words)

tags = sorted(
    ('unk' if isinstance(tag, float) else tag
     for tag in set(ner_df["tag"].values)),
    key=str,
)
n_tags = len(tags)

In [None]:
# Sentence lengths are computed once and reused for both the maximum and the plot.
sentence_lengths = [len(sentence) for sentence in sentences]

maxlen = max(sentence_lengths)
print ('Maximum sentence length:', maxlen)

print ('The histogram of the lengths of sentences')
plt.hist(sentence_lengths, bins=50)
plt.show()

In [None]:
# Forward maps: item -> position in its vocabulary list.
label2idx = dict(zip(labels, range(len(labels))))
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))

# Reverse maps, derived from the forward maps: position -> item.
idx2label = {index: label for label, index in label2idx.items()}
idx2word = {index: word for word, index in word2idx.items()}
idx2tag = {index: tag for tag, index in tag2idx.items()}

# > **Data Splitting as Train and Test Sets**

In [None]:
# Each sentence entry is a (label, word, tag) tuple.
# Words -> vocabulary indices, padded at the end with the last index ("unk").
X = pad_sequences(
    sequences=[[word2idx[token[1]] for token in sentence] for sentence in sentences],
    maxlen=maxlen,
    padding="post",
    value=n_words - 1,
)

# Tags -> tag indices, padded at the end with the "O" tag, then one-hot encoded.
y = pad_sequences(
    sequences=[[tag2idx[token[2]] for token in sentence] for sentence in sentences],
    maxlen=maxlen,
    padding="post",
    value=tag2idx["O"],
)
y = [to_categorical(tag_row, num_classes=n_tags) for tag_row in y]

# Hold out 30% of the sentences for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
np.array(X).shape
# print(sentences[50])
# print(y[50])

In [None]:
# Sanity check: show the third encoded sentence of each split
# (padded word indices for X, one-hot tag rows for y).
print("X_train", X_train[2])
print("X_test", X_test[2])

print("y_train", list(y_train[2]))
print("y_test", y_test[2])

# > **BI-LSTM and CRF Model & Training of Model**

In [None]:
# The sequence length is taken from `maxlen`, the length the data was padded
# to, instead of a hard-coded 50, so the network always matches X.
# NOTE: the name `input` shadows the builtin; it is kept unchanged so that any
# other cell referring to it keeps working.
input = Input(shape=(maxlen,))
word_embedding_size = 150

# Embedding Layer: one trainable vector per vocabulary index
model = Embedding(input_dim=n_words, output_dim=word_embedding_size, input_length=maxlen)(input)

# BI-LSTM Layer, followed by a second (unidirectional) LSTM whose width equals
# the concatenated forward/backward output of the first
model = Bidirectional(LSTM(units=word_embedding_size,
                           return_sequences=True,
                           dropout=0.5,
                           recurrent_dropout=0.5,
                           kernel_initializer=k.initializers.he_normal()))(model)
model = LSTM(units=word_embedding_size * 2,
             return_sequences=True,
             dropout=0.5,
             recurrent_dropout=0.5,
             kernel_initializer=k.initializers.he_normal())(model)

# TimeDistributed Layer: per-token projection to one score per tag
model = TimeDistributed(Dense(n_tags, activation="relu"))(model)

# CRF Layer: `crf` is kept as a separate name because compiling the model
# needs its loss_function and accuracy
crf = CRF(n_tags)
out = crf(model)

model = Model(input, out)

In [None]:
# Optimiser: Adam with an explicit learning rate and moment decay rates
adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

# Compile model: the loss comes from the CRF layer itself; both the CRF
# accuracy and the generic 'accuracy' metric are tracked
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])

model.summary()

# Saving the best model only: a checkpoint is written when the monitored
# 'val_accuracy' improves (mode='max'); the value is embedded in the file name
filepath="ner-bi-lstm-td-model-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# Fit the model; validation_split=0.1 asks Keras to validate on 10% of the
# training data. `history` is used afterwards to plot the learning curves.
history = model.fit(X_train, np.array(y_train), batch_size=256, epochs=20, validation_split=0.1, verbose=1, callbacks=callbacks_list)

# > **Evaluation of the Results**

In [None]:
# Plot the graph 
plt.style.use('ggplot')

def plot_history(history):
    """Plot training vs. validation accuracy and loss side by side.

    `history` is the object returned by `model.fit`; its `history` dict must
    contain the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    metrics = history.history
    # Read every series up front so a missing key fails before anything is drawn.
    train_acc = metrics['accuracy']
    val_acc = metrics['val_accuracy']
    train_loss = metrics['loss']
    val_loss = metrics['val_loss']
    epochs = range(1, len(train_acc) + 1)

    # (training series, validation series, training label, validation label, title)
    panels = (
        (train_acc, val_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, val_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train, val, train_label, val_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train, 'b', label=train_label)
        plt.plot(epochs, val, 'r', label=val_label)
        plt.title(title)
        plt.legend()

plot_history(history)

In [None]:
def pred2label(pred, index_to_tag=None):
    """Convert per-token tag scores into tag names.

    Parameters
    ----------
    pred : iterable of sentences, each an iterable of per-token score vectors
        (model output or one-hot ground truth). The position of the largest
        score is taken as the tag index.
    index_to_tag : mapping from tag index to tag name, optional.
        Defaults to the module-level `idx2tag`.

    Returns
    -------
    list of lists of tag names, one inner list per sentence and exactly one
    tag per token.

    The previous version appended three values per token (a label, a word and
    a tag, all looked up with the *tag* index), which produced sequences three
    times too long filled with unrelated strings.
    """
    if index_to_tag is None:
        index_to_tag = idx2tag

    out = []
    for pred_i in pred:
        out.append([index_to_tag[int(np.argmax(p))] for p in pred_i])
    return out

# Predict on the held-out set, then decode both the model output and the
# one-hot ground truth with the same helper so they can be compared.
test_pred = model.predict(X_test, verbose=1)   
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test)

In [None]:
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

In [None]:
# Per-tag confusion counters, all starting at zero:
# true positives, true negatives, false positives, false negatives.
TP = dict.fromkeys(tag2idx, 0)
TN = dict.fromkeys(tag2idx, 0)
FP = dict.fromkeys(tag2idx, 0)
FN = dict.fromkeys(tag2idx, 0)

def accumulate_score_by_tag(gt, pred):
    """Update the per-tag confusion counters for one token.

    Every tag is scored one-vs-rest against the module-level TP / TN / FP / FN
    dicts:

    * the ground-truth tag gets a TP when predicted correctly, else an FN;
    * a wrongly predicted tag gets an FP;
    * every tag that is neither the ground truth nor the prediction gets a TN.

    The previous version credited false positives to the ground-truth tag
    instead of the predicted one and counted a confusion between two entity
    tags as a true negative.

    Parameters
    ----------
    gt : ground-truth tag name.
    pred : predicted tag name.

    Raises
    ------
    KeyError
        If `gt` or `pred` is not one of the tracked tags.
    """
    for name in (gt, pred):
        if name not in TP:
            raise KeyError(name)

    for tag in TP:
        if tag == gt:
            if tag == pred:
                TP[tag] += 1
            else:
                FN[tag] += 1
        elif tag == pred:
            FP[tag] += 1
        else:
            TN[tag] += 1

In [None]:
# One small 2x2 confusion table per tag: (row template, left counter, right counter).
confusion_rows = (
    ('\t TN:{:10}\tFP:{:10}', TN, FP),
    ('\t FN:{:10}\tTP:{:10}', FN, TP),
)
for tag in tag2idx:
    print(f'tag:{tag}')
    for template, left_counts, right_counts in confusion_rows:
        print(template.format(left_counts[tag], right_counts[tag]))

In [None]:
print(classification_report(test_labels, pred_labels))

In [None]:
# Inspect a single test sentence: print each word with its true and predicted tag.
i = 48
# predict() expects a batch, so the sentence is wrapped in a one-element array
p = model.predict(np.array([X_test[i]]))
# per-token index of the highest score = predicted tag index
p = np.argmax(p, axis=-1)
# y_test rows are one-hot, so argmax recovers the true tag index
gt = np.argmax(y_test[i], axis=-1)
print("{:14}: {:5}: {}".format("Word\t", "True\t", "Pred\t"))
for idx, (w,pred) in enumerate(zip(X_test[i],p[0])):
    # skip "unk" entries; word sequences are padded with the index of "unk"
    if words[w] != "unk":
        print("{:14}: {:5}: \t{}".format(words[w],idx2tag[gt[idx]],tags[pred]))

# > **Demo Screen**

In [None]:
def readFile(fileName, encoding=None):
    """Read a text file and split it into lowercased words tagged by sentence.

    The text is split into sentences on '.', and each sentence into words on
    whitespace. The whole file content is also echoed to stdout.

    Parameters
    ----------
    fileName : path of the text file to read.
    encoding : text encoding passed to `open`, optional. The default `None`
        keeps the platform default.

    Returns
    -------
    list of `[sentence_id, word]` pairs, where `sentence_id` is the 0-based
    position of the sentence in the file. Empty segments (e.g. after a
    trailing '.') still consume an id but contribute no words.
    """
    # The context manager guarantees the file is closed; the original wrote
    # `file.close` without parentheses, so the handle was never released.
    with open(fileName, "r", encoding=encoding) as file:
        text = file.read()

    print("\nYour file --> \t", text)

    tagged_words = []
    for sentence_id, sentence in enumerate(text.split('.')):
        for token in sentence.split():
            tagged_words.append([sentence_id, token.lower()])

    return tagged_words

In [None]:
def load_model(weights_path='../input/ner-bi-lstm-crf-model/ner-bi-lstm-td-model-0.97.hdf5',
               sequence_length=50):
    """Rebuild the BiLSTM-CRF network and load trained weights into it.

    The architecture must match the one the weights were trained with, and
    the module-level `n_words` and `n_tags` must still hold the training
    vocabulary sizes when this is called.

    NOTE: this definition rebinds the name `load_model` that is imported from
    `keras.models` at the top of the file.

    Parameters
    ----------
    weights_path : path of the weights file to load. Defaults to the
        previously hard-coded checkpoint.
    sequence_length : number of tokens per input sequence. Defaults to the
        previously hard-coded 50.

    Returns
    -------
    The Keras model with the weights loaded.
    """
    word_embedding_size = 150

    # `inputs` rather than `input`, to avoid shadowing the builtin
    inputs = Input(shape=(sequence_length,))

    # Embedding Layer
    network = Embedding(input_dim=n_words,
                        output_dim=word_embedding_size,
                        input_length=sequence_length)(inputs)

    # BI-LSTM Layer, followed by a second (unidirectional) LSTM
    network = Bidirectional(LSTM(units=word_embedding_size,
                                 return_sequences=True,
                                 dropout=0.5,
                                 recurrent_dropout=0.5,
                                 kernel_initializer=k.initializers.he_normal()))(network)
    network = LSTM(units=word_embedding_size * 2,
                   return_sequences=True,
                   dropout=0.5,
                   recurrent_dropout=0.5,
                   kernel_initializer=k.initializers.he_normal())(network)

    # TimeDistributed Layer
    network = TimeDistributed(Dense(n_tags, activation="relu"))(network)

    # CRF Layer
    crf = CRF(n_tags)
    out = crf(network)

    model = Model(inputs, out)

    model.load_weights(weights_path)

    return model

In [None]:
# Loading saved model
model = load_model()

# Read demo file. The result is kept under its own name so the training
# vocabulary (`words`, `n_words`) and `maxlen` are not overwritten.
demo_words = readFile("../input/tbmmcorpusdonem2027/NLP/donem20/yıl4/11.txt")
print("\n\nAfter formating --> \t", demo_words[0])

# readFile returns flat [sentence_id, word] pairs; regroup them into one word
# list per sentence, because pad_sequences needs a list of sequences.
demo_sentences = {}
for sentence_id, word in demo_words:
    demo_sentences.setdefault(sentence_id, []).append(word)

# Words that are not in the training vocabulary map to "unk", which is also
# the padding index used for the training data.
unk_idx = word2idx["unk"]
# Pad to the length the loaded network expects.
sequence_length = model.input_shape[1]

X = [[word2idx.get(word, unk_idx) for word in sentence]
     for sentence in demo_sentences.values()]
X = pad_sequences(maxlen=sequence_length, sequences=X, padding="post", value=unk_idx)

print("\n\nAfter word to idx --> \t", X[0])

# i = 48
# p = model.predict(np.array([X[i]]))
# p = np.argmax(p, axis=-1)
# gt = np.argmax(y_test[i], axis=-1)
# print("{:14}: {:5}: {}".format("Word\t", "True\t", "Pred\t"))
# for idx, (w,pred) in enumerate(zip(X_test[i],p[0])):
#     if words[w] != "unk":
#         print("{:14}: {:5}: \t{}".format(words[w],idx2tag[gt[idx]],tags[pred]))