# Updates from last version  

* 27/07 - Initial version  

## Import required libraries

In [1]:
%env PYTHONHASHSEED=0

env: PYTHONHASHSEED=0


In [2]:
import pandas as pd
import collections

import numpy as np
import pickle
import re
import string
#from sklearn.preprocessing import OneHotEncoder



## All configurations

In [3]:
w2v_model_path = '../models/glove.840B.300d/'
moldel_outout_path = '../models_improved2/'
training_iob_tagged_file = '../data/CoNLL-2003/eng.train'
validation_iob_tagged_file = '../data/CoNLL-2003/eng.testa'
test_iob_tagged_file = '../data/CoNLL-2003/eng.testb'
save_best_weights = '../models_improved2/ner/bi_lstm_crf_improved_weights.h5'
save_end_weights = '../models_improved2/ner/bi_lstm_crf_improved_last_epoch_weights.h5'

## Read embedding

In [4]:
# Load GloVe's embeddings
embeddings_index = {}
with open(w2v_model_path + '/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

Word embeddings: 2196016


## Read input files

In [5]:
# delemeter is set to blank and keep blank files
train_iob_tagged_df = pd.read_csv(training_iob_tagged_file,delimiter=' ',skip_blank_lines=False, 
                                  header = None, names = ['word','pos','chunk','tag'])
train_iob_tagged_df.head(10)

Unnamed: 0,word,pos,chunk,tag
0,-DOCSTART-,-X-,O,O
1,,,,
2,EU,NNP,I-NP,I-ORG
3,rejects,VBZ,I-VP,O
4,German,JJ,I-NP,I-MISC
5,call,NN,I-NP,O
6,to,TO,I-VP,O
7,boycott,VB,I-VP,O
8,British,JJ,I-NP,I-MISC
9,lamb,NN,I-NP,O


In [6]:
train_iob_tagged_df.isna().sum()

word     14990
pos      14987
chunk    14987
tag      17165
dtype: int64

In [7]:
# data pre_processing function
def pre_process_data_ner(iob_tagged_df):
    # get the ids for blank rows
    sent_idx = iob_tagged_df.index[iob_tagged_df.isnull().all(axis=1)].tolist()
    #sent_idx[0:5]

    # create sentence number list
    sentence_num_list = [-1] # only for first row
    e_last = 0
    i_last = 0
    for i, (s, e) in enumerate(zip(sent_idx[:-1],sent_idx[1:])):
        sentence_num_list.extend([i for _ in range(s,e)])
        e_last = e
        i_last = i

    # last few rows after last empty row
    sentence_num_list.extend([(i_last + 1) for _ in range(e_last, iob_tagged_df.shape[0])])
    #len(sentence_num_list),iob_tagged_df.shape

    # add sentence number to data frame
    iob_tagged_df['sentence_num'] = sentence_num_list
    #iob_tagged_df.head()

    # get the ids for -DOCSTART- rows
    doc_idx = iob_tagged_df.index[iob_tagged_df['word']=='-DOCSTART-'].tolist()
    doc_idx[0:5]

    # create document number list
    doc_num_list = [] 
    for i, (s, e) in enumerate(zip(doc_idx[:-1],doc_idx[1:])):
        doc_num_list.extend([i for _ in range(s,e)])

    # last few rows after last empty row
    doc_num_list.extend([(i + 1) for _ in range(e, iob_tagged_df.shape[0])])
    #len(doc_num_list),iob_tagged_df.shape

    # add document number to data frame
    iob_tagged_df['doc_num'] = doc_num_list
    #iob_tagged_df.tail()

    # delete all blank and doc start rows
    delete_rows = sent_idx + doc_idx
    print(len(sent_idx), len(doc_idx))
    iob_tagged_df_cleaned = iob_tagged_df.drop(iob_tagged_df.index[delete_rows])
    iob_tagged_df_cleaned.tail()

    print(iob_tagged_df_cleaned.isna().sum())
    print(iob_tagged_df_cleaned.info())

    # remove rows with null values in any of the word column
    iob_tagged_df_cleaned['word'] = iob_tagged_df_cleaned['word'].replace(' ', np.nan)
    iob_tagged_df_cleaned = iob_tagged_df_cleaned.dropna(axis=0, subset=['word'])
    
    return iob_tagged_df_cleaned

In [8]:
train_iob_tagged_df_cleaned = pre_process_data_ner(train_iob_tagged_df)
train_iob_tagged_df_cleaned.isna().sum()

14987 946
word               3
pos                0
chunk              0
tag             2178
sentence_num       0
doc_num            0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 203621 entries, 2 to 219552
Data columns (total 6 columns):
word            203618 non-null object
pos             203621 non-null object
chunk           203621 non-null object
tag             201443 non-null object
sentence_num    203621 non-null int64
doc_num         203621 non-null int64
dtypes: int64(2), object(4)
memory usage: 10.9+ MB
None


word            0
pos             0
chunk           0
tag             0
sentence_num    0
doc_num         0
dtype: int64

In [9]:
train_iob_tagged_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201440 entries, 2 to 219552
Data columns (total 6 columns):
word            201440 non-null object
pos             201440 non-null object
chunk           201440 non-null object
tag             201440 non-null object
sentence_num    201440 non-null int64
doc_num         201440 non-null int64
dtypes: int64(2), object(4)
memory usage: 10.8+ MB


In [10]:
train_tag_counts = collections.Counter(train_iob_tagged_df_cleaned["tag"])
train_tag_counts

Counter({'I-ORG': 10001,
         'O': 167397,
         'I-MISC': 4556,
         'I-PER': 11128,
         'I-LOC': 8286,
         'B-LOC': 11,
         'B-MISC': 37,
         'B-ORG': 24})

In [11]:
len(train_iob_tagged_df_cleaned.doc_num.unique()), \
len(train_iob_tagged_df_cleaned.sentence_num.unique()), \
train_iob_tagged_df_cleaned.shape[0]

(946, 14039, 201440)

## IOB to IOB2

In [12]:
# function to convert IOB1 to IOB2 format for NER and chunk tags
def iob_to_iob2(iob_tagged_df, convert_tag = 'tag', sentence_id = 'sentence_num'):
    prev_sentence = -1
    iob2_tag = []
    for _, row in iob_tagged_df.iterrows():
        cur_sentence = row[sentence_id]
        if cur_sentence != prev_sentence:
            prev_tag_ent = 'O'
            prev_chunk ='O'

        cur_tag = row[convert_tag]
        if cur_tag == 'O':
            iob2_tag.append('O')
            cur_tag_ent = 'O'
        else:
            cur_tag_ent = cur_tag.split('-')[1]
            if prev_tag_ent != cur_tag_ent:
                iob2_tag.append('B-'+ cur_tag_ent)
            else:
                iob2_tag.append(cur_tag)

        prev_tag_ent = cur_tag_ent
        prev_sentence = cur_sentence
    return iob2_tag


    

In [13]:
train_iob_tagged_df_cleaned["iob2_tag"] = iob_to_iob2(train_iob_tagged_df_cleaned)
train_iob_tagged_df_cleaned["iob2_chunk"] = iob_to_iob2(train_iob_tagged_df_cleaned,'chunk')
train_iob_tagged_df_cleaned.head()

Unnamed: 0,word,pos,chunk,tag,sentence_num,doc_num,iob2_tag,iob2_chunk
2,EU,NNP,I-NP,I-ORG,0,0,B-ORG,B-NP
3,rejects,VBZ,I-VP,O,0,0,O,B-VP
4,German,JJ,I-NP,I-MISC,0,0,B-MISC,B-NP
5,call,NN,I-NP,O,0,0,O,I-NP
6,to,TO,I-VP,O,0,0,O,B-VP


In [14]:
train_iob_tagged_df_cleaned.drop(['chunk','tag'], axis = 1, inplace = True)
train_iob_tagged_df_cleaned.rename(columns = {'iob2_tag':'tag',
                                              'iob2_chunk':'chunk'}, inplace = True)

## Reduce sentences

In [15]:
train_iob_tagged_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201440 entries, 2 to 219552
Data columns (total 6 columns):
word            201440 non-null object
pos             201440 non-null object
sentence_num    201440 non-null int64
doc_num         201440 non-null int64
tag             201440 non-null object
chunk           201440 non-null object
dtypes: int64(2), object(4)
memory usage: 10.8+ MB


In [16]:
# statistics on words in sentences
sent_word_stat = train_iob_tagged_df_cleaned.groupby('sentence_num').size().reset_index()
sent_word_stat.loc[:,0].describe()

count    14039.000000
mean        14.348600
std         11.448242
min          1.000000
25%          6.000000
50%         10.000000
75%         22.000000
max        113.000000
Name: 0, dtype: float64

In [17]:
def reduce_sentence_length_by_word(df, max_word, sentence_id_col_name):
    sent_word_stat = df.groupby(sentence_id_col_name).size().reset_index()
    # keep sentences that less than or equal to (appproximately) 3rd quartile of number of words
    thirdq_list_sentence_id = list(sent_word_stat.loc[sent_word_stat[0] <= max_word,sentence_id_col_name])
    df_reduced = df.loc[df.sentence_num.isin(thirdq_list_sentence_id),:]
    return df_reduced

In [18]:
# # keep sentences that less than or equal to (appproximately) 3rd quartile of number of words
# thirdq_list_sentence_id = list(sent_word_stat.loc[sent_word_stat[0] <= 25,'sentence_num'])
# len(thirdq_list_sentence_id)

In [19]:
# train_iob_tagged_df_cleaned_reduced = train_iob_tagged_df_cleaned.loc[train_iob_tagged_df_cleaned.sentence_num.\
#                                                                   isin(thirdq_list_sentence_id),:]
# train_iob_tagged_df_cleaned_reduced.shape

In [20]:
max_words = 25
train_iob_tagged_df_cleaned_reduced = reduce_sentence_length_by_word(train_iob_tagged_df_cleaned,
                                                                     max_words,'sentence_num')
train_iob_tagged_df_cleaned_reduced.shape

(108975, 6)

Let's reduce the number pf sentences based on tag density

In [21]:
# sent_tag_stat_raw = train_iob_tagged_df_cleaned_reduced.copy()
# sent_tag_stat_raw['tag_type'] = sent_tag_stat_raw['tag'].apply(lambda x: x[:2] if x != 'O' else 'O')
# sent_tag_stat_raw.head()

In [22]:
# # keep onlly the words having B- tags
# sent_tag_stat_raw = sent_tag_stat_raw.loc[sent_tag_stat_raw.tag_type == 'B-',:]
# # statistics on tags in sentences
# sent_tag_stat = sent_tag_stat_raw.groupby('sentence_num').size().reset_index()
# sent_tag_stat.loc[:,0].describe()

In [23]:
# # arrange sentences by number of tags
# sent_tag_stat = sent_tag_stat.sort_values([0],ascending = False).reset_index()
# sent_tag_stat.head()

In [24]:
# # keep 90% of the sentences based on number of tags
# selected_sentences_by_tagno = sent_tag_stat.iloc[range(round(len(sent_tag_stat)*0.9)),:]['sentence_num']
# len(selected_sentences_by_tagno)

In [25]:
#train_iob_tagged_df_cleaned_reduced = train_iob_tagged_df_cleaned_reduced.loc[train_iob_tagged_df_cleaned_reduced.sentence_num.\
#                                                                   isin(selected_sentences_by_tagno),:]
# train_iob_tagged_df_cleaned_reduced.shape

In [26]:
train_tag_counts = collections.Counter(train_iob_tagged_df_cleaned_reduced["tag"])
train_tag_counts

Counter({'B-ORG': 4484,
         'O': 87243,
         'B-MISC': 1716,
         'B-PER': 4283,
         'I-PER': 2998,
         'B-LOC': 4789,
         'I-ORG': 2168,
         'I-LOC': 590,
         'I-MISC': 704})

In [27]:
tags = list(set(train_iob_tagged_df_cleaned_reduced['tag'].values))
n_tags = len(tags)
tags

['I-MISC', 'B-PER', 'B-MISC', 'I-ORG', 'B-LOC', 'I-LOC', 'O', 'B-ORG', 'I-PER']

In [28]:
n_tags

9

In [29]:
len(train_iob_tagged_df_cleaned_reduced.doc_num.unique()), \
len(train_iob_tagged_df_cleaned_reduced.sentence_num.unique()), \
train_iob_tagged_df_cleaned_reduced.shape[0]

(946, 11289, 108975)

## Limit Vocab

In [30]:
# reduction of Vocab wil be based on original data set, nt reduced
word_counts = collections.Counter(train_iob_tagged_df_cleaned["word"])

In [31]:
# Limit the vocab that we will use to words that appear ≥ threshold or are in GloVe

# Define threshold
threshold = 10

#dictionary to convert words to integers
vocab_to_int = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

In [32]:
# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>"]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total Number of Unique Words:", len(word_counts))
print("Number of Words we will use:", len(vocab_to_int))
print("Percent of Words we will use: {}%".format(usage_ratio))

Total Number of Unique Words: 23621
Number of Words we will use: 21399
Percent of Words we will use: 90.59%


In [33]:
with open(moldel_outout_path + '/ner_bilstm_crf_improved_train_vocab2int.pkl','wb') as _f:
    pickle.dump(vocab_to_int,_f,protocol = pickle.HIGHEST_PROTOCOL)

In [34]:
# Need to use 300 for embedding dimensions to match GloVe's vectors.
embedding_dim = 300

random_embd_count = 0
nb_words = len(vocab_to_int)
# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim))
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in GloVe, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding
        random_embd_count += 1

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix), random_embd_count)

21399 16


## Prepare for NER training

In [35]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.impute import SimpleImputer
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import make_pipeline

Using TensorFlow backend.


In [36]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, c, t) for w, p, c, t in zip(s["word"].values.tolist(),
                                                           s["pos"].values.tolist(),
                                                            s["chunk"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence_num").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [37]:
# using sentences from reduced data set
train_getter = SentenceGetter(train_iob_tagged_df_cleaned_reduced)
train_sentences = train_getter.sentences

In [38]:
labels = [[s[3] for s in sent] for sent in train_sentences]
sentences = [" ".join([str(s[0]) for s in sent]) for sent in train_sentences]
sentences[0]

'EU rejects German call to boycott British lamb .'

In [39]:
len(sentences[0])

48

In [40]:
print(labels[0])

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [41]:
# Find the length of sentences
lengths = [len(sent.split()) for sent in sentences]

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])

In [42]:
lengths.counts.describe()

count    11289.000000
mean         9.653202
std          6.370189
min          1.000000
25%          5.000000
50%          8.000000
75%         13.000000
max         25.000000
Name: counts, dtype: float64

In [43]:
max_len = 25
tag2idx = {t: i for i, t in enumerate(tags)}
tag2idx

{'I-MISC': 0,
 'B-PER': 1,
 'B-MISC': 2,
 'I-ORG': 3,
 'B-LOC': 4,
 'I-LOC': 5,
 'O': 6,
 'B-ORG': 7,
 'I-PER': 8}

In [44]:
tags

['I-MISC', 'B-PER', 'B-MISC', 'I-ORG', 'B-LOC', 'I-LOC', 'O', 'B-ORG', 'I-PER']

In [45]:
with open(moldel_outout_path + '/ner_bilstm_crf_improved_tag2idx.pkl','wb') as _f:
    pickle.dump(tag2idx,_f,protocol = pickle.HIGHEST_PROTOCOL)

In [46]:
idx2tag = {i: w for w, i in tag2idx.items()}
idx2tag

{0: 'I-MISC',
 1: 'B-PER',
 2: 'B-MISC',
 3: 'I-ORG',
 4: 'B-LOC',
 5: 'I-LOC',
 6: 'O',
 7: 'B-ORG',
 8: 'I-PER'}

In [47]:
with open(moldel_outout_path + '/ner_bilstm_crf_improved_idx2tag.pkl','wb') as _f:
    pickle.dump(idx2tag,_f,protocol = pickle.HIGHEST_PROTOCOL)

In [48]:
X = [[vocab_to_int[w[0]] if w[0] in vocab_to_int.keys() else vocab_to_int['<UNK>'] for w in s]\
                                                            for s in train_sentences]
X = pad_sequences(maxlen = max_len, sequences = X, padding  = 'post', value = vocab_to_int['<PAD>'])
len(X[2])

25

In [49]:
y = [[tag2idx[w[3]] for w in s] for s in train_sentences]
y = pad_sequences(maxlen = max_len, sequences = y, padding  = 'post', value = tag2idx['O'])

In [50]:
y = [to_categorical(i, num_classes = n_tags) for i in y]

In [51]:
X_train, X_val, y_train, y_val, train_idx, val_idx = train_test_split(X, y, range(len(X)), test_size = 0.1)

In [52]:
X_train.shape,len(y_train)

((10160, 25), 10160)

# NER Model setup

In [53]:
from keras import initializers
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, SpatialDropout1D, Bidirectional, concatenate,\
                            BatchNormalization
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras_contrib.layers import CRF
from seqeval.callbacks import F1Metrics

## Hyper parameters

In [54]:
dropout = 0.1
weights = initializers.TruncatedNormal(mean = 0.0, stddev = 0.1, seed = 2)
num_batch_size = 64
num_epochs = 20

### Metric Function

In [55]:
import keras.backend as K
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

## Model

In [56]:
import numpy as np
import tensorflow as tf
import random as python_random

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.
np.random.seed(123)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.
python_random.seed(123)

# The below set_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/random/set_seed
tf.set_random_seed(1234)

In [57]:
main_input = Input(shape=(max_len,),name = 'main_input')
embed_layer = Embedding(input_dim=nb_words, output_dim=embedding_dim, input_length=max_len,
                 mask_zero = True,
                 weights = [word_embedding_matrix])(main_input)
#drpout1 = SpatialDropout1D(0.1)(embed_layer)
bi_lstm_layer = Bidirectional(LSTM(units=50, return_sequences=True,
                                   kernel_initializer = weights,
                                   recurrent_dropout=0.1))(embed_layer)
dense_layer = TimeDistributed(Dense(50, activation="relu"))(bi_lstm_layer)

# aux_input = Input(shape = (max_len,aux_train_input.shape[2],), name = 'aux_inputs')
# merge_out = concatenate([dense_layer,aux_input])
# #norm_out = BatchNormalization()(merge_out)

crf = CRF(n_tags, learn_mode='marginal')  # CRF layer
final_out = crf(dense_layer)  # output

model = Model(inputs = main_input, outputs = final_out)
model.compile(optimizer="adam", #"rmsprop",
              loss=crf.loss_function,
              metrics=[crf.accuracy, get_f1])

model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 25)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 25, 300)           6419700   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 25, 100)           140400    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 25, 50)            5050      
_________________________________________________________________
crf_1 (CRF)                  (None, 25, 9)             558       
Total params: 6,565,708
Trainable params: 6,565,708
Non-trainable params: 0
_________________________________________________________________


## Training

In [58]:
callbacks = [ModelCheckpoint(save_best_weights, monitor = 'val_loss', save_best_only = True),
            EarlyStopping(monitor = 'val_loss', patience = 5, verbose = 1, mode = 'auto'),
            ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, verbose = 1, patience = 3)]
            #F1Metrics(idx2tag)]

history = model.fit(X_train,
                    np.array(y_train),
                    batch_size = num_batch_size, 
                    epochs = num_epochs,
                    validation_split=0.1, 
                    verbose = True,
                    shuffle = True,
                   callbacks = callbacks)

Train on 9144 samples, validate on 1016 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 9/20
Epoch 10/20
Epoch 00010: early stopping


In [59]:
model.save(save_end_weights)

## Validation

In [60]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [61]:
val_pred = model.predict(X_val, verbose=1)



In [62]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
val_pred_labels = pred2label(val_pred)
val_labels = pred2label(y_val)

In [63]:
print("F1-score: {:.1%}".format(f1_score(val_labels, val_pred_labels)))
print(classification_report(val_labels, val_pred_labels))

F1-score: 89.8%
           precision    recall  f1-score   support

      ORG       0.85      0.88      0.87       410
      LOC       0.95      0.94      0.94       451
      PER       0.91      0.93      0.92       406
     MISC       0.79      0.82      0.81       171

micro avg       0.89      0.90      0.90      1438
macro avg       0.89      0.90      0.90      1438



## Hold out dataset - A

In [64]:
validate_iob_tag_df = pd.read_csv(validation_iob_tagged_file,delimiter=' ',skip_blank_lines=False, 
                                  header = None, names = ['word','pos','chunk','tag'])
validate_iob_tag_df.head()

Unnamed: 0,word,pos,chunk,tag
0,-DOCSTART-,-X-,O,O
1,,,,
2,CRICKET,NNP,I-NP,O
3,-,:,O,O
4,LEICESTERSHIRE,NNP,I-NP,I-ORG


In [65]:
validate_iob_tagged_df_cleaned = pre_process_data_ner(validate_iob_tag_df)
validate_iob_tagged_df_cleaned.isna().sum()

3467 216
word              3
pos               0
chunk             0
tag             640
sentence_num      0
doc_num           0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51362 entries, 2 to 55042
Data columns (total 6 columns):
word            51359 non-null object
pos             51362 non-null object
chunk           51362 non-null object
tag             50722 non-null object
sentence_num    51362 non-null int64
doc_num         51362 non-null int64
dtypes: int64(2), object(4)
memory usage: 2.7+ MB
None


word            0
pos             0
chunk           0
tag             0
sentence_num    0
doc_num         0
dtype: int64

In [66]:
len(validate_iob_tagged_df_cleaned.doc_num.unique()), \
len(validate_iob_tagged_df_cleaned.sentence_num.unique()), \
validate_iob_tagged_df_cleaned.shape[0]

(216, 3250, 50719)

In [67]:
validate_tag_counts = collections.Counter(validate_iob_tagged_df_cleaned["tag"])
validate_tag_counts

Counter({'O': 42120,
         'I-ORG': 2092,
         'I-LOC': 2094,
         'I-MISC': 1264,
         'I-PER': 3145,
         'B-MISC': 4})

In [68]:
validate_iob_tagged_df_cleaned["iob2_tag"] = iob_to_iob2(validate_iob_tagged_df_cleaned)
validate_iob_tagged_df_cleaned["iob2_chunk"] = iob_to_iob2(validate_iob_tagged_df_cleaned,'chunk')
validate_iob_tagged_df_cleaned.head()

Unnamed: 0,word,pos,chunk,tag,sentence_num,doc_num,iob2_tag,iob2_chunk
2,CRICKET,NNP,I-NP,O,0,0,O,B-NP
3,-,:,O,O,0,0,O,O
4,LEICESTERSHIRE,NNP,I-NP,I-ORG,0,0,B-ORG,B-NP
5,TAKE,NNP,I-NP,O,0,0,O,I-NP
6,OVER,IN,I-PP,O,0,0,O,B-PP


In [69]:
validate_iob_tagged_df_cleaned.drop(['chunk','tag'], axis = 1, inplace = True)
validate_iob_tagged_df_cleaned.rename(columns = {'iob2_tag':'tag',
                                              'iob2_chunk':'chunk'}, inplace = True)

In [70]:
validate_iob_tagged_df_cleaned_reduced = reduce_sentence_length_by_word(validate_iob_tagged_df_cleaned,
                                                                     max_words,'sentence_num')
validate_iob_tagged_df_cleaned_reduced.shape

(24988, 6)

In [71]:
len(validate_iob_tagged_df_cleaned_reduced.doc_num.unique()), \
len(validate_iob_tagged_df_cleaned_reduced.sentence_num.unique()), \
validate_iob_tagged_df_cleaned_reduced.shape[0]

(216, 2505, 24988)

In [72]:
validate_tag_counts = collections.Counter(validate_iob_tagged_df_cleaned_reduced["tag"])
validate_tag_counts

Counter({'O': 20121,
         'B-ORG': 839,
         'B-LOC': 1180,
         'B-PER': 1008,
         'I-PER': 675,
         'B-MISC': 460,
         'I-MISC': 193,
         'I-LOC': 156,
         'I-ORG': 356})

In [73]:
validate_getter = SentenceGetter(validate_iob_tagged_df_cleaned_reduced)
validate_sentences = validate_getter.sentences

In [74]:
X_validate = [[vocab_to_int[w[0]] if w[0] in vocab_to_int.keys() else vocab_to_int['<UNK>'] for w in s]\
                                                            for s in validate_sentences]
X_validate = pad_sequences(maxlen = max_len, sequences = X_validate, padding  = 'post', value = vocab_to_int['<PAD>'])
len(X_validate[0])

25

In [75]:
y_validate = [[tag2idx[w[3]] for w in s] for s in validate_sentences]
y_validate = pad_sequences(maxlen = max_len, sequences = y_validate, padding  = 'post', value = tag2idx['O'])

In [76]:
y_validate = [to_categorical(i, num_classes = n_tags) for i in y_validate]

In [77]:
validate_pred = model.predict(X_validate, verbose=1)



In [78]:
validate_pred_labels = pred2label(validate_pred)
validate_labels = pred2label(y_validate)

In [79]:
print("F1-score: {:.1%}".format(f1_score(validate_labels, validate_pred_labels)))
print(classification_report(validate_labels, validate_pred_labels))

F1-score: 82.5%
           precision    recall  f1-score   support

      ORG       0.74      0.80      0.77       839
      PER       0.80      0.84      0.82      1008
     MISC       0.74      0.80      0.77       460
      LOC       0.94      0.85      0.90      1180

micro avg       0.82      0.83      0.83      3487
macro avg       0.83      0.83      0.83      3487



## Hold out dataset - B

In [80]:
test_iob_tag_df = pd.read_csv(test_iob_tagged_file,delimiter=' ',skip_blank_lines=False, 
                                  header = None, names = ['word','pos','chunk','tag'])
test_iob_tag_df.head()

Unnamed: 0,word,pos,chunk,tag
0,-DOCSTART-,-X-,-X-,O
1,,,,
2,SOCCER,NN,I-NP,O
3,-,:,O,O
4,JAPAN,NNP,I-NP,I-LOC


In [81]:
test_iob_tagged_df_cleaned = pre_process_data_ner(test_iob_tag_df)
test_iob_tagged_df_cleaned.isna().sum()

3685 231
word              0
pos               0
chunk             0
tag             421
sentence_num      0
doc_num           0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 46435 entries, 2 to 50348
Data columns (total 6 columns):
word            46435 non-null object
pos             46435 non-null object
chunk           46435 non-null object
tag             46014 non-null object
sentence_num    46435 non-null int64
doc_num         46435 non-null int64
dtypes: int64(2), object(4)
memory usage: 2.5+ MB
None


word            0
pos             0
chunk           0
tag             0
sentence_num    0
doc_num         0
dtype: int64

In [82]:
len(test_iob_tagged_df_cleaned.doc_num.unique()), \
len(test_iob_tagged_df_cleaned.sentence_num.unique()), \
test_iob_tagged_df_cleaned.shape[0]

(231, 3453, 46014)

In [83]:
test_tag_counts = collections.Counter(test_iob_tagged_df_cleaned["tag"])
test_tag_counts

Counter({'O': 37902,
         'I-LOC': 1919,
         'I-PER': 2773,
         'I-MISC': 909,
         'I-ORG': 2491,
         'B-ORG': 5,
         'B-MISC': 9,
         'B-LOC': 6})

In [84]:
test_iob_tagged_df_cleaned["iob2_tag"] = iob_to_iob2(test_iob_tagged_df_cleaned)
test_iob_tagged_df_cleaned["iob2_chunk"] = iob_to_iob2(test_iob_tagged_df_cleaned,'chunk')
test_iob_tagged_df_cleaned.head()

Unnamed: 0,word,pos,chunk,tag,sentence_num,doc_num,iob2_tag,iob2_chunk
2,SOCCER,NN,I-NP,O,0,0,O,B-NP
3,-,:,O,O,0,0,O,O
4,JAPAN,NNP,I-NP,I-LOC,0,0,B-LOC,B-NP
5,GET,VB,I-VP,O,0,0,O,B-VP
6,LUCKY,NNP,I-NP,O,0,0,O,B-NP


In [85]:
test_iob_tagged_df_cleaned.drop(['chunk','tag'], axis = 1, inplace = True)
test_iob_tagged_df_cleaned.rename(columns = {'iob2_tag':'tag',
                                              'iob2_chunk':'chunk'}, inplace = True)

In [86]:
test_iob_tagged_df_cleaned_reduced = reduce_sentence_length_by_word(test_iob_tagged_df_cleaned,
                                                                     max_words,'sentence_num')
test_iob_tagged_df_cleaned_reduced.shape

(26456, 6)

In [87]:
len(test_iob_tagged_df_cleaned_reduced.doc_num.unique()), \
len(test_iob_tagged_df_cleaned_reduced.sentence_num.unique()), \
test_iob_tagged_df_cleaned_reduced.shape[0]

(231, 2884, 26456)

In [88]:
test_tag_counts = collections.Counter(test_iob_tagged_df_cleaned_reduced["tag"])
test_tag_counts

Counter({'O': 21261,
         'B-LOC': 1149,
         'B-PER': 1001,
         'I-PER': 688,
         'I-LOC': 170,
         'B-MISC': 396,
         'I-MISC': 137,
         'B-ORG': 1184,
         'I-ORG': 470})

In [89]:
test_getter = SentenceGetter(test_iob_tagged_df_cleaned_reduced)
test_sentences = test_getter.sentences

In [90]:
X_test = [[vocab_to_int[w[0]] if w[0] in vocab_to_int.keys() else vocab_to_int['<UNK>'] for w in s]\
                                                            for s in test_sentences]
X_test = pad_sequences(maxlen = max_len, sequences = X_test, padding  = 'post', value = vocab_to_int['<PAD>'])
len(X_test[0])

25

In [91]:
y_test = [[tag2idx[w[3]] for w in s] for s in test_sentences]
y_test = pad_sequences(maxlen = max_len, sequences = y_test, padding  = 'post', value = tag2idx['O'])

In [92]:
y_test = [to_categorical(i, num_classes = n_tags) for i in y_test]

In [93]:
test_pred = model.predict(X_test, verbose=1)



In [94]:
test_pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test)

In [95]:
print("F1-score: {:.1%}".format(f1_score(test_labels, test_pred_labels)))
print(classification_report(test_labels, test_pred_labels))

F1-score: 75.8%
           precision    recall  f1-score   support

      ORG       0.73      0.79      0.76      1184
      PER       0.77      0.71      0.74      1001
      LOC       0.88      0.82      0.85      1149
     MISC       0.53      0.64      0.58       396

micro avg       0.75      0.76      0.76      3730
macro avg       0.76      0.76      0.76      3730

