In [1]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from nerutils import NERTextPrepper

np.random.seed(777)

In [2]:
# Read the raw NER token table directly from the zipped CSV.
# The file is decoded as latin1.
data = pd.read_csv(
    "../data/ner_dataset.zip",
    encoding="latin1",
)

In [3]:
# Count the missing values in each column of the raw table and print the tally.
missing_per_column = data.isna().sum()
print(missing_per_column)

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64


In [4]:
# Convert the 'Word' and 'Tag' columns
# to numeric representation and create other
# data we will need for the model.
# The resulting `ner` object is what the later cells query for the vocabulary
# (num_words, num_tags, word2idx, tag2idx), the model inputs (X, y) and the
# fit / evaluate / model entry points.
# NOTE(review): NERTextPrepper appears to modify `data` in place -- the null
# count printed before this cell reports 1,000,616 empty 'Sentence #' values,
# yet data.head() after it shows that column filled in. Confirm in nerutils.
ner = NERTextPrepper(data)

In [5]:
# Show the first eight token rows of the table as it stands after the prepper has run.
data.head(n=8)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,london,NNP,B-geo
7,Sentence: 1,to,TO,O


In [6]:
# Vocabulary size reported by the prepper; the same value is printed again,
# with a label, by the following cell.
ner.num_words

31817

In [7]:
# Report how many distinct words and distinct tags the prepper found,
# one labelled line per statistic.
for label, count in (
    ("Unique Words in corpus:", ner.num_words),
    ("Unique Tag in corpus:", ner.num_tags),
):
    print(label, count)

Unique Words in corpus: 31817
Unique Tag in corpus: 13


In [8]:
# Export the word -> index and tag -> index mappings as two-column CSV files
# so the deployed service can rebuild the same encodings.
# The default row index is written as well, exactly as before.
word_lookup = pd.DataFrame({'words': ner.word2idx.keys(), 'values': ner.word2idx.values()})
word_lookup.to_csv('word2dict2.csv')

tag_lookup = pd.DataFrame({'tag': ner.tag2idx.keys(), 'values': ner.tag2idx.values()})
tag_lookup.to_csv('tag2dict2.csv')

In [9]:
# Hold out 10% of ner.X / ner.y for evaluation. random_state pins the shuffle
# so the same split is produced on every run.
# train_test_split is imported with the other dependencies in the first cell.
x_train, x_test, y_train, y_test = train_test_split(
    ner.X,
    ner.y,
    test_size=0.1,
    random_state=1,
)

In [10]:
# Seed TensorFlow's global RNG before training. The NumPy seed set in the
# first cell does not reach TensorFlow's own generator, so without this the
# weight initialisation, dropout and batch shuffling differ between runs.
# NOTE(review): assumes the network is built inside ner.fit (its summary is
# printed by this cell); if NERTextPrepper builds it earlier, move this seed
# above the NERTextPrepper(data) call.
tf.random.set_seed(777)

# Train on the 90% split; the value returned by fit is kept in `history`.
history = ner.fit(x_train, y_train)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 50)            1590850   
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 50)           0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 50, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 50, 13)           2613      
 ibuted)                                                         
                                                             

In [11]:
# Score the trained tagger on the held-out 10% split. The recorded output is
# the list of tag names followed by a per-tag precision / recall / F1 table.
# NOTE(review): in the recorded run the 'O' tag accounts for 223,746 of the
# 239,800 scored tokens, so the 0.98 accuracy is dominated by it; the macro
# averages (0.67 recall, 0.71 F1) say more about the entity tags.
# The support of 239,800 is 4,796 x 50, so padded positions are presumably
# scored too -- confirm in nerutils.
ner.evaluate(x_test, y_test)

['I-org', 'B-tim', 'B-geo', 'I-tim', 'O', 'B-misc', 'I-gpe', 'B-gpe', 'B-org', 'I-misc', 'I-per', 'I-geo', 'B-per']
              precision    recall  f1-score   support

       I-org       0.73      0.63      0.68      1629
       B-tim       0.91      0.88      0.89      2114
       B-geo       0.84      0.88      0.86      3765
       I-tim       0.82      0.71      0.76       684
           O       0.99      1.00      0.99    223746
      B-misc       0.54      0.14      0.22        96
       I-gpe       0.93      0.52      0.67        25
       B-gpe       0.95      0.94      0.95      1604
       B-org       0.75      0.63      0.69      1948
      I-misc       0.25      0.02      0.04        83
       I-per       0.84      0.86      0.85      1695
       I-geo       0.77      0.75      0.76       760
       B-per       0.83      0.81      0.82      1651

    accuracy                           0.98    239800
   macro avg       0.78      0.67      0.71    239800
weighted avg      

In [12]:
# Persist the trained model under the 'ner-lstm-no-trailing-punct-misc'
# directory (relative to the notebook's working directory); the log line
# below shows the assets being written there.
ner.model.save('ner-lstm-no-trailing-punct-misc')

INFO:tensorflow:Assets written to: ner-lstm-no-trailing-punct-misc/assets
