In [1]:
import pandas as pd
import numpy as np
np.random.seed(777)
import re
from sklearn.metrics import classification_report

import tensorflow as tf
from nerutils import NERTextPrepper

In [2]:
# Load the zipped NER corpus into a DataFrame. The archive is read directly
# (pandas infers the compression from the ".zip" suffix) and the text is
# decoded as latin1.
data = pd.read_csv(
    "../data/ner_dataset.zip",
    encoding="latin1",
)

In [3]:
# Count the missing entries in every column of the freshly loaded frame.
missing_per_column = data.isna().sum()
print(missing_per_column)

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64


In [4]:
# Seed TensorFlow's global RNG with the same value used for NumPy in the
# import cell. np.random.seed() does not control TensorFlow's own random ops
# (e.g. weight initialisation and dropout), so without this a
# Restart & Run All does not reproduce the same trained model.
# NOTE(review): this notebook does not show whether the Keras model is built
# inside the NERTextPrepper constructor or later in ner.fit(); seeding here,
# before the object is created, covers both cases.
tf.random.set_seed(777)

# Convert the 'Word' and 'Tag' columns to a numeric representation and
# create the other data needed for the model. Later cells read
# ner.num_words, ner.num_tags, ner.word2idx, ner.tag2idx, ner.X, ner.y and
# ner.model from this object.
# NOTE(review): the null check above reports 1,000,616 missing 'Sentence #'
# values, yet data.head(8) run after this cell shows that column filled in,
# so this call presumably modifies `data` in place — confirm in nerutils.
ner = NERTextPrepper(data)

In [5]:
# Preview the first 8 token rows (one row per word with its POS and NER tag).
# This cell runs after NERTextPrepper(data), so it shows the frame as that
# call left it.
data.head(8)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,london,NNP,B-geo
7,Sentence: 1,to,TO,O


In [6]:
# Number of unique words reported by the prepper (printed again, together
# with the tag count, in the following cell).
ner.num_words

31820

In [7]:
# Report the size of the word vocabulary and of the tag set held by `ner`.
corpus_stats = (
    ("Unique Words in corpus:", ner.num_words),
    ("Unique Tag in corpus:", ner.num_tags),
)
for caption, total in corpus_stats:
    print(caption, total)

Unique Words in corpus: 31820
Unique Tag in corpus: 13


In [8]:
# Persist the word -> index and tag -> index lookups so deployment code can
# rebuild the same encodings. Each mapping becomes a two-column table; to_csv
# is called with its defaults, so the row index is also written as an unnamed
# leading column.
word_table = pd.DataFrame({
    'words': list(ner.word2idx.keys()),
    'values': list(ner.word2idx.values()),
})
word_table.to_csv('word2dict2.csv')

tag_table = pd.DataFrame({
    'tag': list(ner.tag2idx.keys()),
    'values': list(ner.tag2idx.values()),
})
tag_table.to_csv('tag2dict2.csv')

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the prepared sequences for evaluation. The split uses its
# own fixed seed (random_state=1), independent of the NumPy seed set in the
# import cell.
x_train, x_test, y_train, y_test = train_test_split(
    ner.X,
    ner.y,
    test_size=0.1,
    random_state=1,
)

In [10]:
# Train on the training split and keep whatever ner.fit returns in `history`
# (presumably a Keras History object — confirm in nerutils).
# NOTE(review): the recorded output of this cell prints the model summary and
# then ends in KeyboardInterrupt, i.e. the stored run was stopped manually.
history = ner.fit(x_train, y_train)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 50)            1591000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 50)           0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 50, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 50, 13)           2613      
 ibuted)                                                         
                                                             

KeyboardInterrupt: 

In [None]:
# Score the model on the held-out split. The recorded output lists the 13 tag
# names followed by a per-tag precision/recall/F1 table.
# Reading the recorded numbers: 'O' accounts for 223,746 of the 239,800
# scored positions, so the 0.97 accuracy is dominated by that class (macro
# F1 is 0.44), and I-gpe, B-misc and I-misc score 0.00 on every metric.
# NOTE(review): 239,800 is a multiple of the sequence length 50 shown in the
# model summary, so padding positions are presumably counted — likely as
# 'O'; confirm in nerutils.
# NOTE(review): this cell has no execution count (In [None]) and the training
# cell's output ends in KeyboardInterrupt, so this report may come from an
# earlier session.
ner.evaluate(x_test, y_test)

['I-gpe', 'I-geo', 'I-per', 'O', 'B-geo', 'B-per', 'B-org', 'B-tim', 'I-org', 'B-misc', 'I-tim', 'I-misc', 'B-gpe']
              precision    recall  f1-score   support

       I-gpe       0.00      0.00      0.00        25
       I-geo       0.87      0.19      0.32       760
       I-per       0.66      0.65      0.65      1695
           O       0.98      1.00      0.99    223746
       B-geo       0.71      0.80      0.75      3765
       B-per       0.66      0.59      0.62      1651
       B-org       0.64      0.31      0.42      1948
       B-tim       0.90      0.67      0.77      2114
       I-org       0.37      0.31      0.33      1629
      B-misc       0.00      0.00      0.00        96
       I-tim       0.68      0.05      0.09       684
      I-misc       0.00      0.00      0.00        83
       B-gpe       0.93      0.76      0.84      1604

    accuracy                           0.97    239800
   macro avg       0.57      0.41      0.44    239800
weighted avg      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Persist the model held at ner.model to a directory relative to the current
# working directory; the recorded output shows TensorFlow writing assets
# under that directory.
# NOTE(review): the training cell's recorded output ends in
# KeyboardInterrupt — confirm the saved model comes from a completed run.
ner.model.save('ner-lstm-no-trailing-punct-misc')

INFO:tensorflow:Assets written to: ner-lstm-no-trailing-punct-misc/assets
