In [2]:
from flair.embeddings import FlairEmbeddings
from flair.data import Sentence

# init embedding
flair_embedding_forward = FlairEmbeddings('news-forward-fast')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
flair_embedding_forward.embed(sentence)

[Sentence: "The grass is green ."   [− Tokens: 5]]

In [7]:
for token in sentence:
    print(token)
    print(token.embedding)
    print(token.embedding.shape)
    print(token.embedding.size()[0])

Token: 1 The
tensor([ 2.1388e-03, -1.0227e-06, -5.7348e-03,  ..., -1.6456e-09,
        -7.8441e-05,  1.6318e-02], device='cuda:0')
torch.Size([1024])
1024
Token: 2 grass
tensor([-8.7855e-04, -4.2676e-05,  2.4843e-02,  ..., -1.9470e-06,
         6.3773e-04,  5.7139e-03], device='cuda:0')
torch.Size([1024])
1024
Token: 3 is
tensor([ 1.8402e-03, -2.0841e-04,  2.9775e-02,  ..., -9.3616e-07,
         1.6768e-05,  2.9047e-04], device='cuda:0')
torch.Size([1024])
1024
Token: 4 green
tensor([-4.1963e-04, -1.5563e-05,  4.5961e-03,  ..., -4.5412e-08,
        -1.1528e-04,  3.4503e-02], device='cuda:0')
torch.Size([1024])
1024
Token: 5 .
tensor([ 8.2723e-04, -2.9691e-06,  4.9718e-03,  ..., -1.0784e-08,
        -4.4279e-06,  2.0673e-03], device='cuda:0')
torch.Size([1024])
1024


In [8]:
import pandas as pd
import numpy as np 

In [9]:
data = pd.read_csv('../Data/ner_dataset.csv',encoding='latin1')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [10]:
data.drop(columns=['POS'],inplace=True)
data = data.fillna(method='ffill')

In [11]:
class sentence_getter():
    def __init__(self,data):
        agg_function = lambda s:[(w,t) for w,t in zip(s['Word'].values.tolist(),
                                                      s['Tag'].values.tolist())]
        self.grouped_data = data.groupby('Sentence #').apply(agg_function)
        self.sentences = [s for s in self.grouped_data]

In [12]:
getter = sentence_getter(data)
sentences = getter.sentences
sentences[2]

[('Helicopter', 'O'),
 ('gunships', 'O'),
 ('Saturday', 'B-tim'),
 ('pounded', 'O'),
 ('militant', 'O'),
 ('hideouts', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('Orakzai', 'B-geo'),
 ('tribal', 'O'),
 ('region', 'O'),
 (',', 'O'),
 ('where', 'O'),
 ('many', 'O'),
 ('Taliban', 'B-org'),
 ('militants', 'O'),
 ('are', 'O'),
 ('believed', 'O'),
 ('to', 'O'),
 ('have', 'O'),
 ('fled', 'O'),
 ('to', 'O'),
 ('avoid', 'O'),
 ('an', 'O'),
 ('earlier', 'O'),
 ('military', 'O'),
 ('offensive', 'O'),
 ('in', 'O'),
 ('nearby', 'O'),
 ('South', 'B-geo'),
 ('Waziristan', 'I-geo'),
 ('.', 'O')]

In [14]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words)
word_index = {w : i + 1 for i ,w in enumerate(words)}

In [16]:
tags = list(set(data["Tag"].values))
num_tags = len(tags)
tag_index =  {t : i for i ,t in enumerate(tags)}

In [17]:
tag_index

{'I-per': 0,
 'B-eve': 1,
 'I-art': 2,
 'B-org': 3,
 'B-gpe': 4,
 'O': 5,
 'I-org': 6,
 'B-art': 7,
 'B-nat': 8,
 'I-eve': 9,
 'B-per': 10,
 'B-tim': 11,
 'I-tim': 12,
 'I-geo': 13,
 'B-geo': 14,
 'I-nat': 15,
 'I-gpe': 16}

In [19]:
import tensorflow as tf

In [20]:
X = [[word_index[w[0]] for w in s] for s in sentences]
X = tf.keras.preprocessing.sequence.pad_sequences(maxlen=50,sequences=X,padding='post', value = n_words - 1)

In [24]:
max_len = 50

In [27]:
X_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(X,
maxlen=max_len,
value = n_words - 1,
padding='post')

In [30]:
vocabulary_size = len(word_index)+1
EMBEDDING_DIM = 1024

embedding_matrix =np.zeros((vocabulary_size,EMBEDDING_DIM))

for word,i in word_index.items():
    try:
        word_embedding = Sentence(word)
        flair_embedding_forward.embed(word_embedding)
        embedding_vector = word_embedding[0].embedding.cpu().detach().numpy()
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

In [32]:
y = [[tag_index[w[1]] for w in s] for s in sentences]
y = tf.keras.preprocessing.sequence.pad_sequences(maxlen=50,sequences=y,padding='post',value=tag_index['O'])
y = [tf.keras.utils.to_categorical(i,num_classes=num_tags) for i in y]

In [None]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_sequences_padded,y,test_size = 0.1,random_state = 42)

In [None]:
embedding_layer = tf.keras.layers.Embedding(vocabulary_size,EMBEDDING_DIM,weights = [embedding_matrix],trainable=False)

In [None]:

model = tf.keras.Sequential([
    # Embedding Layer 
    embedding_layer,
    # Bidiretional LSTM for learning Long term dependencies
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim,return_sequences=True)),
    # Dense Layer with RELU
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(embedding_dim,activation='relu')),
    # Ouput layer with 16 units
    tf.keras.layers.Dense(num_tags,activation='softmax')
])

model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping