In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras_contrib.layers.crf import CRF, crf_loss
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
from glove import Corpus, Glove
import glob
import numpy as np
import pandas as pd
import random
import pickle
import operator
#import spacy #spacy multilanguage
#nlp = spacy.load('xx')
from spacy.lang.xx import MultiLanguage
nlp = MultiLanguage() #pre-trained model NER

Using TensorFlow backend.


ModuleNotFoundError: No module named 'glove'

In [3]:
def readline(filename):
    """
    Read a comma-separated token/label file and group its rows into sentences.

    Every data row contributes ``[first_field, last_field]`` (token and label).
    A row with fewer than two characters, or one starting with ``-DOCSTART-``,
    closes the sentence collected so far.

    Args:
        filename (str): Path of the UTF-8 encoded file to read.

    Returns:
        list: One list per sentence, each holding ``[token, label]`` pairs, e.g.
        ``[['segelas', 'TYPES'], ['douni', 'PRODUCT'], ['parfum', 'PRODUCT']]``.
    """
    sentences = []
    sentence = []
    with open(filename, encoding='utf-8') as f:
        for raw_line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would eat a real character whenever the final
            # line of the file has no trailing newline.
            line = raw_line.rstrip("\n")
            if len(line) < 2 or line.startswith("-DOCSTART-"):
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            splits = line.split(",")
            sentence.append([splits[0], splits[-1]])

    # Flush the last sentence when the file does not end with a separator row.
    if sentence:
        sentences.append(sentence)

    return sentences

In [4]:
def createMatrices(sentences, word2Idx, label2Idx, char2Idx):
    """
    Turn every sentence into index lists for its words, characters and labels.
    Unlike a usual NER pipeline no casing feature is produced, because the
    input coming from ASR is case insensitive.

    Input:
        sentences(list)  : Sentences made of (word, chars, label) entries.
        word2Idx(dict)   : Word -> index; must contain "UNK" and "PAD".
        label2Idx(dict)  : Label -> index.
        char2Idx(dict)   : Character -> index; "UNK" is used for unseen characters.

    Output:
        dataset(list)    : One [word_indices, char_indices, label_indices]
                           entry per sentence.
    """
    unknown_word = word2Idx["UNK"]
    word2Idx["PAD"]  # value unused; the lookup still fails fast when "PAD" is missing

    dataset = []
    for sentence in sentences:
        word_ids, char_ids, label_ids = [], [], []
        for word, chars, label in sentence:
            # Prefer the exact surface form, then the lower-cased form, then UNK.
            key = word if word in word2Idx else word.lower()
            word_id = word2Idx.get(key, unknown_word)

            encoded_chars = [
                char2Idx[c] if c in char2Idx else char2Idx["UNK"]
                for c in chars
            ]

            word_ids.append(word_id)
            char_ids.append(encoded_chars)
            label_ids.append(label2Idx[label])

        dataset.append([word_ids, char_ids, label_ids])

    return dataset

In [5]:
def addCharInformation(sentences):
    """
    Insert the character list of each token into the dataset, in place.

    Every entry ``[token, label]`` becomes ``[token, chars, label]`` where
    ``chars`` is the token split into characters,
        ex: "Pagi" -> ["P","a","g","i"]
    Input:
        sentences(list)   : The dataset (modified in place)
    Output:
        The same ``sentences`` object.
    """
    for sentence in sentences:
        for position in range(len(sentence)):
            entry = sentence[position]
            token = entry[0]
            label = entry[1]
            sentence[position] = [token, list(token), label]
    return sentences

In [6]:
def padding(sentences, maxwordlength=10):
    """
    Pad the character indices of every word to a fixed length, in place.

    Sentences are not padded because batches are built from sentences of equal
    length; only the per-word character lists are. Zeros are inserted in FRONT
    of short words (padding='pre') and long words lose their trailing
    characters (truncating='post'):
        ex: [[1, 2, 3]] -> [[0, 0, 0, 0, 0, 0, 0, 1, 2, 3]]

    Input:
        sentences(list)    : Dataset entries whose element [1] is the list of
                             per-word character index lists.
        maxwordlength(int) : Number of character positions kept per word
                             (default 10).
    Output:
        The same ``sentences`` object, with element [1] of every entry replaced
        by the result of ``pad_sequences``.
    """
    for sample in sentences:
        sample[1] = pad_sequences(sample[1], maxlen=maxwordlength,
                                  padding='pre', truncating='post')
    return sentences

In [7]:
def createBatches(data):
    """
    Reorder the dataset so that samples with equally long sentences are adjacent.

    Groups are emitted in ascending sentence length, which makes the result
    deterministic instead of depending on set iteration order. Inside a group
    the original sample order is kept. The dataset is scanned once.

    Input:
        data(list)   : The dataset; element [0] of every sample is its word list.
    Output:
        batches(list)   : The samples, grouped by sentence length.
        batch_len(list) : Cumulative end offset of each group inside ``batches``.
    """
    groups = {}
    for sample in data:
        groups.setdefault(len(sample[0]), []).append(sample)

    batches = []
    batch_len = []
    for length in sorted(groups):
        batches.extend(groups[length])
        batch_len.append(len(batches))
    return batches, batch_len

In [8]:
def iterate_minibatches(dataset,batch_len):
    """
    Generator that slices the dataset into the batches used during training.

    Input:
        dataset(list)   : Samples of the form [tokens, chars, labels]
        batch_len(list) : End offset of every batch; each batch starts where
                          the previous one ended
    Output:
        Tuples (labels, tokens, char) of numpy arrays; labels get an extra
        trailing axis. Empty slices (e.g. a repeated offset) are skipped
        instead of being yielded as empty arrays.
    """
    start = 0
    for end in batch_len:
        data = dataset[start:end]
        start = end
        if len(data) == 0:
            # Nothing between the two offsets: there is no batch to train on.
            continue
        tokens = []
        char = []
        labels = []
        for dt in data:
            t,ch,l = dt
            tokens.append(t)
            char.append(ch)
            labels.append(np.expand_dims(l, -1))
        yield np.asarray(labels), np.asarray(tokens),np.asarray(char)

In [9]:
## Predict data
def tag_dataset(dataset):
    """
    Predict the label sequence of every sample, one sample at a time.

    Relies on the notebook-level ``model`` and on ``Progbar`` being defined
    before the function is called.

    Input:
        dataset(list) : Samples of the form [tokens, chars, labels]
    Output:
        predLabels(list)    : Per sample, the arg-max class of every token
        correctLabels(list) : Per sample, the reference labels as given
    """
    correctLabels = []
    predLabels = []
    b = Progbar(len(dataset))
    for i, data in enumerate(dataset):
        tokens, char, labels = data
        # The model expects a batch axis, so wrap the single sample.
        tokens = np.asarray([tokens])
        char = np.asarray([char])

        pred = model.predict([tokens, char], verbose = False)[0]
        pred = pred.argmax(axis = -1) #Predict the classes
        correctLabels.append(labels)
        predLabels.append(pred)
        # Report the number of finished samples; updating inside the loop also
        # avoids touching an unbound ``i`` when the dataset is empty.
        b.update(i + 1)

    return predLabels, correctLabels

# 1. Open the dataset file and convert each word into a list of characters

In [10]:
# Read the token/label file into sentences, then add the character list of
# every token (addCharInformation rewrites the entries in place).
trainSentences = readline("../data/clean/dataset.csv")
#trainSentences = readline("../data/clean/data_full_product.csv")
trainSentences = addCharInformation(trainSentences)

# 2. Collect the labels from the dataset and map each one to an index

In [11]:
# Collect every distinct label and every lower-cased token of the training data.
label_set = set()
words = {}
for sentence in trainSentences:
    for token, char, label in sentence:
        words[token.lower()] = True
        label_set.add(label)

# Labels are numbered in sorted order; idx2label is the inverse mapping.
label2idx = dict(zip(sorted(label_set), range(len(label_set))))
idx2label = {index: name for name, index in label2idx.items()}

# 3. GloVe Word Embedding

In [12]:
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project.
# The context manager closes the file handle once the object has been read.
with open("../data/pickle_file/word2idx_glove.pkl","rb") as pickle_word2idx_glove:
    word2idx_glove = pickle.load(pickle_word2idx_glove)
# The pickled object is iterated as (word, index) pairs.
word2Idx = {i[0]:i[1] for i in word2idx_glove}

In [13]:
# Load the GloVe vectors and extend both the vocabulary and the embedding
# matrix with the two special tokens PAD and UNK.
embedding_file_path = '../model/glove_embedding.txt'

glove = Glove()
glove = glove.load(embedding_file_path)
wordEmbeddings = glove.word_vectors
n_dim = glove.word_vectors.shape[1]

# Order matters: each special token receives the next free index, and its
# vector is appended as the next row, so index and row stay aligned.
# NOTE(review): assumes word2Idx has exactly one entry per existing row of
# wordEmbeddings — confirm against how the pickle was produced.
# PAD -> all-zero vector.
word2Idx.update({'PAD': len(word2Idx)})
wordEmbeddings = np.concatenate((wordEmbeddings, np.zeros((1, n_dim))))
# UNK -> random vector drawn from [-0.25e-8, 0.25e-8); no seed is set, so it
# differs between runs.
word2Idx.update({'UNK': len(word2Idx)})
wordEmbeddings = np.concatenate((wordEmbeddings, np.random.uniform(-0.25*10**-8, 0.25*10**-8,n_dim).reshape((1,n_dim))))

In [14]:
# Sanity check: shape of the first five embedding rows (rows, embedding size).
wordEmbeddings[:5].shape

(5, 150)

# 4. Build the list of characters and their indices

In [15]:
# Character vocabulary: 0 and 1 are reserved for PAD and UNK, every listed
# character then receives the next index in the order it appears.
char2Idx = dict(PAD=0, UNK=1)
for position, c in enumerate(" 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|", start=2):
    char2Idx[c] = position

# 5. Create Dataset

In [16]:
# Convert every sentence into [word_indices, char_indices, label_indices].
train_set = createMatrices(trainSentences, 
               word2Idx = word2Idx, 
               label2Idx = label2idx,
               char2Idx = char2Idx)

In [17]:
# Pad the character indices of every word to a fixed length; padding() changes
# train_set in place and returns the same object.
train_set = padding(train_set)

In [18]:
# 80/20 split. The seed is fixed so the partition, and therefore the score
# reported further down, is the same on every Restart & Run All.
SPLIT_SEED = 42
X_train, X_test = train_test_split(train_set, train_size = 0.8, random_state = SPLIT_SEED)

In [19]:
# Group the samples by sentence length; *_batch_len holds the cumulative end
# offset of every length group inside the reordered list.
train_batch, train_batch_len = createBatches(X_train)
test_batch, test_batch_len = createBatches(X_test)

In [20]:
# Show the cumulative group boundaries of the train and test sets.
print(train_batch_len)
print(test_batch_len)

[594, 32611, 54776, 80868, 138532, 202246, 241493, 253787, 257868, 260895]
[150, 8133, 13740, 20284, 34739, 50606, 60395, 63476, 64479, 65224]


In [21]:
batch_size = 256


def _split_boundaries(group_ends, step):
    """
    Return the sorted slice end offsets used to cut a dataset into batches.

    The result holds every multiple of ``step`` below the last group end plus
    all group ends themselves, so no batch spans two sentence-length groups.
    Offsets are de-duplicated: a group end that is also a multiple of ``step``
    appears once and therefore cannot produce an empty batch. An empty
    ``group_ends`` gives an empty list.
    """
    if not group_ends:
        return []
    boundaries = set(range(step, max(group_ends), step))
    boundaries.update(group_ends)
    return sorted(boundaries)


## Despite the names, these lists hold batch END OFFSETS, not batch sizes.
train_batch_size = _split_boundaries(train_batch_len, batch_size)
test_batch_size = _split_boundaries(test_batch_len, batch_size)

# print
print(train_batch_size)
print(test_batch_size)

[256, 512, 594, 768, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096, 4352, 4608, 4864, 5120, 5376, 5632, 5888, 6144, 6400, 6656, 6912, 7168, 7424, 7680, 7936, 8192, 8448, 8704, 8960, 9216, 9472, 9728, 9984, 10240, 10496, 10752, 11008, 11264, 11520, 11776, 12032, 12288, 12544, 12800, 13056, 13312, 13568, 13824, 14080, 14336, 14592, 14848, 15104, 15360, 15616, 15872, 16128, 16384, 16640, 16896, 17152, 17408, 17664, 17920, 18176, 18432, 18688, 18944, 19200, 19456, 19712, 19968, 20224, 20480, 20736, 20992, 21248, 21504, 21760, 22016, 22272, 22528, 22784, 23040, 23296, 23552, 23808, 24064, 24320, 24576, 24832, 25088, 25344, 25600, 25856, 26112, 26368, 26624, 26880, 27136, 27392, 27648, 27904, 28160, 28416, 28672, 28928, 29184, 29440, 29696, 29952, 30208, 30464, 30720, 30976, 31232, 31488, 31744, 32000, 32256, 32512, 32611, 32768, 33024, 33280, 33536, 33792, 34048, 34304, 34560, 34816, 35072, 35328, 35584, 35840, 36096, 36352, 36608, 36864, 37120, 37376, 37632, 

# Model

In [22]:
from keras.layers import Input, Embedding, Dropout, TimeDistributed, Conv1D, Dense, multiply, CuDNNLSTM, GlobalAveragePooling1D
from keras.layers import Concatenate, MaxPooling1D, Flatten, Bidirectional, LSTM
from keras.initializers import RandomUniform
from keras.models import Model, load_model
from keras.utils import plot_model,Progbar
from keras_multi_head import MultiHeadAttention

In [23]:
# Number of character positions per word expected by char_input; padding()
# pads the training data to the same length (10).
maxwordlength = 10

In [24]:
# Word branch: a variable-length sequence of int32 word indices looked up in an
# embedding initialised from wordEmbeddings and kept frozen (trainable = False).
# Note: this rebinds `words`, which earlier held the dict of lower-cased tokens.
words_input = Input(shape = (None, ), dtype = np.int32, name = 'words_input')
words = Embedding(input_dim = wordEmbeddings.shape[0],
                  output_dim = wordEmbeddings.shape[1],
                  weights = [wordEmbeddings],
                  trainable = False)(words_input)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [25]:
# Character branch: each word arrives as `maxwordlength` character indices that
# are embedded into 32 dimensions with a randomly initialised table.
char_input = Input(shape = (None, maxwordlength, ), name = 'char_input')
x = TimeDistributed(
    Embedding(input_dim = len(char2Idx),
              output_dim = 32,
              embeddings_initializer = RandomUniform())
)(char_input)


def _char_conv_branch(char_embeddings, kernel_size, strides):
    """
    Build one per-word convolution branch: a Conv1D with 5 tanh filters and
    'same' padding, followed by an average over the convolution's output
    positions. Returns the (conv, pooled) tensors.
    """
    conv = TimeDistributed(Conv1D(filters = 5,
                                  kernel_size = kernel_size,
                                  padding = 'same',
                                  activation = 'tanh',
                                  strides = strides))(char_embeddings)
    pooled = TimeDistributed(GlobalAveragePooling1D())(conv)
    return conv, pooled


# Five branches with growing stride, created in the same order as before.
# The `maxpool*` names are historical: the pooling layer is an average pooling.
conv1, maxpool1 = _char_conv_branch(x, kernel_size = 1, strides = 1)
conv2, maxpool2 = _char_conv_branch(x, kernel_size = 2, strides = 2)
conv3, maxpool3 = _char_conv_branch(x, kernel_size = 3, strides = 3)
conv4, maxpool4 = _char_conv_branch(x, kernel_size = 3, strides = 4)
conv5, maxpool5 = _char_conv_branch(x, kernel_size = 4, strides = 5)

# A bidirectional LSTM reads the characters of each word as a sixth feature set.
char_lstm = TimeDistributed(Bidirectional(CuDNNLSTM(128)))(x)

# Per-word character representation: LSTM output plus the five pooled branches.
concat = Concatenate()([char_lstm, maxpool1, maxpool2, maxpool3, maxpool4, maxpool5])
char = TimeDistributed(Flatten())(concat)

In [26]:
# Sequence head: word and character features are concatenated per token, then
# passed through BiLSTM -> multi-head attention -> LSTM -> CRF.
output = Concatenate()([words, char])
output = Bidirectional(CuDNNLSTM(128,return_sequences = True))(output)
# NOTE(review): head_num=256 on what is presumably a 256-wide BiLSTM output
# would leave one feature per head — confirm this is intended.
attn = MultiHeadAttention(head_num=256, name = 'Multi-head')(output)
output = CuDNNLSTM(128,return_sequences=True)(attn)
# One CRF unit per label; sparse_target = True means labels are passed as
# integer indices rather than one-hot vectors.
output = CRF(len(label2idx), sparse_target = True)(output)

In [27]:
# Assemble and compile the tagger: two inputs (word indices, character
# indices), CRF output trained with the CRF loss.
model = Model(inputs = [words_input, char_input], outputs = [output])
model.compile(loss = crf_loss, optimizer = 'nadam', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 10)     0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 10, 32) 3040        char_input[0][0]                 
__________________________________________________________________________________________________
time_distributed_2 (TimeDistrib (None, None, 10, 5)  165         time_distributed_1[0][0]         
__________________________________________________________________________________________________
time_distributed_4 (TimeDistrib (None, None, 5, 5)   325         time_distributed_1[0][0]        

# Training Process

In [28]:
epochs = 30
for epoch in range(epochs):
    # Show a 1-based epoch number so the last line reads "Epoch 30/30".
    print("Epoch %d/%d" %(epoch + 1, epochs))
    a = Progbar(len(train_batch_size))
    for i, batch in enumerate(iterate_minibatches(train_batch, train_batch_size)):
        # Distinct names: unpacking into `char` would overwrite the notebook-level
        # `char` tensor that the model-building cells use.
        batch_labels, batch_tokens, batch_chars = batch
        model.train_on_batch([batch_tokens, batch_chars], batch_labels)
        # Report finished batches; updating here also avoids reading an unbound
        # `i` after the loop when no batch was produced.
        a.update(i + 1)
    print(" ")

Epoch 0/30

 
Epoch 1/30
 
Epoch 2/30
 
Epoch 3/30
 
Epoch 4/30
 
Epoch 5/30
 
Epoch 6/30
 
Epoch 7/30
 
Epoch 8/30
 
Epoch 9/30
 
Epoch 10/30
 
Epoch 11/30
 
Epoch 12/30
 
Epoch 13/30
 
Epoch 14/30
 
Epoch 15/30
 
Epoch 16/30
 
Epoch 17/30
 
Epoch 18/30
 
Epoch 19/30
 
Epoch 20/30
 
Epoch 21/30
 
Epoch 22/30
 
Epoch 23/30
 
Epoch 24/30
 
Epoch 25/30
 
Epoch 26/30
 
Epoch 27/30
 
Epoch 28/30
 
Epoch 29/30
 


In [29]:
# Persist the trained model; it is reloaded with load_model further down.
model.save("../model/ner_glove.h5")

In [42]:
import pickle

# Persist the three lookup tables needed at inference time. Each file is opened
# in a context manager so it is flushed and closed before the next one is written.
for path, mapping in (
    ("../data/pickle_file/word2idx.pkl", word2Idx),
    ("../data/pickle_file/char2idx.pkl", char2Idx),
    ("../data/pickle_file/label2idx.pkl", label2idx),
):
    with open(path, "wb") as handle:
        pickle.dump(mapping, handle)

In [31]:
# Reload the saved model. The CRF layer, its loss and the attention layer are
# passed as custom_objects, and the result replaces the in-memory `model`.
model = load_model("../model/ner_glove.h5",
                   custom_objects ={'CRF':CRF,
                                   'crf_loss':crf_loss,
                                   'MultiHeadAttention':MultiHeadAttention})

In [32]:
def checkaccuracy(predict_label,correct_label):
    """
    Return 1 when every aligned pair of labels is equal, otherwise 0.

    Pairs are formed with ``zip``, so surplus elements of the longer sequence
    are ignored.
    """
    mismatches = sum(
        1 for predicted, expected in zip(predict_label, correct_label)
        if predicted != expected
    )
    return 1 if mismatches == 0 else 0

In [33]:
# Performance on dev dataset
# Performance on dev dataset
predLabels, correctLabels = tag_dataset(test_batch)

# Calculate Performance of model on data: the share of sentences whose
# predicted label sequence matches the reference for every token.
counter = sum(
    checkaccuracy(predicted, expected)
    for predicted, expected in zip(predLabels, correctLabels)
)
print("Accuracy:", counter/len(predLabels))

Accuracy: 0.9740586287256224


# Test The Model

In [40]:
def tokenizing(text):
    """
    Split ``text`` into token strings with the notebook-level ``nlp`` pipeline,
    with the 'ner', 'tagger' and 'parser' components disabled.
    """
    return [token.text for token in nlp(text,disable=['ner','tagger','parser'])]
# tokenized = [tokenizing(i) for i in data]

In [35]:
def convert_to_batch_dataset(text, word2Idx, char2Idx):
    """
        Encode one raw sentence the same way the training data was encoded.

        Args:
            text (str): Input sentence
            word2Idx (dict): Word -> index, must contain "UNK"
            char2Idx (dict): Character -> index, must contain "UNK"
        Return:
            A list consist of.
            [0]: A list of the token indices of the sentence
            [1]: A list with, per token, its character indices padded or
                 truncated to ``maxwordlength`` (notebook-level constant)
    """
    word_indices = []
    char_indices = []
    for token in nlp(text):
        word = token.text
        # Same fallback chain as createMatrices: exact form, lower-cased form,
        # then UNK. Without it, capitalised input maps to UNK at inference
        # although training resolved it through the lower-cased entry.
        if word in word2Idx:
            word_indices.append(word2Idx[word])
        elif word.lower() in word2Idx:
            word_indices.append(word2Idx[word.lower()])
        else:
            word_indices.append(word2Idx["UNK"])

        tok = [char2Idx[c] if c in char2Idx else char2Idx["UNK"] for c in word]
        # Pad in front and cut long words at the end, exactly like padding()
        # does for the training set. pad_sequences truncates from the front by
        # default, which would keep different characters than the model saw.
        char_indices.append(
            pad_sequences([tok], maxlen=maxwordlength,
                          padding='pre', truncating='post')[0]
        )

    return [word_indices, char_indices]

In [36]:
def tag_dataset_dev_set(dataset):
    """
    Predict the label indices of unlabelled samples, one sample at a time.

    Relies on the notebook-level ``model`` and on ``Progbar``.

    Input:
        dataset(list) : Samples of the form [tokens, chars]
    Output:
        predLabels(list) : Per sample, the list of arg-max class indices,
                           one per token
    """
    predLabels = []
    b = Progbar(len(dataset))
    for i, data in enumerate(dataset):
        tokens, char= data
        # The model expects a batch axis, so wrap the single sample.
        tokens = np.asarray([tokens])
        char = np.asarray([char])

        pred = model.predict([tokens, char], verbose = False)[0]
        predLabels.append([np.argmax(scores) for scores in pred])
        # Report finished samples; updating inside the loop also avoids reading
        # an unbound ``i`` when the dataset is empty.
        b.update(i + 1)

    return predLabels

In [37]:
def replace_oov_with_unk(sentence_test, train_sentences=None):
    """
        Replace out-of-vocabulary tokens with unknown (UNK)

        Args:
            sentence_test (str): Sentence to filter; it is split on whitespace.
            train_sentences (list, optional): Sentences of (word, chars, label)
                entries that define the vocabulary. Defaults to the
                notebook-level ``trainSentences``.
        Return:
            str: The sentence with every token missing from the training
            vocabulary replaced by 'UNK', tokens joined by single spaces.
    """
    if train_sentences is None:
        train_sentences = trainSentences

    ## Build the training vocabulary
    trainVocab = {
        word
        for sentence in train_sentences
        for word, char, label in sentence
    }
    # discard() rather than remove(): 'UNK' is usually not a training token,
    # and remove() would raise KeyError in that case.
    trainVocab.discard('UNK') ## Remove unknown

    ## Split the test sentence into tokens, map OOV tokens, join them again
    return ' '.join(
        tok if tok in trainVocab else 'UNK'
        for tok in sentence_test.split()
    )

In [None]:
import pickle

# Reload the lookup tables saved after training. Each file is closed as soon as
# it has been read.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project.
with open("../data/pickle_file/word2idx.pkl","rb") as pickle_word2idx:
    word2Idx = pickle.load(pickle_word2idx)
with open("../data/pickle_file/char2idx.pkl","rb") as pickle_char2idx:
    char2Idx = pickle.load(pickle_char2idx)
with open("../data/pickle_file/label2idx.pkl","rb") as pickle_label2idx:
    label2Idx = pickle.load(pickle_label2idx)
# Inverse mapping used to turn predicted indices back into label names.
idx2label = {v:k for k,v in label2Idx.items()}

In [45]:
# End-to-end check on one sentence: encode it, predict, and map the predicted
# indices back to label names. `tmp` first holds the text, then the encoded
# [word_indices, char_indices] pair.
tmp = "Abese Kopi Susu Dua Saset"
#tmp = replace_oov_with_unk(tmp)
tmp = convert_to_batch_dataset(tmp, word2Idx, char2Idx)
# One-sample dataset in, so take the first (only) prediction.
ans = tag_dataset_dev_set([tmp])[0]
ans = [idx2label[i] for i in ans]
print(ans)

['BRAND', 'DESCRIPTION', 'DESCRIPTION', 'NUMERIC', 'UOM']


In [None]:
#Bolpoin MX Dua Rebu tiga buah
#Ji Sam Su Dua Belas tiga batang