In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Reshape, Concatenate, Activation
from tensorflow.keras.utils import plot_model
from transformers import TFAutoModel
from tensorflow.keras import backend as K
from focal_loss import sparse_categorical_focal_loss
from transformers import AutoModel
from tensorflow.keras.layers import concatenate
from keras_contrib.layers import CRF
# import tensorflow_hub as hub
# import tensorflow_text as text
import pythainlp
import spacy_thai
from nltk.tokenize import RegexpTokenizer
import re
import string
import os
from string import punctuation

In [2]:
def read_raw_text(filename):
    
    with open(filename, 'r', encoding = 'utf-8') as file:
        
        document = file.read()
        
    return document

def read_ann_file(PATH, filename): #filename e.g. 01_nut.a/xxaa.ann
    
    PATH = PATH
    
    document = read_raw_text(PATH + filename[:-4] + '.txt')
    df = pd.read_csv(PATH + filename, sep='^([^\s]*)\s', engine='python', header=None).drop(0, axis=1)
    
    
    
    token_df = df[df[1].str.contains('T')]
    
    list_tokens = []
    
    seek = 0
    
    for index, row in token_df.iterrows():
        
        text = re.findall('\t.*', row[2])[0][1:]
        entityLabel, start, end = re.findall('.*\t', row[2])[0][:-1].split(' ')
        start, end = int(start), int(end)
        
        if seek == start:
            res = [document[start:end], start, end, entityLabel]
            list_tokens.append(res)
            
        else:
#             print(seek, start)
            res = [document[seek:start], seek, start, 'O']
            list_tokens.append(res)
            
            res = [document[start:end], start, end, entityLabel]
            list_tokens.append(res)
            
        seek = end
    
    
    result_text = ''
    
    for t, start, end, ent in list_tokens:
        text = f'[{ent}]{t}[/{ent}]'
        result_text += text
        
    
    return result_text, list_tokens

def tokenize(text):
    
    nlp = spacy_thai.load()
    pattern = r'\[(.*?)\](.*?)\[\/(.*?)\]'
    tokenizer = RegexpTokenizer(pattern)
    
    
    text = re.sub(r'([ก-๏a-zA-Z\(\)\.\s0-9\-]*)(?=\[\w+\])', r'[O]\1[/O]', text)
    text = re.sub(r'([ก-๏a-zA-Z\(\)\.\s0-9\-]+)$', r'[O]\1[/O]', text)
    text = re.sub(r'\[O\](\s)*?\[\/O\]', '', text)
    t = tokenizer.tokenize(text)
    
    result = []
    text_list_ = []
    
    for i in t:
        
            if i[0] == i[2]:
                doc = pythainlp.syllable_tokenize(i[1])
                token_texts = []

                # doc = nlp('สวัสดีค้าบ ท่านผู้เจริญ')
                for token in doc:
                    token_texts.append(token)
#                     if token.whitespace_:  # filter out empty strings
#                         token_texts.append(token.whitespace_)
                

                if i[0] == 'O' :
                    for r in range(len(token_texts)):
                        result.append((token_texts[r],  i[0]))
                  # words.append(r)
                else:
                    for r in range(len(token_texts)):
        
                        if r == 0:
                            result.append((token_texts[r], 'B-' + i[0]))

                        else:
                            result.append((token_texts[r], 'I-' + i[0]))

    text_list_.append(result)

    words = []
    tags = []
    original_text = []
    poss = []
    contain_digit = []
    contain_punc = []
    contain_vowel = []

    thai_vowel = 'ะาิีุุึืโเแัำไใฤๅฦ'

    def check_condition(condition):

        if condition:
            return 'True'
        else:
            return 'False'

    for text in text_list_:
        w = []
        t = []
        o = ''
        p = []
        digit = []
        punc = []
        vowel = []
        for word in text:
            w.append(word[0])
            t.append(word[1])
    #         p.append(pythainlp.tag.pos_tag(word[0]))
            o += word[0]
            digit.append(check_condition(any(char.isdigit() for char in word[0])))
            punc.append(check_condition(any(p in word[0] for p in punctuation)))
            vowel.append(check_condition(any(p in word[0] for p in thai_vowel)))

        words.append(w)
        tags.append(t)
        contain_digit.append(digit)
        contain_punc.append(punc)
        contain_vowel.append(vowel)
    #     poss.append(p)
        original_text.append(o)

        
#     dff = pd.DataFrame({'original_text' : original_text,
#                         'words' : words,
#     #                     'pos' : poss,
#                         'contain_digit' : contain_digit,
#                         'contain_punc' : contain_punc,
#                         'contain_vowel' : contain_vowel,
#                         'tags' : tags})

    
        
    return words, tags, original_text, contain_digit, contain_punc, contain_vowel

def read_all_file(PATH):
    
    PATH = PATH
    assignee_folder_list = os.listdir(PATH)[3:3+15]

    result = {'original_text' : [],
              'words' : [],
              'tags' : [],
              'contain_digit' : [],
              'contain_punc' : [],
              'contain_vowel' : []}
    for assignee_folder in assignee_folder_list:
        text_folder_list = sorted(os.listdir(PATH + assignee_folder))
        text_folder_list = [i for i in text_folder_list if i[-3:] in ['ann', 'txt']]
        text_folder_list = set(map(lambda x : x[:-4], text_folder_list))
        
        
        for text_folder in text_folder_list:
            
            filename = assignee_folder + '/' + text_folder + '.ann'
            
            try:
                text, list_tokens = read_ann_file(PATH, filename)
                words, tags, original_text, contain_digit, contain_punc, contain_vowel = tokenize(text)
                result['original_text'].append(original_text)
                result['words'].append(words)
                result['tags'].append(tags)
                result['contain_digit'].append(contain_digit)
                result['contain_punc'].append(contain_punc)
                result['contain_vowel'].append(contain_vowel)
            except:
                print(filename)
                
    df = pd.DataFrame(result)
    
    return df

In [3]:
def return_train_test(df):
    
    df['pos'] = df['words'].apply(lambda x : [i[1] for i in pythainlp.tag.pos_tag(x)])
    
    max_len = max(df['words'].apply(lambda x: len(x)))
    
    train, test = train_test_split(df, random_state = 42, test_size = 0.2)
    
    word_set = sorted(set([i for sentence in train['words'] for i in sentence]))
    pos_set = sorted(set([i for pos in train['pos'] for i in pos]))
    tag_set = sorted(set([i for tag in train['tags'] for i in tag]))
     
    word2idx = dict([(v, k) for k, v in enumerate(word_set)])
    pos2idx = dict([(v, k) for k, v in enumerate(pos_set)])
    tag2idx = dict([(v, k) for k, v in enumerate(tag_set)])
    digit2idx = {'True' : 1, 'False' : 0, '<PAD>' : 2}
    punc2idx = {'True' : 1, 'False' : 0, '<PAD>' : 2}
    vowel2idx = {'True' : 1, 'False' : 0, '<PAD>' : 2}
    
    word2idx['<UNK>'] = len(word2idx)
    word2idx['<PAD>'] = len(word2idx)
    pos2idx['<UNK>'] = len(pos2idx)
    pos2idx['<PAD>'] = len(pos2idx)
    tag2idx['<PAD>'] = len(tag2idx)
    
    train['words_idx'] = train['words'].apply(lambda x: [word2idx[i] for i in x])
    train['pos_idx'] = train['pos'].apply(lambda x: [pos2idx[i] for i in x])
    train['tags_idx'] = train['tags'].apply(lambda x: [tag2idx[i] for i in x])
    train['contain_digit_idx'] = train['contain_digit'].apply(lambda x: [digit2idx[i] for i in x])
    train['contain_punc_idx'] = train['contain_punc'].apply(lambda x: [punc2idx[i] for i in x])
    train['contain_vowel_idx'] = train['contain_vowel'].apply(lambda x: [vowel2idx[i] for i in x])
    
    test_sent = []
    test_pos = []
    test_tag = []
    
    for sent in test['words']:
        t = []
        for i in sent:
            try:
                t.append(word2idx[i])
            except:
                t.append(word2idx['<UNK>'])
                
        test_sent.append(t)
                
    for sent in test['pos']:
        t = []
        for i in sent:
            try:
                t.append(pos2idx[i])
            except:
                t.append(pos2idx['<UNK>'])
        
        test_pos.append(t)
                
    for sent in test['tags']:
        t = []
        for i in sent:
            
            t.append(tag2idx[i])
            
                
        test_tag.append(t)
        
    test['words_idx'] = test_sent
    test['pos_idx'] = test_pos
    test['tags_idx'] = test_tag
    test['contain_digit_idx'] = test['contain_digit'].apply(lambda x: [digit2idx[i] for i in x])
    test['contain_punc_idx'] = test['contain_punc'].apply(lambda x: [punc2idx[i] for i in x])
    test['contain_vowel_idx'] = test['contain_vowel'].apply(lambda x: [vowel2idx[i] for i in x])
    
    mapping = {'tok2idx' : word2idx,
               'pos2idx' : pos2idx,
               'tag2idx' : tag2idx}
    
    return train, test, mapping, max_len
    

In [15]:
# df_q = read_all_file(PATH = 'data/csd_rel_data_annotated/')
# df_p = read_all_file(PATH = 'data/csd_rel_data2_annotated/')

# for i in df_q.columns:
#     df_q[i] = df_q[i].apply(lambda x: x[0])
#     df_p[i] = df_p[i].apply(lambda x: x[0])

# df_p['pos'] = df_p['words'].apply(lambda x : [i[1] for i in pythainlp.tag.pos_tag(x)])
# df_q['pos'] = df_q['words'].apply(lambda x : [i[1] for i in pythainlp.tag.pos_tag(x)])

    
df_Coraline = pd.read_csv('[NER]Coraline_annotation_prepared_df.csv').drop(columns = 'Unnamed: 0')
for i in ['words', 'contain_digit', 'contain_punc', 'contain_vowel', 'tags', 'pos']:
    df_Coraline[i] = df_Coraline[i].str.strip('[]').str.split(', ').apply(lambda x: [i[1:-1] for i in x])


    
# df = pd.concat([df_Coraline, df], ignore_index = True)

# df.head()
# pd.concat([df_q, df_p, df_Coraline], ignore_index = True)
df = pd.concat([df_q, df_p, df_Coraline], ignore_index = True)
df

Unnamed: 0,original_text,words,tags,contain_digit,contain_punc,contain_vowel,pos
0,วันนี้ (24 พ.ค.2564) พ.ต.อ.เอกภพ ตันประยูร ผกก...,"[วัน, นี้, , (, 24, , พ.ค., 2564, ), , พ.ต....","[O, O, O, O, B-DATE, I-DATE, I-DATE, I-DATE, O...","[False, False, False, False, True, False, Fals...","[False, False, False, True, False, False, True...","[True, True, False, False, False, False, False...","[NCMN, DDAC, PUNC, PUNC, DCNM, PUNC, CMTR, NCN..."
1,เมื่อวันที่ 19 ม.ค.พล.ท.บุญยืน อินกว่าง แม่ทัพ...,"[เมื่อ, วัน, ที่, , 19, , ม.ค., พล.ท., บุญ, ...","[O, O, O, O, B-DATE, I-DATE, I-DATE, B-PERSON,...","[False, False, False, False, True, False, Fals...","[False, False, False, False, False, False, Tru...","[True, True, True, False, False, False, False,...","[JSBR, NCMN, PREL, PUNC, DCNM, PUNC, CMTR, NCM..."
2,เมื่อวันที่ 9 ม.ค. พล.ต.ต.เอกราช ลิ้มสังกาศ9 ม...,"[เมื่อ, วัน, ที่, , 9, , ม.ค., , พล.ต.ต., เ...","[O, O, O, O, O, O, O, O, B-PERSON, I-PERSON, I...","[False, False, False, False, True, False, Fals...","[False, False, False, False, False, False, Tru...","[True, True, True, False, False, False, False,...","[JSBR, NCMN, PREL, PUNC, DCNM, PUNC, CMTR, PUN..."
3,ผู้สื่อข่าวรายงานว่า วันนี้ (23 พ.ย.) เมื่อเวล...,"[ผู้, สื่อ, ข่าว, ราย, งาน, ว่า, , วัน, นี้, ...","[O, O, O, O, O, O, O, O, O, O, O, B-DATE, I-DA...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...","[False, True, True, True, True, True, False, T...","[PPRS, VACT, NCMN, CNIT, NCMN, JSBR, PUNC, NCM..."
4,เมื่อวันที่ 8 กันยายน 2564 เมื่อเวลา 12.30 น.พ...,"[เมื่อ, วัน, ที่, , 8, , กัน, ยา, ยน, , 256...","[O, O, O, O, B-DATE, I-DATE, I-DATE, I-DATE, I...","[False, False, False, False, True, False, Fals...","[False, False, False, False, False, False, Fal...","[True, True, True, False, False, False, True, ...","[JSBR, NCMN, PREL, PUNC, DCNM, PUNC, ADVN, ADV..."
...,...,...,...,...,...,...,...
1945,จากสอบถาม นายพูนศักดิ์ ปทุมสิทธิ์ อายุ 24 ปี น...,"[จาก, สอบ, ถาม, , นาย, พูน, ศักดิ์, , ปทุม, ...","[O, O, O, O, B-PERSON, I-PERSON, I-PERSON, I-P...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...","[True, False, True, False, True, False, True, ...","[RPRE, NCMN, VACT, PUNC, NTTL, NPRP, NPRP, PUN..."
1946,2 หนุ่มใหญ่ ตั้งวงกินเหล้า ทะเลาะเรื่องบัตรคนจ...,"[2, , หนุ่ม, ใหญ่, , ตั้ง, วง, กิน, เหล้า, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[True, False, False, False, False, False, Fals...","[False, False, False, False, False, False, Fal...","[False, False, True, True, False, True, False,...","[NLBL, PUNC, NCMN, VATT, PUNC, VACT, NCMN, VAC..."
1947,ที่เกิดเหตุพบศพ นายเสถียร ธรรมมาทอง อายุ 49 ปี...,"[ที่, เกิด, เหตุ, พบ, ศพ, , นาย, เสถียร, , ธ...","[O, O, O, O, O, O, B-PERSON, I-PERSON, I-PERSO...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...","[True, True, True, False, False, False, True, ...","[PREL, VSTA, NCMN, VSTA, NPRP, PUNC, NTTL, NPR..."
1948,เมื่อเวลา 01.30 น. วันที่ 24 พ.ย. 64 ร.ต.อ.ธรา...,"[เมื่อ, เว, ลา, , 01.30, , น., , วัน, ที่, ...","[O, O, O, O, B-TIME, I-TIME, I-TIME, O, O, O, ...","[False, False, False, False, True, False, Fals...","[False, False, False, False, True, False, True...","[True, True, True, False, False, False, False,...","[JSBR, NCMN, NCMN, PUNC, DCNM, PUNC, CMTR, PUN..."


In [16]:
train, test, mapping, max_len = return_train_test(df)

train['padded_words_idx'] = list(pad_sequences(train['words_idx'], maxlen = max_len, padding = 'post', value = mapping['tok2idx']['<PAD>']))
train['padded_pos_idx'] = list(pad_sequences(train['pos_idx'], maxlen = max_len, padding = 'post', value = mapping['pos2idx']['<PAD>']))
train['padded_tags_idx'] = list(pad_sequences(train['tags_idx'], maxlen = max_len, padding = 'post', value = mapping['tag2idx']['<PAD>']))
train['padded_contain_digit_idx'] = list(pad_sequences(train['contain_digit_idx'], maxlen = max_len, padding = 'post', value = 2))
train['padded_contain_punc_idx'] = list(pad_sequences(train['contain_punc_idx'], maxlen = max_len, padding = 'post', value = 2))
train['padded_contain_vowel_idx'] = list(pad_sequences(train['contain_vowel_idx'], maxlen = max_len, padding = 'post', value = 2))

test['padded_words_idx'] = list(pad_sequences(test['words_idx'], maxlen = max_len, padding = 'post', value = mapping['tok2idx']['<PAD>']))
test['padded_pos_idx'] = list(pad_sequences(test['pos_idx'], maxlen = max_len, padding = 'post', value = mapping['pos2idx']['<PAD>']))
test['padded_tags_idx'] = list(pad_sequences(test['tags_idx'], maxlen = max_len, padding = 'post', value = mapping['tag2idx']['<PAD>']))
test['padded_contain_digit_idx'] = list(pad_sequences(test['contain_digit_idx'], maxlen = max_len, padding = 'post', value = 2))
test['padded_contain_punc_idx'] = list(pad_sequences(test['contain_punc_idx'], maxlen = max_len, padding = 'post', value = 2))
test['padded_contain_vowel_idx'] = list(pad_sequences(test['contain_vowel_idx'], maxlen = max_len, padding = 'post', value = 2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['words_idx'] = train['words'].apply(lambda x: [word2idx[i] for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['pos_idx'] = train['pos'].apply(lambda x: [pos2idx[i] for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tags_idx'] = train['tags'].apply(lambda x: [tag

In [17]:
train.iloc[0]['padded_words_idx'][-1]

6032

In [19]:
# !pip install sklearn_crfsuite
from tensorflow.keras import backend as K
from focal_loss import sparse_categorical_focal_loss
from transformers import AutoModel
from tensorflow.keras.layers import concatenate
from keras_contrib.layers import CRF

def focal_loss(y_true, y_pred):
    
    # Loss for imbalanced dataset --> weight more for minor class, weight less for major class
    
    class_weight = [10,10,10,15,15,
                    10,10,10,10,15,
                    10,10,10,15,15,
                    10,10,10,10,10,
                    1, 0.01 
                    ]
    loss = sparse_categorical_focal_loss(y_true, y_pred, gamma=2, class_weight = class_weight)
                                                                       
    return loss

def train_model(X, y, model):
    loss = list()
    
    # Add class weight
    
    
#     for i in range(150):
        # fit model for one epoch on this sequence
    hist = model.fit(X, y, batch_size=64,  verbose=1, epochs=60, validation_split=0.2 )
    loss.append(hist.history['loss'][0])
    return model, loss

label = mapping['tag2idx']

input_dim_long = 6032 + 1

input_len_long = len(train['padded_words_idx'].iloc[0])

n_tags = len(label)
output_dim = 8

model_words = Input(shape = (input_len_long,))
emb_words = Embedding(input_dim=input_dim_long, output_dim=output_dim)(model_words)
# output_words = Reshape(target_shape=(output_dim, input_len_long))(emb_words)

model_pos = Input(shape = (input_len_long,))
emb_pos = Embedding(input_dim=input_dim_long, output_dim=output_dim)(model_pos)
# output_pos = Reshape(target_shape=(output_dim, input_len_long))(emb_pos)
model_digit = Input(shape = (input_len_long,))
emb_digit = Embedding(input_dim=input_dim_long, output_dim=output_dim)(model_digit)

model_punc = Input(shape = (input_len_long,))
emb_punc = Embedding(input_dim=input_dim_long, output_dim=output_dim)(model_punc)

model_vowel = Input(shape = (input_len_long,))
emb_vowel = Embedding(input_dim=input_dim_long, output_dim=output_dim)(model_vowel)


input_model = [model_words, model_pos, model_digit, model_punc, model_vowel]

output_embeddings = [emb_words, emb_pos, emb_digit, emb_punc, emb_vowel]

output_model = Concatenate()(output_embeddings)
output_model = Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))(output_model)
output_model = TimeDistributed(Dense(n_tags, activation="softmax"))(output_model)

model = Model(inputs = input_model, outputs = output_model)
    
model.compile(loss= [focal_loss], 
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.01, epsilon=1e-08), 
              metrics=['accuracy'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 1668)]       0           []                               
                                                                                                  
 input_7 (InputLayer)           [(None, 1668)]       0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 1668)]       0           []                               
                                                                                                  
 input_9 (InputLayer)           [(None, 1668)]       0           []                               
                                                                                            

In [20]:
input_dim = 6032 + 1
output_dim = 8
input_length = max_len
n_tags = len(label)

X_tr_words = []
for i in train['padded_words_idx']:
    X_tr_words.append(i)
X_tr_words = np.array(X_tr_words)

X_tr_pos = []
for i in train['padded_pos_idx']:
    X_tr_pos.append(i)
X_tr_pos = np.array(X_tr_pos)

X_tr_digit = []
for i in train['padded_contain_digit_idx']:
    X_tr_digit.append(i)
X_tr_digit = np.array(X_tr_digit)

X_tr_punc = []
for i in train['padded_contain_punc_idx']:
    X_tr_punc.append(i)
X_tr_punc = np.array(X_tr_punc)

X_tr_vowel = []
for i in train['padded_contain_vowel_idx']:
    X_tr_vowel.append(i)
X_tr_vowel = np.array(X_tr_vowel)

y_train = [i for i in train['padded_tags_idx']]
y_train = np.array(y_train)

model = train_model([X_tr_words, X_tr_pos, X_tr_digit, X_tr_punc, X_tr_vowel], y_train, model)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [34]:
X_tr_words

array([[5176, 5273, 3807, ..., 6032, 6032, 6032],
       [5176, 5273, 3807, ..., 6032, 6032, 6032],
       [1805, 3352, 3877, ..., 6032, 6032, 6032],
       ...,
       [3924, 2688,    0, ..., 6032, 6032, 6032],
       [3606, 2868, 4219, ..., 6032, 6032, 6032],
       [3924, 2498,    0, ..., 6032, 6032, 6032]])

In [22]:
X_te_words = []
for i in test['padded_words_idx']:
    X_te_words.append(i)
X_te_words = np.array(X_te_words)

X_te_pos = []
for i in test['padded_pos_idx']:
    X_te_pos.append(i)
X_te_pos = np.array(X_te_pos)

X_te_digit = []
for i in test['padded_contain_digit_idx']:
    X_te_digit.append(i)
X_te_digit = np.array(X_te_digit)

X_te_punc = []
for i in test['padded_contain_punc_idx']:
    X_te_punc.append(i)
X_te_punc = np.array(X_te_punc)

X_te_vowel = []
for i in test['padded_contain_vowel_idx']:
    X_te_vowel.append(i)
X_te_vowel = np.array(X_te_vowel)

y_pred = model[0].predict([X_te_words, X_te_pos, X_te_digit, X_te_punc, X_te_vowel])
y_pred = np.argmax(y_pred, axis = 2)
y_test = []
for i in test['padded_tags_idx']:
    y_test.append(i)
y_test = np.array(y_test)

from sklearn.metrics import classification_report


print(classification_report(y_test.reshape(y_pred.shape[0]*y_pred.shape[1]), 
                            y_pred.reshape(y_pred.shape[0]*y_pred.shape[1]),
                           target_names = label.keys())
     )

                precision    recall  f1-score   support

       B-COLOR       0.62      0.86      0.72       139
        B-DATE       0.74      0.92      0.82       425
    B-LOCATION       0.66      0.87      0.75      1979
          B-LP       0.52      0.65      0.58        85
      B-OBJECT       0.32      0.49      0.39       222
B-ORGANIZATION       0.52      0.85      0.64       806
      B-PERSON       0.79      0.93      0.85      1647
        B-TIME       0.77      0.97      0.86       711
     B-VEHICLE       0.53      0.90      0.67       432
      B-WEAPON       0.33      0.90      0.48       376
       I-COLOR       0.48      0.94      0.64       186
        I-DATE       0.73      0.96      0.83      1394
    I-LOCATION       0.67      0.89      0.76      5968
          I-LP       0.51      0.94      0.66       525
      I-OBJECT       0.25      0.54      0.34       694
I-ORGANIZATION       0.49      0.82      0.62      3339
      I-PERSON       0.67      0.95      0.79  

In [23]:
model[0].save('NER_model_v2_26_1_2022.h5')

In [28]:
with open(f'mapping/{}.pickle', 'wb') as dict_:
    pickle.dump(idx2word, dict_)

{'tok2idx': {' ': 0,
  '  ': 1,
  '   ': 2,
  '"': 3,
  '"#': 4,
  '"ชน': 5,
  '"ตก': 6,
  '"พบศพ"': 7,
  '#': 8,
  '%': 9,
  '(': 10,
  '(ขอ': 11,
  '(คน': 12,
  '(จ)': 13,
  '(จต': 14,
  '(ช': 15,
  '(ตก': 16,
  '(บ': 17,
  '(บช': 18,
  '(ผ': 19,
  '(ผบ': 20,
  '(พฐ.)': 21,
  '(พฐ.)\u200b': 22,
  '(พบ': 23,
  '(ม': 24,
  '(ศ': 25,
  '(ศป': 26,
  '(ศปอ': 27,
  '(ศพฐ.)': 28,
  '(ส': 29,
  '(ส)': 30,
  '(สบ': 31,
  '(สพฐ.ตร)': 32,
  '(์': 33,
  ')': 34,
  '),': 35,
  ')\u200b': 36,
  '+': 37,
  ',': 38,
  '-': 39,
  '.': 40,
  '.(': 41,
  '.(นบ)': 42,
  '.)': 43,
  '.,': 44,
  '.กก': 45,
  '.ค': 46,
  '.ช': 47,
  '.ตก': 48,
  '.ตน': 49,
  '.ตอ': 50,
  '.ท': 51,
  '.บ': 52,
  '.ป': 53,
  '.ป,': 54,
  '.ปค': 55,
  '.ปคม': 56,
  '.ปป': 57,
  '.ปพ': 58,
  '.ปอ': 59,
  '.ฝ': 60,
  '.ย': 61,
  '.ร': 62,
  '.ษ.': 63,
  '.สส': 64,
  '.อ': 65,
  '.อคฝ.': 66,
  '.อม': 67,
  '.\u200b': 68,
  '/': 69,
  '0': 70,
  '0.1': 71,
  '0.25': 72,
  '0.51': 73,
  '00.00': 74,
  '00.08': 75,
  '00.10': 76,
 

In [33]:

mapping['max_len'] = max_len
for i in mapping.keys():
    with open(f'mapping/NER/{i}.pickle', 'wb') as dict_:
        pickle.dump(mapping[i], dict_)